# -*- coding: utf-8 -*-
"""Extract headings and the text that follows them from an HTML file.

The script parses an HTML document, finds every ``h1``-``h6`` element and
gathers the text of the ``p`` / ``div`` siblings that follow each heading up
to the next heading sibling.  The result is printed as a simple report.

Provenance note: commented-out code that previously lived in this file
showed the input HTML (``1223.html``) being produced from a Word document
with Spire.Doc (``Document.LoadFromFile(...)`` followed by
``Document.SaveToFile("1223.html", FileFormat.Html)``).

Usage::

    python script.py [path/to/file.html]

When no path is given, ``DEFAULT_HTML_PATH`` is used.
"""

import sys

# Input file used when no path is supplied on the command line.
DEFAULT_HTML_PATH = 'D:\\models\\1223.html'

# Tags that start a new section, and tags whose text counts as content.
HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
CONTENT_TAGS = ('p', 'div')


def load_soup(path):
    """Read the UTF-8 HTML file at *path* and return it parsed by BeautifulSoup."""
    # Imported here so the pure-Python helpers in this module remain
    # importable on machines where bs4 is not installed.
    from bs4 import BeautifulSoup

    with open(path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    return BeautifulSoup(html_content, 'html.parser')


def collect_headings(soup):
    """Return one section dict per heading in *soup*, in document order.

    Each dict has the keys ``'title'`` (stripped heading text), ``'level'``
    (1-6, taken from the tag name) and ``'content'`` (list of the text of
    every following ``p`` / ``div`` sibling up to the next heading sibling).

    A list is returned rather than a dict keyed by title so that two
    headings with identical text do not overwrite each other.
    """
    headings = []
    for element in soup.find_all(list(HEADING_TAGS)):
        section = {
            'title': element.get_text(strip=True),
            'level': int(element.name[1]),  # 'h3' -> 3
            'content': [],
        }
        headings.append(section)

        # Walk the following sibling tags until the next heading (or the
        # end of the parent) and keep the text of paragraph-like tags.
        sibling = element.find_next_sibling()
        while sibling is not None and sibling.name not in HEADING_TAGS:
            if sibling.name in CONTENT_TAGS:
                section['content'].append(sibling.get_text(strip=False))
            sibling = sibling.find_next_sibling()
    return headings


def print_headings(headings):
    """Print each section in *headings*: title, level, then its content lines.

    *headings* is an iterable of dicts with the keys ``'title'``,
    ``'level'`` and ``'content'`` as produced by :func:`collect_headings`.
    A blank line is printed after every section.
    """
    for heading in headings:
        print(f"标题: {heading['title']} (级别: {heading['level']})")
        print("内容:")
        for content in heading['content']:
            print(f" - {content}")
        print()


def main(argv=None):
    """Parse the HTML file named in *argv* (or the default) and print the report.

    *argv* defaults to ``sys.argv``; ``argv[1]``, when present, is the path
    of the HTML file to read.
    """
    if argv is None:
        argv = sys.argv
    path = argv[1] if len(argv) > 1 else DEFAULT_HTML_PATH
    print_headings(collect_headings(load_soup(path)))


if __name__ == "__main__":
    main()