python_ai/test.py


								# -*- coding:utf-8 -*-

								# from spire.doc import *

								# from spire.doc.common import *

								#

								# # 创建一个 Document 对象

								# document = Document()

								# # 加载一个 Word DOCX 文档

								# # document.LoadFromFile("C:\\Users\\gy051\\Desktop\\1223.doc")

								# document.LoadFromFile("D:\\数据集\\数据集\\3.doc")

								# print(document.Sections.Count)

								# for i in range(document.Sections.Count):

								#     section=document.Sections[i]

								#     for x  in range(section.Paragraphs.Count):

								#         paragraph=section.Paragraphs[x]

								#         print(paragraph.Text)

								#     print("---------------------------------")

								#     # 或加载一个 Word DOC 文档

								# # document.LoadFromFile("1223.xml")

								#

								# # # # 设置是否在 HTML 中嵌入图片

								# # document.HtmlExportOptions.ImageEmbedded = True

								# # # document.XHTMLValidateOption.ImageEmbedded = True

								# # #

								# # # # 设置是否将表单字段导出为纯文本在 HTML 中显示

								# # document.HtmlExportOptions.IsTextInputFormFieldAsText = True

								# # # document.XHTMLValidateOption.IsTextInputFormFieldAsText = True

								# # #

								# # # # 设置是否在 HTML 中导出页眉和页脚

								# # document.HtmlExportOptions.HasHeadersFooters = False

								# # # document.XHTMLValidateOption.HasHeadersFooters = True

								# #

								# # # 将 Word 文档保存为 HTML 文件

								# # document.SaveToFile("1223.html", FileFormat.Html)

								# # #

								# document.Close()

								from bs4 import BeautifulSoup

								# Script: read an HTML export, group the paragraph/div text that follows
								# each heading (h1-h6) under that heading, and print the result.

								# Path of the HTML file to analyse.
								HTML_PATH = 'D:\\models\\1223.html'

								# Tag names treated as section headings, both when searching for
								# headings and when deciding where a section's content ends.
								HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

								# Read the HTML file.
								with open(HTML_PATH, 'r', encoding="utf-8") as file:
								    html_content = file.read()

								# Parse the HTML document.
								soup = BeautifulSoup(html_content, 'html.parser')

								# Sections found so far, in document order.  Keys are the heading text;
								# a repeated heading text gets a " (2)", " (3)", ... suffix on its key so
								# that it does not replace the earlier section.  Each value is a dict
								# with 'title', 'level' and 'content'.
								headings = {}
								current_heading = None

								# Visit every heading in the document.
								for element in soup.find_all(HEADING_TAGS):
								    level = int(element.name[1])  # 'h3' -> 3
								    title = element.get_text(strip=True)  # heading text, whitespace stripped

								    # Record for the section introduced by this heading.
								    current_heading = {
								        'title': title,
								        'level': level,
								        'content': []
								    }

								    # Assigning to an existing dict key replaces its value, so keying by
								    # the bare title would drop an earlier section with the same heading
								    # text.  Pick the first free key instead; 'title' inside the record
								    # keeps the original text, so the printed output is not altered.
								    key = title
								    repeat = 1
								    while key in headings:
								        repeat += 1
								        key = f"{title} ({repeat})"
								    headings[key] = current_heading

								    # Collect the text of the <p>/<div> siblings that follow the heading,
								    # stopping at the next heading tag or when there are no more siblings.
								    next_element = element.find_next_sibling()
								    while next_element is not None and next_element.name not in HEADING_TAGS:
								        # Only paragraph and div siblings contribute content.
								        if next_element.name in ['p', 'div']:
								            current_heading['content'].append(next_element.get_text(strip=False))
								        next_element = next_element.find_next_sibling()

								# Print every section: title and level first, then its content lines.
								for heading in headings.values():
								    print(f"标题: {heading['title']} (级别: {heading['level']})")
								    print("内容:")
								    for content in heading['content']:
								        print(f" - {content}")
								    print()