# -*- coding:utf-8 -*-
# from spire.doc import *
# from spire.doc.common import *
#
# # 创建一个 Document 对象
# document = Document()
# # 加载一个 Word DOCX 文档
# # document.LoadFromFile("C:\\Users\\gy051\\Desktop\\1223.doc")
# document.LoadFromFile("D:\\数据集\\数据集\\3.doc")
# print(document.Sections.Count)
# for i in range(document.Sections.Count):
#     section=document.Sections[i]
#     for x  in range(section.Paragraphs.Count):
#         paragraph=section.Paragraphs[x]
#         print(paragraph.Text)
#     print("---------------------------------")
#     # 或加载一个 Word DOC 文档
# # document.LoadFromFile("1223.xml")
#
# # # # 设置是否在 HTML 中嵌入图片
# # document.HtmlExportOptions.ImageEmbedded = True
# # # document.XHTMLValidateOption.ImageEmbedded = True
# # #
# # # # 设置是否将表单字段导出为纯文本在 HTML 中显示
# # document.HtmlExportOptions.IsTextInputFormFieldAsText = True
# # # document.XHTMLValidateOption.IsTextInputFormFieldAsText = True
# # #
# # # # 设置是否在 HTML 中导出页眉和页脚
# # document.HtmlExportOptions.HasHeadersFooters = False
# # # document.XHTMLValidateOption.HasHeadersFooters = True
# #
# # # 将 Word 文档保存为 HTML 文件
# # document.SaveToFile("1223.html", FileFormat.Html)
# # #
# document.Close()
import sys

from bs4 import BeautifulSoup
# Default input: the HTML file this script has always read.
DEFAULT_HTML_PATH = 'D:\\models\\1223.html'

# Tag names treated as section headings.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Tag names whose text is collected as the body of a heading.
CONTENT_TAGS = ['p', 'div']


def extract_headings(soup):
    """Collect every heading in *soup* together with the text that follows it.

    Each heading tag (h1-h6) found anywhere in the document yields one entry.
    The entry's content is the text of the ``p``/``div`` elements that follow
    the heading as siblings, up to (not including) the next sibling heading.
    Only siblings of the heading are examined; other tags are skipped.

    Args:
        soup: A parsed BeautifulSoup document (or any tag within one).

    Returns:
        A list of dicts in document order, each with the keys ``'title'``
        (stripped heading text), ``'level'`` (1-6) and ``'content'`` (list of
        unstripped text blocks).
    """
    # A list, not a dict keyed by title: documents often repeat a heading
    # title, and keying by title would silently drop the earlier sections.
    headings = []

    for element in soup.find_all(HEADING_TAGS):
        section = {
            'title': element.get_text(strip=True),
            # The tag name is "h" followed by a single digit.
            'level': int(element.name[1]),
            'content': [],
        }

        # Walk the following siblings until the next heading or the end.
        sibling = element.find_next_sibling()
        while sibling is not None and sibling.name not in HEADING_TAGS:
            if sibling.name in CONTENT_TAGS:
                section['content'].append(sibling.get_text(strip=False))
            sibling = sibling.find_next_sibling()

        headings.append(section)

    return headings


def format_heading(heading):
    """Return the printable report for one entry from ``extract_headings``.

    The result has no trailing newline: a title/level line, a content
    header line, then one `` - ``-prefixed line per content block.
    """
    lines = [
        f"标题: {heading['title']} (级别: {heading['level']})",
        "内容:",
    ]
    lines.extend(f" - {content}" for content in heading['content'])
    return "\n".join(lines)


def main(argv=None):
    """Parse an HTML file and print every heading with its content.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``. The first argument, if present, is the path of
            the HTML file; otherwise ``DEFAULT_HTML_PATH`` is used.
    """
    args = sys.argv[1:] if argv is None else argv
    html_path = args[0] if args else DEFAULT_HTML_PATH

    # Read the HTML file.
    with open(html_path, 'r', encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML document.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Print the result, one blank line after each section.
    for heading in extract_headings(soup):
        print(format_heading(heading))
        print()


if __name__ == "__main__":
    main()