You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

79 lines
2.6 KiB

# -*- coding:utf-8 -*-
# from spire.doc import *
# from spire.doc.common import *
#
# # 创建一个 Document 对象
# document = Document()
# # 加载一个 Word DOCX 文档
# # document.LoadFromFile("C:\\Users\\gy051\\Desktop\\1223.doc")
# document.LoadFromFile("D:\\数据集\\数据集\\3.doc")
# print(document.Sections.Count)
# for i in range(document.Sections.Count):
# section=document.Sections[i]
# for x in range(section.Paragraphs.Count):
# paragraph=section.Paragraphs[x]
# print(paragraph.Text)
# print("---------------------------------")
# # 或加载一个 Word DOC 文档
# # document.LoadFromFile("1223.xml")
#
# # # # 设置是否在 HTML 中嵌入图片
# # document.HtmlExportOptions.ImageEmbedded = True
# # # document.XHTMLValidateOption.ImageEmbedded = True
# # #
# # # # 设置是否将表单字段导出为纯文本在 HTML 中显示
# # document.HtmlExportOptions.IsTextInputFormFieldAsText = True
# # # document.XHTMLValidateOption.IsTextInputFormFieldAsText = True
# # #
# # # # 设置是否在 HTML 中导出页眉和页脚
# # document.HtmlExportOptions.HasHeadersFooters = False
# # # document.XHTMLValidateOption.HasHeadersFooters = True
# #
# # # 将 Word 文档保存为 HTML 文件
# # document.SaveToFile("1223.html", FileFormat.Html)
# # #
# document.Close()
from bs4 import BeautifulSoup
# Read the exported HTML file.
with open('D:\\models\\1223.html', 'r', encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML document.
soup = BeautifulSoup(html_content, 'html.parser')

# Tag names treated as headings. A heading's section ends at the next sibling
# element whose tag name is in this list.
heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Collected sections, in document order. Each entry is a dict with the keys
# 'title', 'level' and 'content'. A list is used instead of a dict keyed by
# title so that headings with identical (or empty) text do not overwrite one
# another.
headings = []

# Walk every heading element in the document.
for element in soup.find_all(heading_tags):
    current_heading = {
        'title': element.get_text(strip=True),  # heading text
        'level': int(element.name[1]),          # 'h3' -> 3
        'content': [],
    }
    headings.append(current_heading)

    # Gather the text of the <p>/<div> siblings that follow this heading,
    # stopping at the next heading sibling. Only siblings of the heading are
    # visited; elements at other nesting depths are not.
    next_element = element.find_next_sibling()
    while next_element is not None and next_element.name not in heading_tags:
        if next_element.name in ['p', 'div']:
            current_heading['content'].append(next_element.get_text(strip=False))
        # Advance unconditionally so that siblings of other tag types are
        # skipped rather than looped on forever.
        next_element = next_element.find_next_sibling()

# Print the result: one block per heading, followed by a blank line.
for heading in headings:
    print(f"标题: {heading['title']} (级别: {heading['level']})")
    print("内容:")
    for content in heading['content']:
        print(f" - {content}")
    print()