You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
79 lines
2.6 KiB
79 lines
2.6 KiB
# -*- coding:utf-8 -*-
|
|
# from spire.doc import *
|
|
# from spire.doc.common import *
|
|
#
|
|
# # 创建一个 Document 对象
|
|
# document = Document()
|
|
# # 加载一个 Word DOCX 文档
|
|
# # document.LoadFromFile("C:\\Users\\gy051\\Desktop\\1223.doc")
|
|
# document.LoadFromFile("D:\\数据集\\数据集\\3.doc")
|
|
# print(document.Sections.Count)
|
|
# for i in range(document.Sections.Count):
|
|
# section=document.Sections[i]
|
|
# for x in range(section.Paragraphs.Count):
|
|
# paragraph=section.Paragraphs[x]
|
|
# print(paragraph.Text)
|
|
# print("---------------------------------")
|
|
# # 或加载一个 Word DOC 文档
|
|
# # document.LoadFromFile("1223.xml")
|
|
#
|
|
# # # # 设置是否在 HTML 中嵌入图片
|
|
# # document.HtmlExportOptions.ImageEmbedded = True
|
|
# # # document.XHTMLValidateOption.ImageEmbedded = True
|
|
# # #
|
|
# # # # 设置是否将表单字段导出为纯文本在 HTML 中显示
|
|
# # document.HtmlExportOptions.IsTextInputFormFieldAsText = True
|
|
# # # document.XHTMLValidateOption.IsTextInputFormFieldAsText = True
|
|
# # #
|
|
# # # # 设置是否在 HTML 中导出页眉和页脚
|
|
# # document.HtmlExportOptions.HasHeadersFooters = False
|
|
# # # document.XHTMLValidateOption.HasHeadersFooters = True
|
|
# #
|
|
# # # 将 Word 文档保存为 HTML 文件
|
|
# # document.SaveToFile("1223.html", FileFormat.Html)
|
|
# # #
|
|
# document.Close()
|
|
from bs4 import BeautifulSoup
|
|
# Collect every heading (h1-h6) found in an exported HTML file together with
# the text of the <p>/<div> siblings that follow it, then print each section.

# Tag names treated as section headings; also used to detect where the
# content belonging to the current heading ends.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Read the HTML file.
with open('D:\\models\\1223.html', 'r', encoding="utf-8") as file:
    html_content = file.read()

# Parse the HTML document.
soup = BeautifulSoup(html_content, 'html.parser')

# Result dictionary: unique key -> {'title', 'level', 'content'} record.
# The key is the heading text; repeated headings get a "#<n>" suffix in the
# key only, so the stored 'title' is always the original heading text.
headings = {}
current_heading = None

# Walk over all heading elements in document order.
for element in soup.find_all(HEADING_TAGS):
    level = int(element.name[1])  # heading level: 'h3' -> 3
    title = element.get_text(strip=True)  # heading text

    # Record for the heading currently being processed.
    current_heading = {
        'title': title,
        'level': level,
        'content': [],
    }

    # Keying by the bare title would let a repeated heading overwrite the
    # earlier section, so pick the first key that is still free.
    key = title
    duplicate_index = 1
    while key in headings:
        duplicate_index += 1
        key = f"{title}#{duplicate_index}"
    headings[key] = current_heading

    # Gather the following siblings until the next heading of any level.
    next_element = element.find_next_sibling()
    while next_element is not None and next_element.name not in HEADING_TAGS:
        # Only paragraph-like containers contribute content.
        if next_element.name in ['p', 'div']:
            current_heading['content'].append(next_element.get_text(strip=False))
        next_element = next_element.find_next_sibling()

# Print the result.
for heading in headings.values():
    print(f"标题: {heading['title']} (级别: {heading['level']})")
    print("内容:")
    for content in heading['content']:
        print(f" - {content}")
    print()
|
|
|
|
|
|
|