
File optimization

master
zhouhaibin 5 months ago
commit a1ea54d7f9
  1. UserQueue.py (0)
  2. checkCompanyName.py (158)
  3. checkDocumentError.py (115)
  4. checkPlaceName.py (108)
  5. checkRepeatText.py (113)
  6. checkTitleName.py (74)
  7. main.py (121)
  8. myLogger.py (220)
  9. test.py (172)

UserQueue.py (0)

checkCompanyName.py (158)

@@ -1,14 +1,15 @@
# -*- coding:utf-8 -*-
import time
from docx import Document
from paddlenlp import Taskflow
from docx import Document
from qwen_agent.agents import Assistant
import re
import json_repair
import json
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import requests
from myLogger import outLog
import time
def load_from_xml_v2(baseURI, rels_item_xml):
"""
@@ -28,51 +29,18 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkCompanyName")
prompt = '''
outLog.logger = logging.getLogger("checkCompanyName")
userLog=None
prompt ='''
.根据上述文本判断是否为具体的公司或组织名称你可以使用工具利用互联网查询
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校行业类型其他]选项中选择答案,
回答格式[{companyName名称,"回答":"答案"}{companyName名称,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
'''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
# 'model': 'qwen1.5-72b-chat',
'model': "qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
@@ -81,32 +49,43 @@ bot = Assistant(llm=llm_cfg,
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"
)
def getDocxToTextAll(name):
docxPath=name
document = Document(docxPath)
docxPath = name
loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath)
break
except Exception as e:
time.sleep(1)
pass
# 逐段读取docx文档的内容
levelList=[]
words=[]
addStart = False
levelText=""
words = []
i = 0
for paragraph in document.paragraphs:
# 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法
text = paragraph.text
if text.strip():#非空判断
if text.strip(): # 非空判断
# print("非空")
words.append(text)
# 将所有段落文本拼接成一个字符串,并用换行符分隔
text = '\n'.join(words)
# userLog.info("checkCompanyName----保存文件")
# 将文本写入txt文件
with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
txt_file.write(text)
def companyNameTask(text):
yield "文档公司或组织名称检查---启动中...."
wordtag = Taskflow("knowledge_mining",device_id=0)
batchNum=20
userLog.info("checkCompanyName----启动中....")
batchNum = 20
sentences = re.split(r'[。\n]', text)
# 去掉空字符
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
@@ -122,53 +101,71 @@ def companyNameTask(text):
# 打印每一份的内容
for i, chunk in enumerate(chunks):
yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"
wenBen=".".join(chunk)
userLog.info(f"checkCompanyName----文档解析进度:{i + 1}/{num_chunks}")
try:
res = wordtag(wenBen)
wenBen = ".".join(chunk)
url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
headers = {"Content-Type": "application/json"}
data = {
"data": {
"text": wenBen,
}
}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text)
# userLog.info(res)
# print(res)
except Exception as e:
logging.warning(chunk)
logging.warning("文档公司或组织名称检查---词类分析出错",e)
continue
userLog.warning(chunk)
userLog.warning("文档公司或组织名称检查--错别字识别出错\n")
userLog.warning(e)
return
isplace = False
for zuhe in res[0]['items']:
for zuhe in res["result"]:
# 上一个的地名,这一个还是地名,就和上一个相加代替这个
zhi = zuhe.get("wordtag_label")
if isplace:
name = placeList[len(placeList) - 1]
if zhi.find("组织机构类") >= 0: # or zuhe[1] == "ns"
if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns"
isplace = True
new_text = zuhe['item'].replace("\n", "")
new_text = zuhe[0].replace("\n", "")
placeList[len(placeList) - 1] = name + new_text
continue
if zhi.find("组织机构类") >= 0:
if zuhe[1].find("组织机构类") >= 0:
isplace = True
new_text = zuhe['item'].replace("\n", "")
new_text = zuhe[0].replace("\n", "")
placeList.append(new_text)
else:
isplace = False
# 打印总份数
yield "文档公司或组织名称检查---文档解析完成"
placeList=list(dict.fromkeys(placeList))
userLog.info("checkCompanyName----文档解析完成")
placeList = list(dict.fromkeys(placeList))
yield placeList
def checkCompanyName(filename):
userLog.info(placeList)
def checkCompanyName(filename,user_id):
yield f"文档公司或组织名称检查---开始处理文档..."
global userLog
userLog=outLog.get_queue(user_id, "checkCompanyName")
try:
getDocxToTextAll(filename)
except Exception as e:
logging.warning(e)
userLog.warning(e)
userLog.warning("文档公司或组织名称检查---文档无法打开,请检查文档内容")
yield "文档公司或组织名称检查---文档无法打开,请检查文档内容"
outLog.mark_done(user_id, "checkCompanyName")
return
with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
gettext = f.read()
yield f"文档公司或组织名称检查---开始解析文档..." # 每次生成一个数字就发送
userLog.info("checkCompanyName----开始解析文档...")
for item in companyNameTask(gettext):
if isinstance(item, str):
yield item
else:
final_list = item # 获取最终结果
propnStr = ",".join(final_list)
messages = [{'role': 'user', 'content': [{'text': propnStr+prompt}]}]
messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
runList = []
yield f"文档公司或组织名称检查---结果生成中..." # 每次生成一个数字就发送
cishu = 0
@@ -177,29 +174,34 @@ def checkCompanyName(filename):
if cishu > 3:
cishu = 0
yield "文档公司或组织名称检查---结果生成中" + '.' * cishu
userLog.info(f"checkCompanyName----结果生成中" + '.' * cishu)
cishu += 1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
error_places=[]
error_places = []
for place in parsed_data:
try:
if place['回答'] == '非泛化的公司或组织名称':
error_places.append(place)
except Exception as e:
logging.warning(place)
logging.warning("文档公司或组织名称检查---组织提出出错",e)
userLog.warning(place)
userLog.warning(e)
userLog.warning("文档公司或组织名称检查---组织提出出错")
continue
logging.info(error_places)
userLog.info(error_places)
returnInfo = "发现异常公司或组织名称<br>"
if len(error_places)>0:
if len(error_places) > 0:
for t in error_places:
keyword= t['companyName'].replace("\n","")
# 查找包含关键字的段落
keyword = t['companyName'].replace("\n", "")
# 查找包含关键字的段落
paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
t["yuanwen"]=paragraphs[0]
yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n","")
t["yuanwen"] = paragraphs[0]
yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
returnInfo += "原文:" + yuanwen + "<br>异常公司或组织名称:**" + keyword + "**!请注意" + "<br>"
logging.info(returnInfo)
userLog.info(returnInfo)
yield returnInfo
else:
yield "**未发现异常公司或组织名称**<br>"
userLog.info("**未发现异常公司或组织名称**<br>")
outLog.mark_done(user_id, "checkCompanyName")
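
For reference, a minimal sketch of the two patterns this commit introduces across the check modules: retrying Document() because the uploaded file may not be readable yet, and calling a remotely served Taskflow model over HTTP instead of loading it in-process. The helper names and the default endpoint URL are illustrative assumptions, not code from the commit.
import json
import time

import requests
from docx import Document


def open_docx_with_retry(path, attempts=15, delay=1.0):
    # Try to open a .docx repeatedly; the upload may still be in flight or the file locked.
    last_error = None
    for _ in range(attempts):
        try:
            return Document(path)
        except Exception as e:
            last_error = e
            time.sleep(delay)
    raise Exception("文档读取超时,或文档存在问题无法读取") from last_error


def call_taskflow(text, url="http://127.0.0.1:8191/taskflow/checkPlaceName"):
    # POST one text chunk to a served PaddleNLP Taskflow endpoint and return its "result" field.
    payload = {"data": {"text": text}}
    r = requests.post(url, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
    return json.loads(r.text)["result"]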

checkDocumentError.py (115)

@@ -1,19 +1,15 @@
# -*- coding:utf-8 -*-
# from pycorrector import MacBertCorrector
# m = MacBertCorrector("shibing624/macbert4csc-base-chinese")
from qwen_agent.agents import Assistant
from docx import Document
from pprint import pprint
import re
from paddlenlp import Taskflow
import json
import time
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import asyncio
import requests
from myLogger import outLog
import time
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
@@ -32,41 +28,9 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkDocumentError")
outLog.logger = logging.getLogger("checkDocumentError")
userLog=None
llm_cfg = {
# 'model': 'qwen1.5-72b-chat',
'model': "qwen2-72b",
@@ -83,20 +47,28 @@ bot = Assistant(llm=llm_cfg,
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# '''
prompt = '''
请回答以上问题[]选项中选择答案,原文内容标点符号保持不变如果有错请给出解析没有错则不用给解析
请回答以上问题[]选项中选择答案,原文内容标点符号保持不变如果有错请给出详细的解析没有错则不用给解析
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}]不做过多的解释,严格按回答格式作答;
'''
def getDocxToTextAll(name):
userLog.info("checkDocumentError----打开文档")
docxPath = name
document = Document(docxPath)
loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath)
break
except Exception as e:
time.sleep(1)
pass
# 逐段读取docx文档的内容
levelList = []
words = []
addStart = False
levelText = ""
i = 0
for paragraph in document.paragraphs:
# 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法
@@ -112,17 +84,23 @@ def getDocxToTextAll(name):
txt_file.write(text)
def getDocumentError(filename):
def checkDocumentError(filename,user_id):
global userLog
userLog=outLog.get_queue(user_id,"checkDocumentError")
yield f"文档纠错---开始处理文档..."
userLog.info("checkDocumentError----开始处理文档...")
try:
getDocxToTextAll(filename)
except Exception as e:
logger.warning(e)
yield "文档无法打开,请检查文档内容"
userLog.warning(e)
userLog.warning("文档纠错----文档无法打开,请检查文档内容")
yield "文档纠错----文档无法打开,请检查文档内容"
outLog.mark_done(user_id, "checkDocumentError")
return
with open("checkDocumentError.txt", "r", encoding='utf-8') as f:
gettext = f.read()
yield f"文档纠错---开始解析文档..." # 每次生成一个数字就发送
userLog.info("checkDocumentError----开始解析文档...")
final_list = []
for item in documentErrorTask(gettext):
if isinstance(item, str):
@@ -135,10 +113,13 @@ def getDocumentError(filename):
yuanwen = i["placeName"].replace("\n", "")
jianyi = i["jianyi"].replace("\n", "")
resInfo += "原文:" + yuanwen + "<br>建议:**" + jianyi + "**<br>"
userLog.info(resInfo)
yield resInfo
logger.info(resInfo)
else:
yield "**未发现错别字**"
userLog.info("未发现错别字")
outLog.mark_done(user_id,"checkDocumentError")
def documentErrorTask(text):
@@ -149,7 +130,7 @@ def documentErrorTask(text):
:return: 生成器每次返回一批文本
"""
yield "文档纠错---启动中...."
corrector = Taskflow("text_correction", device_id=1)
userLog.info("checkDocumentError----启动中....")
batchNum = 20
sentences = re.split(r'[。\n]', text)
# 去掉空字符
@@ -162,18 +143,27 @@ def documentErrorTask(text):
# 按batchNum字为一份进行处理
chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
placeList = []
# 打印每一份的内容
err = []
for i, chunk in enumerate(chunks):
yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
userLog.info(f"checkDocumentError----文档解析进度:{i + 1}/{num_chunks}")
try:
res = corrector(chunk)
url = "http://0.0.0.0:8190/taskflow/checkDocumentError"
headers = {"Content-Type": "application/json"}
data = {
"data": {
"text": chunk,
}
}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text)
# print(res)
except Exception as e:
logger.warning(chunk)
logger.warning("文档纠错--错别字识别出错\n", e)
userLog.warning(chunk)
userLog.warning("文档纠错--错别字识别出错\n", e)
continue
lines_with_greeting = [place for place in res if len(place['errors']) > 0]
lines_with_greeting = [place for place in res["result"] if len(place['errors']) > 0]
if len(lines_with_greeting) > 0:
num = 0
wenti = [] # 记录问题的数组
@@ -186,18 +176,20 @@ def documentErrorTask(text):
for key, value in item['correction'].items():
temp_errorWords.append(key)
wenti.append(
"{}原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
"序号:{}原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
num += 1
words = "\n".join(wenti)
messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
runList = []
yield f"文档纠错---内容解析中..." # 每次生成一个数字就发送
userLog.info(f"checkDocumentError----内容解析中...")
cishu = 0
for rsp in bot.run(messages):
runList.append(rsp)
if cishu > 3:
cishu = 0
yield "文档纠错---内容解析中" + '.' * cishu
userLog.info(f"checkDocumentError----内容解析中内容解析中" + '.' * cishu)
cishu += 1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
@@ -209,12 +201,13 @@ def documentErrorTask(text):
place["jianyi"] = place["解析"]
resListerr.append(place)
except Exception as e:
logger.warning(parsed_data)
logger.warning(place)
logger.warning("文档纠错--错别字提取出错\n", e)
userLog.warning(parsed_data)
userLog.warning(place)
userLog.warning("文档纠错--错别字提取出错\n", e)
continue
if (len(resListerr) > 0):
err.extend(resListerr)
# 打印总份数
yield "文档地名检查---文档解析完成"
userLog.info(err)
yield err
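
As a compact illustration of how documentErrorTask consumes the served text_correction output: it keeps only the sentences for which the service reported errors, then collects the suspect characters from each error's correction map. The response shape below (including the "source" key) is an assumption inferred from the code above, not documented behaviour.
def sentences_with_errors(response):
    # Mirror of `[place for place in res["result"] if len(place['errors']) > 0]` above.
    return [item for item in response["result"] if item.get("errors")]


sample = {"result": [
    {"source": "今天天汽很好", "errors": [{"correction": {"汽": "气"}}]},
    {"source": "这一句没有问题", "errors": []},
]}
for hit in sentences_with_errors(sample):
    suspects = [k for err in hit["errors"] for k in err["correction"]]
    print(hit["source"], "->", suspects)   # 今天天汽很好 -> ['汽']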

checkPlaceName.py (108)

@@ -1,15 +1,15 @@
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time
import json
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import requests
import logging
from myLogger import outLog
import time
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
@@ -29,45 +29,10 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkPlaceName")
outLog.logger = logging.getLogger("checkPlaceName")
userLog=None
prompt='''
.上述文本判断地名是否正确你可以使用工具利用互联网查询你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
.上述文本判断地名是否正确你可以使用工具利用互联网查询你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# prompt='''
@@ -87,7 +52,18 @@ bot = Assistant(llm=llm_cfg,
)
#获取全文内容
def getDocxToTextAll(docxPath):
document = Document(docxPath)
loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath)
break
except Exception as e:
time.sleep(1)
pass
# 逐段读取docx文档的内容
levelList=[]
words=[]
@@ -111,7 +87,7 @@ def getDocxToTextAll(docxPath):
#得到全文和地名有关的内容
def placeNameTask(text):
yield "文档地名检查---启动中...."
tagTask = Taskflow("ner",device_id=2)
userLog.info("checkPlaceName----启动中....")
batchNum=20
sentences = re.split(r'[。\n]', text)
# 去掉空字符
@@ -128,16 +104,25 @@ def placeNameTask(text):
# 打印每一份的内容
for i, chunk in enumerate(chunks):
yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"
userLog.info(f"checkPlaceName----文档解析进度:{i + 1}/{num_chunks}")
wenBen=".".join(chunk)
try:
res = tagTask(wenBen)
url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
headers = {"Content-Type": "application/json"}
data = {
"data": {
"text": wenBen,
}
}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text)
except Exception as e:
logger.warning(chunk)
logger.warning("文档地名检查---解析地名出错",e)
userLog.warning(chunk)
userLog.warning("文档地名检查---解析地名出错")
userLog.warning(e)
continue
isplace = False
for zuhe in res:
for zuhe in res["result"]:
# 上一个的地名,这一个还是地名,就和上一个相加代替这个
if isplace:
name = placeList[len(placeList) - 1]
@@ -154,16 +139,22 @@ def placeNameTask(text):
isplace = False
# 打印总份数
yield "文档地名检查---文档解析完成"
userLog.info("checkPlaceName---文档解析完成")
placeList=list(dict.fromkeys(placeList))
yield placeList
#主方法
def checkPlaceName(filename):
def checkPlaceName(filename,user_id):
global userLog
userLog=outLog.get_queue(user_id,"checkPlaceName")
yield f"文档地名检查---开始处理文档..." # 每次生成一个数字就发送
try:
getDocxToTextAll(filename)
except Exception as e:
logger.warning(e)
userLog.warning(e)
yield "文档地名检查---文档无法打开,请检查文档内容"
userLog.warning("文档地名检查---文档无法打开,请检查文档内容")
outLog.mark_done(user_id,"checkPlaceName")
return
with open("checkPlaceName.txt", "r",encoding='utf-8') as f:
gettext = f.read()
@@ -184,6 +175,7 @@ def checkPlaceName(filename):
if cishu>3:
cishu=0
yield "文档地名检查---结果生成中"+'.'*cishu
userLog.info("checkPlaceName---结果生成中"+'.'*cishu)
cishu+=1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
@@ -194,10 +186,12 @@ def checkPlaceName(filename):
if place['回答'] == '错误':
error_places.append(place)
except Exception as e:
logger.warning(place)
logger.warning("文档地名检查---组织提出出错",e)
userLog.warning(parsed_data)
userLog.warning(place)
userLog.warning("文档地名检查---组织提出出错")
userLog.warning(e)
continue
logger.info(error_places)
userLog.info(error_places)
returnInfo = "发现异常地名<br>"
if len(error_places)>0:
for t in error_places:
@@ -206,7 +200,9 @@ def checkPlaceName(filename):
paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","")
returnInfo+="原文:" + yuanwen + "<br>出现异常地名:**" + keyword + "**!请注意" + "<br>"
userLog.info(returnInfo)
yield returnInfo
logger.info(returnInfo)
else:
yield "**未发现发现异常地名**"
userLog.info("未发现发现异常地名")
outLog.mark_done(user_id, "checkPlaceName")
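
The token-merging loop in placeNameTask glues consecutive NER hits back into full place names before de-duplicating them. A self-contained sketch of that merge, assuming the service returns (token, label) pairs as the zuhe[0]/zuhe[1] indexing suggests; the label value "LOC" here is illustrative.
def merge_place_tokens(pairs, tag="LOC"):
    # Merge runs of consecutive tokens whose label contains `tag` into single names.
    places, in_place = [], False
    for token, label in pairs:
        token = token.replace("\n", "")
        if tag in label:
            if in_place and places:
                places[-1] += token      # still inside a place name: extend the previous one
            else:
                places.append(token)     # a new place name starts here
            in_place = True
        else:
            in_place = False
    return list(dict.fromkeys(places))   # drop duplicates, keep first-seen order


print(merge_place_tokens([("北京", "LOC"), ("市", "LOC"), ("的", "O"), ("上海", "LOC")]))
# -> ['北京市', '上海']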

checkRepeatText.py (113)

@@ -5,7 +5,7 @@ from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qwen_agent.agents import Assistant
import json_repair
from paddlenlp import Taskflow
import json
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
device_id=0
import re
@@ -16,41 +16,11 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import logging
import logging.config
import requests
from myLogger import outLog
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkRepeatText")
outLog.logger = logging.getLogger("checkRepeatText")
userLog=None
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
@@ -110,7 +80,18 @@ def isTitle(paragraph):
#寻找标题名称
def findTitleName(docxPath):
yield '文档相似性检查----检查是否存在详细设计方案'
document = Document(docxPath)
loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath)
break
except Exception as e:
time.sleep(1)
pass
# 逐段读取docx文档的内容
titleWords=[]
firstTitle = 0
@@ -161,14 +142,24 @@ def findTitleName(docxPath):
runList.append(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
logger.info(parsed_data)
if(parsed_data["answer"]=="存在"):
yield parsed_data["name"]
else:
yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
#获取文档中 详细设计方案 章节的所有内容
def getDocxToText(docxPath,titleName,vector_store_path):
document = Document(docxPath)
loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath)
break
except Exception as e:
time.sleep(1)
pass
# 逐段读取docx文档的内容
levelList=[]
words=[]
@@ -228,7 +219,9 @@ def getDocxToText(docxPath,titleName,vector_store_path):
# @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename):
def checkRepeatText(filename,user_id):
global userLog
userLog=outLog.get_queue(user_id,"checkRepeatText")
yield "文档相似性检查---启动中...."
vector_store_path="vector_store"+str(uuid.uuid4())
for titleName in findTitleName(filename):
@@ -239,13 +232,11 @@ def checkRepeatText(filename):
words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path)
except Exception as e:
yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败"
userLog.warning(e)
userLog.warning(f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败")
outLog.mark_done(user_id, "checkRepeatText")
return
# 记录程序开始的时间戳‘
global device_id
similarity = Taskflow("text_similarity",device_id=3)
# device_id+=1
# if(device_id>1):
# device_id=0
reslist = []
count = 0
for i in words:
@@ -259,12 +250,23 @@ def checkRepeatText(filename):
if (textTag.find(tag) >= 0):
continue
try:
res = similarity([[i[i.find('') + 1:], text[text.find('') + 1:]]])
url = "http://0.0.0.0:8192/taskflow/checkRepeatText"
headers = {"Content-Type": "application/json"}
data = {
"data": {
"text": [[i[i.find('') + 1:], text[text.find('') + 1:]]],
}
}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text)
# res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]])
except Exception as e:
logger.warning("文档相似性检查--发生异常:",e)
logger.warning(i)
logger.warning(text)
if (res[0]["similarity"] > 0.90):
userLog.warning("文档相似性检查--发生异常:")
userLog.warning(e)
userLog.warning(i)
userLog.warning(text)
continue
if (res["result"][0]["similarity"] > 0.90):
# 判断重复内容是否被放入
if (len(reslist) > 0):
isExist = False
@@ -274,19 +276,20 @@ def checkRepeatText(filename):
break
if not isExist:
# reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:],"similarity":res[0]["similarity"]})
reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
userLog.info("【在"+i[:i.find('')].replace("\n","")+"下包含:"+i[i.find('') + 1:].replace("\n","")+"<br>在"+text[:text.find('')].replace("\n","")+"**下包含:"+text[text.find('') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"")
reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]})
else:
reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]})
# print(i.split(":")[1] + "\n" + text.split(":")[1])
userLog.info("【在"+i[:i.find('')].replace("\n","")+"下包含:"+i[i.find('') + 1:].replace("\n","")+"<br>在"+text[:text.find('')].replace("\n","")+"**下包含:"+text[text.find('') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"")
# vectorstore.delete(ids=uuids)
shutil.rmtree(vector_store_path)
logger.info("已删除")
logger.info(reslist)
resInfo=f"{titleName}章节,发现相似内容:<br>"
if(len(reslist)>0):
for res in reslist:
resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find('')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find('') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find('')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find('') + 1:]+"<br>以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】<br>"
yield resInfo
logger.info(resInfo)
else:
yield "未发现相似内容"
yield "**未发现相似内容**"
userLog.info("文档相似性检查----未发现相似内容**")
outLog.mark_done(user_id, "checkRepeatText")
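
A minimal sketch of the pairwise test the loop above performs: two passages are sent to the served text_similarity model and flagged as near-duplicates above the 0.90 threshold used in the code. The port, request payload, and response shape follow the code above; the host and function name are illustrative.
import json

import requests


def is_near_duplicate(a, b, threshold=0.90, url="http://127.0.0.1:8192/taskflow/checkRepeatText"):
    payload = {"data": {"text": [[a, b]]}}
    r = requests.post(url, headers={"Content-Type": "application/json"}, data=json.dumps(payload))
    score = json.loads(r.text)["result"][0]["similarity"]
    return score > threshold, score


# flagged, score = is_near_duplicate("系统采用B/S架构部署。", "本系统采用B/S架构进行部署。")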

checkTitleName.py (74)

@@ -1,3 +1,5 @@
import time
from docx import Document
from pprint import pprint
from qwen_agent.agents import Assistant
@@ -6,7 +8,7 @@ import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from myLogger import outLog
def load_from_xml_v2(baseURI, rels_item_xml):
"""
@@ -26,41 +28,9 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkCompanyName")
outLog.logger = logging.getLogger("checkTitleName")
userLog=None
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b-instruct",
@@ -113,7 +83,18 @@ def isTitle(paragraph):
#获取文档中 详细设计方案 章节的所有内容
def getDocxToTitleName(docxPath):
document = Document(docxPath)
loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath)
break
except Exception as e:
time.sleep(1)
pass
# 逐段读取docx文档的内容
levelList=[]
words=[]
@@ -130,9 +111,11 @@ def getDocxToTitleName(docxPath):
words.append(text)
return words
def checkTitleName(filename):
def checkTitleName(filename,user_id):
global userLog
userLog=outLog.get_queue(user_id,"checkTitleName")
yield '文档结构检查----启动中'
userLog.info("checkTitleName----启动中")
with open("ce模板.txt", "r",encoding='utf-8') as f:
gettext = f.readlines()
count=0
@@ -140,8 +123,10 @@ def checkTitleName(filename):
try:
word = getDocxToTitleName(filename)
except Exception as e:
print(e)
yield "文档无法打开,请检查文档内容"
userLog.warning(e)
yield "文档结构检查----文档无法打开,请检查文档内容"
outLog.mark_done(user_id, "checkTitleName")
userLog.warning("checkTitleName----文档无法打开,请检查文档内容")
return
for text in gettext:
count+=1
@@ -150,24 +135,25 @@ def checkTitleName(filename):
'''
xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
userLog.info(f"checkTitleName----结构分析中{count}/{len(gettext)}")
strword = "\n".join(word)+prompt+xushang
# print(strword)
messages = [{'role': 'user', 'content': [{'text':strword}]}]
runList = []
cishu = 0
for rsp in bot.run(messages):
runList.append(rsp)
# print(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
print(parsed_data)
if(parsed_data["answer"]=="不存在"):
reserr.append(text)
resInfo="文档结构存在异常:<br>"
if(len(reserr)>0):
for i in reserr:
resInfo+="**"+i.replace('\n','')+"**<br>"
logger.info(resInfo)
userLog.info(resInfo)
yield resInfo
else:
yield "文档结构未发现异常"
userLog.info("文档结构未发现异常")
outLog.mark_done(user_id, "checkTitleName")
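
checkTitleName walks the headings listed in ce模板.txt and asks the LLM whether each one exists in the document. As a simplified, non-LLM stand-in for the same structure check (plain substring matching, not the method used above), a sketch:
def missing_sections(required_headings, doc_headings):
    # Report template headings that no document heading contains.
    missing = []
    for line in required_headings:
        wanted = line.strip()
        if wanted and not any(wanted in h for h in doc_headings):
            missing.append(wanted)
    return missing


required = ["项目概述", "详细设计方案", "测试方案"]
found = ["1 项目概述", "2 系统详细设计方案"]
print(missing_sections(required, found))   # -> ['测试方案']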

main.py (121)

@@ -1,18 +1,21 @@
from flask import Flask, request, jsonify,Response
from flask import Flask, request, jsonify, Response
import os
from checkPlaceName import checkPlaceName
from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName
from checkDocumentError import getDocumentError
from checkDocumentError import checkDocumentError
from checkTitleName import checkTitleName
from flask_cors import CORS
import qwen_agenttext
from myLogger import outLog
import time
app = Flask(__name__)
cros = CORS(app)
UPLOAD_FOLDER = 'uploads'
usableTag=[0,0,0,0,0,0,0,0]
if not os.path.exists(UPLOAD_FOLDER):
os.makedirs(UPLOAD_FOLDER)
@app.route('/upload', methods=['POST'])
def upload_file():
if 'file' not in request.files:
@@ -22,11 +25,13 @@ def upload_file():
return jsonify({"error": "No selected file"}), 400
if file:
filename = file.filename
file.save(os.path.join(UPLOAD_FOLDER,filename))
file.save(os.path.join(UPLOAD_FOLDER, filename))
return jsonify({"message": "File uploaded successfully"}), 200
@app.route('/stream' ,methods=["GET", "POST"])
@app.route('/stream', methods=["GET", "POST"])
def stream_numbers():
context= request.args.get('context')
context = request.args.get('context')
# def generate_numbers():
# event_id=0
# for number in range(1, 10):
@@ -50,22 +55,26 @@ def stream_numbers():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
return Response(qwen_agenttext.getxinx(context),headers=headers)
return Response(qwen_agenttext.getxinx(context), headers=headers)
@app.route('/sse/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
filename = request.args.get('filename')
userId = request.args.get("userId")
def generate_checkRepeatText(filename):
id=0
try:
for i in checkRepeatText(filename):
yield f"id: {id+1}\n"
yield f"event: checkRepeatText\n"
yield f"data: {i}\n\n" # 发送完成信号
except Exception as e:
yield f"id: {id+1}\n"
def generate_checkRepeatText(filename,userId):
id = 0
for i in checkRepeatText(filename,userId):
yield f"id: {id + 1}\n"
yield f"event: checkRepeatText\n"
yield f"data: **程序出现异常**\n\n" # 发送完成信号
yield f"data: {i}\n\n" # 发送完成信号
# except Exception as e:
# yield f"id: {id+1}\n"
# yield f"event: checkRepeatText\n"
# yield f"data: **程序出现异常**\n\n" # 发送完成信号
headers = {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache",
@@ -74,19 +83,20 @@ def checkRepeatTextWeb():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
return Response(generate_checkRepeatText(filename), headers=headers)
return Response(generate_checkRepeatText(filename,userId), headers=headers)
@app.route('/sse/checkPlaceName', methods=['GET'])
def checkPlaceNameWebSse():
filename = request.args.get('filename')
def generate_checkPlaceName(filename):
id=0
for i in checkPlaceName(filename):
yield f"id: {id+1}\n"
userId = request.args.get("userId")
def generate_checkPlaceName(filename,userId):
id = 0
for i in checkPlaceName(filename,userId):
yield f"id: {id + 1}\n"
yield f"event: checkPlaceName\n"
yield f"data: {i}\n\n" # 发送完成信号
headers = {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache",
@@ -95,14 +105,16 @@ def checkPlaceNameWebSse():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
return Response(generate_checkPlaceName(filename), headers=headers)
return Response(generate_checkPlaceName(filename,userId), headers=headers)
@app.route('/sse/checkCompanyName', methods=['GET'])
def checkCompanyNameWebSse():
filename = request.args.get('filename')
def generate_checkCompanyName(filename):
userId = request.args.get("userId")
def generate_checkCompanyName(filename,userId):
id = 0
for i in checkCompanyName(filename):
for i in checkCompanyName(filename,userId):
yield f"id: {id + 1}\n"
yield f"event: checkCompanyName\n"
yield f"data: {i}\n\n" # 发送完成信号
@@ -115,17 +127,18 @@ def checkCompanyNameWebSse():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
return Response(generate_checkCompanyName(filename), headers=headers)
return Response(generate_checkCompanyName(filename,userId), headers=headers)
@app.route('/sse/checkDocumentErrorWeb', methods=['GET'])
def checkDocumentErrorWebSse():
filename = request.args.get('filename')
def generate_checkDocumentError(filename):
userId = request.args.get("userId")
def generate_checkDocumentError(filename,userId):
id = 0
for i in getDocumentError(filename):
for i in checkDocumentError(filename,userId):
yield f"id: {id + 1}\n"
yield f"event: getDocumentError\n"
yield f"event: checkDocumentError\n"
yield f"data: {i}\n\n" # 发送完成信号
headers = {
@@ -136,14 +149,16 @@ def checkDocumentErrorWebSse():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
return Response(generate_checkDocumentError(filename), headers=headers)
return Response(generate_checkDocumentError(filename,userId), headers=headers)
@app.route('/sse/checkTitleName', methods=['GET'])
def checkTitleNameWebSse():
filename = request.args.get('filename')
def generate_checkTitleName(filename):
userId = request.args.get("userId")
def generate_checkTitleName(filename,userId):
id = 0
for i in checkTitleName(filename):
for i in checkTitleName(filename,userId):
yield f"id: {id + 1}\n"
yield f"event: checkTitleName\n"
yield f"data: {i}\n\n" # 发送完成信号
@@ -156,6 +171,36 @@ def checkTitleNameWebSse():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
return Response(generate_checkTitleName(filename), headers=headers)
return Response(generate_checkTitleName(filename,userId), headers=headers)
@app.route('/sse/getLog', methods=['GET'])
def getlog():
userId = request.args.get("userId")
def generate_getLog(userId):
time.sleep(1)
id = 0
while True:
if outLog.is_done(userId):
break
q = outLog.get_queueData(userId)
if q:
id+=1
text = q.pop(0)
yield f"id: {id}\n"
yield f"event: getlog\n"
yield f"data: {text}\n\n" # 发送完成信号
yield f"id: {id}\n"
yield f"event: getlog\n"
yield f"data: 任务结束!!!!!\n\n" # 发送完成信号
outLog.del_queue(userId)
headers = {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no",
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
return Response(generate_getLog(userId), headers=headers)
if __name__ == '__main__':
app.run(host="0.0.0.0",port=80)
app.run(host="0.0.0.0", port=80)
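
The new endpoints are all user-scoped: a client starts a check with its userId and, in parallel, follows /sse/getLog for the same userId until the closing "任务结束" event. A rough client sketch using requests' streaming API; the host, filename, and userId values are placeholders.
import threading

import requests

BASE = "http://127.0.0.1"
PARAMS = {"filename": "demo.docx", "userId": "user-123"}


def follow(path, params):
    # Print the data: lines of one server-sent-event stream.
    with requests.get(BASE + path, params=params, stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if line and line.startswith("data:"):
                print(path, line[len("data:"):].strip())


# Read the shared log stream in the background while the check itself runs.
log_thread = threading.Thread(target=follow, args=("/sse/getLog", {"userId": PARAMS["userId"]}))
log_thread.start()
follow("/sse/checkPlaceName", PARAMS)
log_thread.join()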

myLogger.py (220)

@@ -0,0 +1,220 @@
# -*- coding: utf-8 -*-
"""
@author: bingyl123@163.com
@version: 1.0.0
@file: OutLog.py
@time: 2023/2/23 20:25
"""
# import logging
# import logging.config
# import re
# import datetime
# import queue
#
#
# class OutLog:
# _instance = None
# logger = None
#
# def __new__(cls):
# if cls._instance is None:
# cls._instance = super(OutLog, cls).__new__(cls)
# cls.logger = logging.getLogger("app") # 默认logger名称为"app"
# cls._instance.queue_dict = {}
# cls._instance.done_dict = {}
# return cls._instance
#
# def get_queue(self, user_id):
# if user_id not in self.queue_dict:
# self.queue_dict[user_id] = []
# self.done_dict[user_id] = {} # 初始化为未完成的字典
# return self.queue_dict[user_id]
#
# def mark_done(self, user_id, producer_name):
# self.done_dict[user_id][producer_name] = True
#
# def is_done(self, user_id):
# return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成
# @staticmethod
# def put(item: str, level="INFO"):
# dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# mq.put(f"{dtf}[{level}]: {item}")
#
# @staticmethod
# def debug(item, log=True):
# OutLog.put(item, level="DEBUG")
# if log:
# OutLog._instance.logger.debug(item)
#
# @staticmethod
# def info(item, log=True):
# OutLog.put(item, level="INFO")
# if log:
# OutLog._instance.logger.info(item)
#
# @staticmethod
# def warning(item, log=True):
# OutLog.put(item, level="WARNING")
# if log:
# OutLog._instance.logger.warning(item)
#
# @staticmethod
# def error(item, log=True):
# OutLog.put(item, level="ERROR")
# if log:
# OutLog._instance.logger.error(item)
#
# @staticmethod
# def critical(item, log=True):
# OutLog.put(item, level="CRITICAL")
# if log:
# OutLog._instance.logger.critical(item)
#
#
#
# # 日志配置
# log_config = {
# 'version': 1,
# 'disable_existing_loggers': False,
# 'formatters': {
# 'standard': {
# 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
# },
# },
# 'handlers': {
# 'console': {
# 'class': 'logging.StreamHandler',
# 'formatter': 'standard',
# 'level': logging.INFO,
# },
# 'file': {
# 'class': 'logging.FileHandler',
# 'filename': 'Logger.log',
# 'formatter': 'standard',
# 'level': logging.WARNING,
# },
# },
# 'loggers': {
# '': {
# 'handlers': ['console', 'file'],
# 'level': logging.WARNING,
# 'propagate': True,
# },
# }
# }
#
# logging.config.dictConfig(log_config)
#
# outLog = OutLog() # 获取单例实例
import logging
import logging.config
import datetime
class OutLog:
_instance = None
logger = None
def __new__(cls):
if cls._instance is None:
cls._instance = super(OutLog, cls).__new__(cls)
cls.logger = logging.getLogger("app") # 默认logger名称为"app"
cls._instance.queue_dict = {}
cls._instance.done_dict = {}
return cls._instance
def get_queue(self, user_id,producer_name):
if user_id not in self.queue_dict:
self.queue_dict[user_id] = []
self.done_dict[user_id] = {} # 初始化为未完成的字典
if user_id not in self.done_dict:
self.done_dict[user_id][producer_name] = False
return self.UserLogger(user_id)
def get_queueData(self, user_id):
if user_id in self.queue_dict:
return OutLog._instance.queue_dict[self.user_id]
def del_queue(self,user_id):
if self.is_done(user_id):
del self.queue_dict[user_id]
del self.done_dict[user_id]
class UserLogger:
def __init__(self, user_id):
self.user_id = user_id
self.logger = OutLog._instance.logger
def log(self, item: str, level: str):
dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_entry = f"{dtf}[{level}]: {item}"
OutLog._instance.queue_dict[self.user_id].append(log_entry) # 保存到对应用户的队列
self._log_to_logger(item, level)
def _log_to_logger(self, item: str, level: str):
if level == "DEBUG":
self.logger.debug(item)
elif level == "INFO":
self.logger.info(item)
elif level == "WARNING":
self.logger.warning(item)
elif level == "ERROR":
self.logger.error(item)
elif level == "CRITICAL":
self.logger.critical(item)
def info(self, item: str):
self.log(item, "INFO")
def warning(self, item: str):
self.log(item, "WARNING")
def debug(self, item: str):
self.log(item, "DEBUG")
def error(self, item: str):
self.log(item, "ERROR")
def critical(self, item: str):
self.log(item, "CRITICAL")
def mark_done(self, user_id, producer_name):
self.done_dict[user_id][producer_name] = True
def is_done(self, user_id):
return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成
# 日志配置
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.WARNING,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.WARNING,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
outLog = OutLog() # 获取单例实例
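
In the check modules above, OutLog is used roughly like this: each check obtains a per-user UserLogger, writes progress into that user's queue (which /sse/getLog drains), and finally marks itself done so is_done()/del_queue() can close the stream. The user_id and check name below are placeholders.
from myLogger import outLog

user_log = outLog.get_queue("user-123", "checkPlaceName")   # per-user, per-check logger
user_log.info("checkPlaceName----启动中....")                # queued for SSE and sent to the app logger
pending = outLog.get_queueData("user-123")                  # lines waiting to be streamed (see /sse/getLog)
outLog.mark_done("user-123", "checkPlaceName")              # this producer is finished
if outLog.is_done("user-123"):                              # all producers done for this user
    outLog.del_queue("user-123")                            # release the per-user state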

test.py (172)

@@ -1,109 +1,79 @@
import time
import json
import math
from flask import Flask,Response,request
from flask_sse import sse
from flask_cors import CORS
import re
import qwen_agenttext
app = Flask(__name__)
cros = CORS(app)
# SSE 推送函数
import paddle;
paddle.device.get_available_device()
# -*- coding:utf-8 -*-
# from spire.doc import *
# from spire.doc.common import *
#
# # 创建一个 Document 对象
# document = Document()
# # 加载一个 Word DOCX 文档
# # document.LoadFromFile("C:\\Users\\gy051\\Desktop\\1223.doc")
# document.LoadFromFile("D:\\数据集\\数据集\\3.doc")
# print(document.Sections.Count)
# for i in range(document.Sections.Count):
# section=document.Sections[i]
# for x in range(section.Paragraphs.Count):
# paragraph=section.Paragraphs[x]
# print(paragraph.Text)
# print("---------------------------------")
# # 或加载一个 Word DOC 文档
# # document.LoadFromFile("1223.xml")
#
# # # # 设置是否在 HTML 中嵌入图片
# # document.HtmlExportOptions.ImageEmbedded = True
# # # document.XHTMLValidateOption.ImageEmbedded = True
# # #
# # # # 设置是否将表单字段导出为纯文本在 HTML 中显示
# # document.HtmlExportOptions.IsTextInputFormFieldAsText = True
# # # document.XHTMLValidateOption.IsTextInputFormFieldAsText = True
# # #
# # # # 设置是否在 HTML 中导出页眉和页脚
# # document.HtmlExportOptions.HasHeadersFooters = False
# # # document.XHTMLValidateOption.HasHeadersFooters = True
# #
# # # 将 Word 文档保存为 HTML 文件
# # document.SaveToFile("1223.html", FileFormat.Html)
# # #
# document.Close()
from bs4 import BeautifulSoup
# 读取HTML文件
with open('D:\\models\\1223.html', 'r',encoding="utf-8") as file:
html_content = file.read()
# 解析HTML文档
soup = BeautifulSoup(html_content, 'html.parser')
# SSE 推送路由
# 用于存储结果的字典
headings = {}
current_heading = None
# 遍历所有的h1, h2, h3等标题
for element in soup.find_all(['h1', 'h2', 'h3',"h4","h5","h6"]):
level = int(element.name[1]) # 获取标题级别
title = element.get_text(strip=True) # 获取标题文本
# @app.route('/register', methods=["GET"])
# def register():
# 获取客户端标识符
# client_id = str(uuid.uuid4())
#
# # 返回 SSE 响应
# return jsonify({"client_id": client_id})
# 设置当前标题
current_heading = {
'title': title,
'level': level,
'content': []
}
# 将当前标题添加到字典中
headings[title] = current_heading
# SSE 推送路由
# 寻找当前标题下的内容
next_element = element.find_next_sibling()
while next_element and next_element.name not in ['h1', 'h2', 'h3',"h4","h5","h6"]:
# 判断内容的标签
if next_element.name in ['p', 'div']:
current_heading['content'].append(next_element.get_text(strip=False))
next_element = next_element.find_next_sibling()
# 输出结果
for heading in headings.values():
print(f"标题: {heading['title']} (级别: {heading['level']})")
print("内容:")
for content in heading['content']:
print(f" - {content}")
print()
# @app.route('/sse', methods=['POST'])
# def stream():
# # 获取客户端标识符
# client_id = 1
# print("client_id", client_id)
#
# def aa():
# # 循环发送 SSE 数据
# for i in range(10):
# data = 'Hello, %s!' % client_id + str(i)
# print(data)
# sse.publish(data, channel=client_id, type='message')
# time.sleep(1)
# sse.publish("end", channel=client_id, type='message')
#
# # 返回 SSE 响应
# response = Response(aa(), mimetype='text/event-stream')
# response.headers.add('Cache-Control', 'no-cache')
# response.headers.add('Connection', 'keep-alive')
# response.headers.add('X-Accel-Buffering', 'no')
# return response
#
#
#
# @app.route('/stream' ,methods=["GET", "POST"])
# def stream_numbers():
# context= request.args.get('context')
#
#
# headers = {
# "Content-Type": "text/event-stream",
# "Cache-Control": "no-cache",
# "X-Accel-Buffering": "no",
# "Access-Control-Allow-Origin": "*",
# "Access-Control-Allow-Methods": "GET,POST",
# "Access-Control-Allow-Headers": "x-requested-with,content-type",
# }
# return Response(generate_numbers(),headers=headers)
# def generate_numbers():
# event_id=0
# # for number in range(1, 10):
# # json_data = json.dumps({"number": number})
# # print(json_data)
# # event_id += 1
# # yield f"id: {event_id}\n"
# # yield f"event: time-update\n"
# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送
# json_data = json.dumps({"number": "done"})
# yield f"id: {1}\n"
# yield f"event: time-update\n"
# yield f"data: 34568\n\n" # 发送完成信号
# if __name__ == '__main__':
#
#
# # 读取文件内容
# with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
# gettext = f.read()
# batchNum=20
# sentences = re.split(r'[。\n]', gettext)
# # 去掉空字符
# sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
# # 计算总字符数
# total_chars = len(sentences)
#
# # 计算有多少份
# num_chunks = math.ceil(total_chars / batchNum)
#
# # 按batchNum字为一份进行处理
# chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
#
# # 打印每一份的内容
# for i, chunk in enumerate(chunks):
# print(f"Chunk {i + 1}:")
# print(chunk)
# print("-" * 40)
#
# # 打印总份数
# print(f"Total chunks: {num_chunks}")
# app.run(debug=True,port=80)