diff --git a/UserQueue.py b/UserQueue.py new file mode 100644 index 0000000..e69de29 diff --git a/checkCompanyName.py b/checkCompanyName.py index ea80b47..4d2f1fd 100644 --- a/checkCompanyName.py +++ b/checkCompanyName.py @@ -1,14 +1,15 @@ # -*- coding:utf-8 -*- -import time -from docx import Document -from paddlenlp import Taskflow +from docx import Document from qwen_agent.agents import Assistant import re import json_repair +import json import math from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml - +import requests +from myLogger import outLog +import time def load_from_xml_v2(baseURI, rels_item_xml): """ @@ -28,51 +29,18 @@ def load_from_xml_v2(baseURI, rels_item_xml): _SerializedRelationships.load_from_xml = load_from_xml_v2 - import logging -import logging.config -log_config = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'standard': { - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - }, - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'standard', - 'level': logging.INFO, - }, - 'file': { - 'class': 'logging.FileHandler', - 'filename': 'Logger.log', - 'formatter': 'standard', - 'level': logging.INFO, - }, - }, - 'loggers': { - '': { - 'handlers': ['console', 'file'], - 'level': logging.INFO, - 'propagate': True, - }, - } -} - -logging.config.dictConfig(log_config) - -logger = logging.getLogger("checkCompanyName") -prompt = ''' +outLog.logger = logging.getLogger("checkCompanyName") +userLog=None +prompt =''' .根据上述文本判断,是否为具体的公司或组织名称,你可以使用工具利用互联网查询, 你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校,行业类型,其他]选项中选择答案, 回答格式[{“companyName”:“名称”,"回答":"答案"},{“companyName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; ''' llm_cfg = { - #'model': 'qwen1.5-72b-chat', - 'model':"qwen2-72b", + # 'model': 'qwen1.5-72b-chat', + 'model': "qwen2-72b", 'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base # 
'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', } @@ -81,32 +49,43 @@ bot = Assistant(llm=llm_cfg, # system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具" ) + def getDocxToTextAll(name): - docxPath=name - document = Document(docxPath) + docxPath = name + loopCount = 0 + while True: + loopCount+=1 + if(loopCount>=15): + raise Exception("文档读取超时,或文档存在问题无法读取") + break + try: + document = Document(docxPath) + break + except Exception as e: + time.sleep(1) + pass # 逐段读取docx文档的内容 - levelList=[] - words=[] - addStart = False - levelText="" + words = [] i = 0 for paragraph in document.paragraphs: # 判断该段落的标题级别 # 这里用isTitle()临时代表,具体见下文介绍的方法 text = paragraph.text - if text.strip():#非空判断 + if text.strip(): # 非空判断 # print("非空") words.append(text) # 将所有段落文本拼接成一个字符串,并用换行符分隔 text = '\n'.join(words) - + # userLog.info("checkCompanyName----保存文件") # 将文本写入txt文件 with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file: txt_file.write(text) + + def companyNameTask(text): yield "文档公司或组织名称检查---启动中...." 
- wordtag = Taskflow("knowledge_mining",device_id=0) - batchNum=20 + userLog.info("checkCompanyName----启动中....") + batchNum = 20 sentences = re.split(r'[。\n]', text) # 去掉空字符 sentences = [sentence.strip() for sentence in sentences if sentence.strip()] @@ -122,53 +101,71 @@ def companyNameTask(text): # 打印每一份的内容 for i, chunk in enumerate(chunks): yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}" - - wenBen=".".join(chunk) + userLog.info(f"checkCompanyName----文档解析进度:{i + 1}/{num_chunks}") try: - res = wordtag(wenBen) + wenBen = ".".join(chunk) + url = "http://0.0.0.0:8191/taskflow/checkPlaceName" + headers = {"Content-Type": "application/json"} + data = { + "data": { + "text": wenBen, + } + } + r = requests.post(url=url, headers=headers, data=json.dumps(data)) + res = json.loads(r.text) + # userLog.info(res) + # print(res) except Exception as e: - logging.warning(chunk) - logging.warning("文档公司或组织名称检查---词类分析出错",e) - continue + userLog.warning(chunk) + userLog.warning("文档公司或组织名称检查--错别字识别出错\n") + userLog.warning(e) + return isplace = False - for zuhe in res[0]['items']: + for zuhe in res["result"]: # 上一个的地名,这一个还是地名,就和上一个相加代替这个 - zhi = zuhe.get("wordtag_label") if isplace: name = placeList[len(placeList) - 1] - if zhi.find("组织机构类") >= 0: # or zuhe[1] == "ns" + if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns" isplace = True - new_text = zuhe['item'].replace("\n", "") + new_text = zuhe[0].replace("\n", "") placeList[len(placeList) - 1] = name + new_text continue - if zhi.find("组织机构类") >= 0: + if zuhe[1].find("组织机构类") >= 0: isplace = True - new_text = zuhe['item'].replace("\n", "") + new_text = zuhe[0].replace("\n", "") placeList.append(new_text) else: isplace = False # 打印总份数 yield "文档公司或组织名称检查---文档解析完成" - placeList=list(dict.fromkeys(placeList)) + userLog.info("checkCompanyName----文档解析完成") + placeList = list(dict.fromkeys(placeList)) yield placeList -def checkCompanyName(filename): + userLog.info(placeList) + +def checkCompanyName(filename,user_id): yield 
f"文档公司或组织名称检查---开始处理文档..." + global userLog + userLog=outLog.get_queue(user_id, "checkCompanyName") try: getDocxToTextAll(filename) except Exception as e: - logging.warning(e) + userLog.warning(e) + userLog.warning("文档公司或组织名称检查---文档无法打开,请检查文档内容") yield "文档公司或组织名称检查---文档无法打开,请检查文档内容" + outLog.mark_done(user_id, "checkCompanyName") return with open("checkCompanyName.txt", "r", encoding='utf-8') as f: gettext = f.read() yield f"文档公司或组织名称检查---开始解析文档..." # 每次生成一个数字就发送 + userLog.info("checkCompanyName----开始解析文档...") for item in companyNameTask(gettext): if isinstance(item, str): yield item else: final_list = item # 获取最终结果 propnStr = ",".join(final_list) - messages = [{'role': 'user', 'content': [{'text': propnStr+prompt}]}] + messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}] runList = [] yield f"文档公司或组织名称检查---结果生成中..." # 每次生成一个数字就发送 cishu = 0 @@ -177,29 +174,34 @@ def checkCompanyName(filename): if cishu > 3: cishu = 0 yield "文档公司或组织名称检查---结果生成中" + '.' * cishu + userLog.info(f"checkCompanyName----结果生成中" + '.' * cishu) cishu += 1 data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) - error_places=[] + error_places = [] + for place in parsed_data: try: if place['回答'] == '非泛化的公司或组织名称': error_places.append(place) except Exception as e: - logging.warning(place) - logging.warning("文档公司或组织名称检查---组织提出出错",e) + userLog.warning(place) + userLog.warning(e) + userLog.warning("文档公司或组织名称检查---组织提出出错") continue - logging.info(error_places) + userLog.info(error_places) returnInfo = "发现异常公司或组织名称
" - if len(error_places)>0: + if len(error_places) > 0: for t in error_places: - keyword= t['companyName'].replace("\n","") - # 查找包含关键字的段落 + keyword = t['companyName'].replace("\n", "") + # 查找包含关键字的段落 paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext) - t["yuanwen"]=paragraphs[0] - yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n","") + t["yuanwen"] = paragraphs[0] + yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "") returnInfo += "原文:" + yuanwen + "
异常公司或组织名称:**" + keyword + "**!请注意" + "
" - logging.info(returnInfo) + userLog.info(returnInfo) yield returnInfo else: - yield "**未发现异常公司或组织名称**
" \ No newline at end of file + yield "**未发现异常公司或组织名称**
" + userLog.info("**未发现异常公司或组织名称**
") + outLog.mark_done(user_id, "checkCompanyName") \ No newline at end of file diff --git a/checkDocumentError.py b/checkDocumentError.py index 2f4614b..33d7ed4 100644 --- a/checkDocumentError.py +++ b/checkDocumentError.py @@ -1,19 +1,15 @@ # -*- coding:utf-8 -*- -# from pycorrector import MacBertCorrector -# m = MacBertCorrector("shibing624/macbert4csc-base-chinese") from qwen_agent.agents import Assistant from docx import Document -from pprint import pprint import re -from paddlenlp import Taskflow import json -import time import json_repair import math from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml - -import asyncio +import requests +from myLogger import outLog +import time def load_from_xml_v2(baseURI, rels_item_xml): """ Return |_SerializedRelationships| instance loaded with the @@ -32,41 +28,9 @@ def load_from_xml_v2(baseURI, rels_item_xml): _SerializedRelationships.load_from_xml = load_from_xml_v2 import logging -import logging.config - -log_config = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'standard': { - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - }, - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'standard', - 'level': logging.INFO, - }, - 'file': { - 'class': 'logging.FileHandler', - 'filename': 'Logger.log', - 'formatter': 'standard', - 'level': logging.INFO, - }, - }, - 'loggers': { - '': { - 'handlers': ['console', 'file'], - 'level': logging.INFO, - 'propagate': True, - }, - } -} -logging.config.dictConfig(log_config) - -logger = logging.getLogger("checkDocumentError") +outLog.logger = logging.getLogger("checkDocumentError") +userLog=None llm_cfg = { # 'model': 'qwen1.5-72b-chat', 'model': "qwen2-72b", @@ -83,20 +47,28 @@ bot = Assistant(llm=llm_cfg, # 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答; # ''' prompt = ''' 
-请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出解析,没有错则不用给解析 +请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出详细的解析,没有错则不用给解析 回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}],不做过多的解释,严格按回答格式作答; ''' def getDocxToTextAll(name): + userLog.info("checkDocumentError----打开文档") docxPath = name - document = Document(docxPath) + loopCount = 0 + while True: + loopCount+=1 + if(loopCount>=15): + raise Exception("文档读取超时,或文档存在问题无法读取") + break + try: + document = Document(docxPath) + break + except Exception as e: + time.sleep(1) + pass # 逐段读取docx文档的内容 - levelList = [] words = [] - addStart = False - levelText = "" - i = 0 for paragraph in document.paragraphs: # 判断该段落的标题级别 # 这里用isTitle()临时代表,具体见下文介绍的方法 @@ -112,17 +84,23 @@ def getDocxToTextAll(name): txt_file.write(text) -def getDocumentError(filename): +def checkDocumentError(filename,user_id): + global userLog + userLog=outLog.get_queue(user_id,"checkDocumentError") yield f"文档纠错---开始处理文档..." + userLog.info("checkDocumentError----开始处理文档...") try: getDocxToTextAll(filename) except Exception as e: - logger.warning(e) - yield "文档无法打开,请检查文档内容" + userLog.warning(e) + userLog.warning("文档纠错----文档无法打开,请检查文档内容") + yield "文档纠错----文档无法打开,请检查文档内容" + outLog.mark_done(user_id, "checkDocumentError") return with open("checkDocumentError.txt", "r", encoding='utf-8') as f: gettext = f.read() yield f"文档纠错---开始解析文档..." # 每次生成一个数字就发送 + userLog.info("checkDocumentError----开始解析文档...") final_list = [] for item in documentErrorTask(gettext): if isinstance(item, str): @@ -135,10 +113,13 @@ def getDocumentError(filename): yuanwen = i["placeName"].replace("\n", "") jianyi = i["jianyi"].replace("\n", "") resInfo += "原文:" + yuanwen + "
建议:**" + jianyi + "**
" + userLog.info(resInfo) yield resInfo - logger.info(resInfo) + else: yield "**未发现错别字**" + userLog.info("未发现错别字") + outLog.mark_done(user_id,"checkDocumentError") def documentErrorTask(text): @@ -149,7 +130,7 @@ def documentErrorTask(text): :return: 生成器,每次返回一批文本 """ yield "文档纠错---启动中...." - corrector = Taskflow("text_correction", device_id=1) + userLog.info("checkDocumentError----启动中....") batchNum = 20 sentences = re.split(r'[。\n]', text) # 去掉空字符 @@ -162,18 +143,27 @@ def documentErrorTask(text): # 按batchNum字为一份进行处理 chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)] - placeList = [] # 打印每一份的内容 err = [] for i, chunk in enumerate(chunks): yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}" + userLog.info(f"checkDocumentError----文档解析进度:{i + 1}/{num_chunks}") try: - res = corrector(chunk) + url = "http://0.0.0.0:8190/taskflow/checkDocumentError" + headers = {"Content-Type": "application/json"} + data = { + "data": { + "text": chunk, + } + } + r = requests.post(url=url, headers=headers, data=json.dumps(data)) + res = json.loads(r.text) + # print(res) except Exception as e: - logger.warning(chunk) - logger.warning("文档纠错--错别字识别出错\n", e) + userLog.warning(chunk) + userLog.warning("文档纠错--错别字识别出错\n", e) continue - lines_with_greeting = [place for place in res if len(place['errors']) > 0] + lines_with_greeting = [place for place in res["result"] if len(place['errors']) > 0] if len(lines_with_greeting) > 0: num = 0 wenti = [] # 记录问题的数组 @@ -186,18 +176,20 @@ def documentErrorTask(text): for key, value in item['correction'].items(): temp_errorWords.append(key) wenti.append( - "{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords))) + "序号:{},原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords))) num += 1 words = "\n".join(wenti) messages = [{'role': 'user', 'content': [{'text': words + prompt}]}] runList = [] yield f"文档纠错---内容解析中..." 
# 每次生成一个数字就发送 + userLog.info(f"checkDocumentError----内容解析中...") cishu = 0 for rsp in bot.run(messages): runList.append(rsp) if cishu > 3: cishu = 0 yield "文档纠错---内容解析中" + '.' * cishu + userLog.info(f"checkDocumentError----内容解析中内容解析中" + '.' * cishu) cishu += 1 data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace("\\", "").replace('`', '')) @@ -209,12 +201,13 @@ def documentErrorTask(text): place["jianyi"] = place["解析"] resListerr.append(place) except Exception as e: - logger.warning(parsed_data) - logger.warning(place) - logger.warning("文档纠错--错别字提取出错\n", e) + userLog.warning(parsed_data) + userLog.warning(place) + userLog.warning("文档纠错--错别字提取出错\n", e) continue if (len(resListerr) > 0): err.extend(resListerr) # 打印总份数 yield "文档地名检查---文档解析完成" - yield err \ No newline at end of file + userLog.info(err) + yield err diff --git a/checkPlaceName.py b/checkPlaceName.py index 5b31aa8..851827d 100644 --- a/checkPlaceName.py +++ b/checkPlaceName.py @@ -1,15 +1,15 @@ from docx import Document -from paddlenlp import Taskflow -from pprint import pprint from qwen_agent.agents import Assistant import re import json_repair -import time +import json import math from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml - - +import requests +import logging +from myLogger import outLog +import time def load_from_xml_v2(baseURI, rels_item_xml): """ Return |_SerializedRelationships| instance loaded with the @@ -29,45 +29,10 @@ def load_from_xml_v2(baseURI, rels_item_xml): _SerializedRelationships.load_from_xml = load_from_xml_v2 -import logging -import logging.config - -log_config = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'standard': { - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - }, - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'standard', - 'level': logging.INFO, - }, - 'file': { - 'class': 
'logging.FileHandler', - 'filename': 'Logger.log', - 'formatter': 'standard', - 'level': logging.INFO, - }, - }, - 'loggers': { - '': { - 'handlers': ['console', 'file'], - 'level': logging.INFO, - 'propagate': True, - }, - } -} - -logging.config.dictConfig(log_config) - -logger = logging.getLogger("checkPlaceName") - +outLog.logger = logging.getLogger("checkPlaceName") +userLog=None prompt=''' -.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; +.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; 不做过多的解释,严格按回答格式作答; ''' # prompt=''' @@ -87,7 +52,18 @@ bot = Assistant(llm=llm_cfg, ) #获取全文内容 def getDocxToTextAll(docxPath): - document = Document(docxPath) + loopCount = 0 + while True: + loopCount+=1 + if(loopCount>=15): + raise Exception("文档读取超时,或文档存在问题无法读取") + break + try: + document = Document(docxPath) + break + except Exception as e: + time.sleep(1) + pass # 逐段读取docx文档的内容 levelList=[] words=[] @@ -111,7 +87,7 @@ def getDocxToTextAll(docxPath): #得到全文和地名有关的内容 def placeNameTask(text): yield "文档地名检查---启动中...." 
- tagTask = Taskflow("ner",device_id=2) + userLog.info("checkPlaceName----启动中....") batchNum=20 sentences = re.split(r'[。\n]', text) # 去掉空字符 @@ -128,16 +104,25 @@ def placeNameTask(text): # 打印每一份的内容 for i, chunk in enumerate(chunks): yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}" - + userLog.info(f"checkPlaceName----文档解析进度:{i + 1}/{num_chunks}") wenBen=".".join(chunk) try: - res = tagTask(wenBen) + url = "http://0.0.0.0:8191/taskflow/checkPlaceName" + headers = {"Content-Type": "application/json"} + data = { + "data": { + "text": wenBen, + } + } + r = requests.post(url=url, headers=headers, data=json.dumps(data)) + res = json.loads(r.text) except Exception as e: - logger.warning(chunk) - logger.warning("文档地名检查---解析地名出错",e) + userLog.warning(chunk) + userLog.warning("文档地名检查---解析地名出错") + userLog.warning(e) continue isplace = False - for zuhe in res: + for zuhe in res["result"]: # 上一个的地名,这一个还是地名,就和上一个相加代替这个 if isplace: name = placeList[len(placeList) - 1] @@ -154,16 +139,22 @@ def placeNameTask(text): isplace = False # 打印总份数 yield "文档地名检查---文档解析完成" + userLog.info("checkPlaceName---文档解析完成") placeList=list(dict.fromkeys(placeList)) yield placeList + #主方法 -def checkPlaceName(filename): +def checkPlaceName(filename,user_id): + global userLog + userLog=outLog.get_queue(user_id,"checkPlaceName") yield f"文档地名检查---开始处理文档..." 
# 每次生成一个数字就发送 try: getDocxToTextAll(filename) except Exception as e: - logger.warning(e) + userLog.warning(e) yield "文档地名检查---文档无法打开,请检查文档内容" + userLog.warning("文档地名检查---文档无法打开,请检查文档内容") + outLog.mark_done(user_id,"checkPlaceName") return with open("checkPlaceName.txt", "r",encoding='utf-8') as f: gettext = f.read() @@ -184,6 +175,7 @@ def checkPlaceName(filename): if cishu>3: cishu=0 yield "文档地名检查---结果生成中"+'.'*cishu + userLog.info("checkPlaceName---结果生成中"+'.'*cishu) cishu+=1 data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) @@ -194,10 +186,12 @@ def checkPlaceName(filename): if place['回答'] == '错误': error_places.append(place) except Exception as e: - logger.warning(place) - logger.warning("文档地名检查---组织提出出错",e) + userLog.warning(parsed_data) + userLog.warning(place) + userLog.warning("文档地名检查---组织提出出错") + userLog.warning(e) continue - logger.info(error_places) + userLog.info(error_places) returnInfo = "发现异常地名
" if len(error_places)>0: for t in error_places: @@ -206,7 +200,9 @@ def checkPlaceName(filename): paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext) yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","") returnInfo+="原文:" + yuanwen + "
出现异常地名:**" + keyword + "**!请注意" + "
" + userLog.info(returnInfo) yield returnInfo - logger.info(returnInfo) else: - yield "**未发现发现异常地名**" \ No newline at end of file + yield "**未发现发现异常地名**" + userLog.info("未发现发现异常地名") + outLog.mark_done(user_id, "checkPlaceName") \ No newline at end of file diff --git a/checkRepeatText.py b/checkRepeatText.py index 9b462d9..c8688e7 100644 --- a/checkRepeatText.py +++ b/checkRepeatText.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders import TextLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from qwen_agent.agents import Assistant import json_repair -from paddlenlp import Taskflow +import json embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13") device_id=0 import re @@ -16,41 +16,11 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml import logging import logging.config +import requests +from myLogger import outLog -log_config = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'standard': { - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - }, - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'standard', - 'level': logging.INFO, - }, - 'file': { - 'class': 'logging.FileHandler', - 'filename': 'Logger.log', - 'formatter': 'standard', - 'level': logging.INFO, - }, - }, - 'loggers': { - '': { - 'handlers': ['console', 'file'], - 'level': logging.INFO, - 'propagate': True, - }, - } -} - -logging.config.dictConfig(log_config) - -logger = logging.getLogger("checkRepeatText") - +outLog.logger = logging.getLogger("checkRepeatText") +userLog=None def load_from_xml_v2(baseURI, rels_item_xml): """ Return |_SerializedRelationships| instance loaded with the @@ -110,7 +80,18 @@ def isTitle(paragraph): #寻找标题名称 def findTitleName(docxPath): yield '文档相似性检查----检查是否存在详细设计方案' - document = Document(docxPath) + loopCount = 0 + while True: + loopCount+=1 + 
if(loopCount>=15): + raise Exception("文档读取超时,或文档存在问题无法读取") + break + try: + document = Document(docxPath) + break + except Exception as e: + time.sleep(1) + pass # 逐段读取docx文档的内容 titleWords=[] firstTitle = 0 @@ -161,14 +142,24 @@ def findTitleName(docxPath): runList.append(rsp) data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) - logger.info(parsed_data) if(parsed_data["answer"]=="存在"): yield parsed_data["name"] else: yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较" #获取文档中 详细设计方案 章节的所有内容 def getDocxToText(docxPath,titleName,vector_store_path): - document = Document(docxPath) + loopCount = 0 + while True: + loopCount+=1 + if(loopCount>=15): + raise Exception("文档读取超时,或文档存在问题无法读取") + break + try: + document = Document(docxPath) + break + except Exception as e: + time.sleep(1) + pass # 逐段读取docx文档的内容 levelList=[] words=[] @@ -228,7 +219,9 @@ def getDocxToText(docxPath,titleName,vector_store_path): # @app.route('/checkRepeatText/', methods=['GET']) -def checkRepeatText(filename): +def checkRepeatText(filename,user_id): + global userLog + userLog=outLog.get_queue(user_id,"checkRepeatText") yield "文档相似性检查---启动中...." 
vector_store_path="vector_store"+str(uuid.uuid4()) for titleName in findTitleName(filename): @@ -239,13 +232,11 @@ def checkRepeatText(filename): words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path) except Exception as e: yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败" + userLog.warning(e) + userLog.warning(f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败") + outLog.mark_done(user_id, "checkRepeatText") return # 记录程序开始的时间戳‘ - global device_id - similarity = Taskflow("text_similarity",device_id=3) - # device_id+=1 - # if(device_id>1): - # device_id=0 reslist = [] count = 0 for i in words: @@ -259,12 +250,23 @@ def checkRepeatText(filename): if (textTag.find(tag) >= 0): continue try: - res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]]) + url = "http://0.0.0.0:8192/taskflow/checkRepeatText" + headers = {"Content-Type": "application/json"} + data = { + "data": { + "text": [[i[i.find(':') + 1:], text[text.find(':') + 1:]]], + } + } + r = requests.post(url=url, headers=headers, data=json.dumps(data)) + res = json.loads(r.text) + # res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]]) except Exception as e: - logger.warning("文档相似性检查--发生异常:",e) - logger.warning(i) - logger.warning(text) - if (res[0]["similarity"] > 0.90): + userLog.warning("文档相似性检查--发生异常:") + userLog.warning(e) + userLog.warning(i) + userLog.warning(text) + continue + if (res["result"][0]["similarity"] > 0.90): # 判断重复内容是否被放入 if (len(reslist) > 0): isExist = False @@ -274,19 +276,20 @@ def checkRepeatText(filename): break if not isExist: # reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:],"similarity":res[0]["similarity"]}) - reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]}) + userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"
在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"
以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"】") + reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]}) else: - reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]}) + reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]}) # print(i.split(":")[1] + "\n" + text.split(":")[1]) + userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"
在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"
以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"】") # vectorstore.delete(ids=uuids) shutil.rmtree(vector_store_path) - logger.info("已删除") - logger.info(reslist) resInfo=f"对{titleName}章节,发现相似内容:
" if(len(reslist)>0): for res in reslist: resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find(':')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find(':') + 1:]+"
在**"+res["yuanwen2"][:res["yuanwen2"].find(':')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find(':') + 1:]+"
以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】
" yield resInfo - logger.info(resInfo) else: - yield "未发现相似内容" + yield "**未发现相似内容**" + userLog.info("文档相似性检查----未发现相似内容**") + outLog.mark_done(user_id, "checkRepeatText") \ No newline at end of file diff --git a/checkTitleName.py b/checkTitleName.py index cfba113..7a0c25b 100644 --- a/checkTitleName.py +++ b/checkTitleName.py @@ -1,3 +1,5 @@ +import time + from docx import Document from pprint import pprint from qwen_agent.agents import Assistant @@ -6,7 +8,7 @@ import json_repair import math from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml - +from myLogger import outLog def load_from_xml_v2(baseURI, rels_item_xml): """ @@ -26,41 +28,9 @@ def load_from_xml_v2(baseURI, rels_item_xml): _SerializedRelationships.load_from_xml = load_from_xml_v2 import logging -import logging.config - -log_config = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'standard': { - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - }, - }, - 'handlers': { - 'console': { - 'class': 'logging.StreamHandler', - 'formatter': 'standard', - 'level': logging.INFO, - }, - 'file': { - 'class': 'logging.FileHandler', - 'filename': 'Logger.log', - 'formatter': 'standard', - 'level': logging.INFO, - }, - }, - 'loggers': { - '': { - 'handlers': ['console', 'file'], - 'level': logging.INFO, - 'propagate': True, - }, - } -} -logging.config.dictConfig(log_config) - -logger = logging.getLogger("checkCompanyName") +outLog.logger = logging.getLogger("checkTitleName") +userLog=None llm_cfg = { #'model': 'qwen1.5-72b-chat', 'model':"qwen2-72b-instruct", @@ -113,7 +83,18 @@ def isTitle(paragraph): #获取文档中 详细设计方案 章节的所有内容 def getDocxToTitleName(docxPath): - document = Document(docxPath) + loopCount = 0 + while True: + loopCount+=1 + if(loopCount>=15): + raise Exception("文档读取超时,或文档存在问题无法读取") + break + try: + document = Document(docxPath) + break + except Exception as e: + time.sleep(1) + pass # 
逐段读取docx文档的内容 levelList=[] words=[] @@ -130,9 +111,11 @@ def getDocxToTitleName(docxPath): words.append(text) return words -def checkTitleName(filename): - +def checkTitleName(filename,user_id): + global userLog + userLog=outLog.get_queue(user_id,"checkTitleName") yield '文档结构检查----启动中' + userLog.info("checkTitleName----启动中") with open("ce模板.txt", "r",encoding='utf-8') as f: gettext = f.readlines() count=0 @@ -140,8 +123,10 @@ def checkTitleName(filename): try: word = getDocxToTitleName(filename) except Exception as e: - print(e) - yield "文档无法打开,请检查文档内容" + userLog.warning(e) + yield "文档结构检查----文档无法打开,请检查文档内容" + outLog.mark_done(user_id, "checkTitleName") + userLog.warning("checkTitleName----文档无法打开,请检查文档内容") return for text in gettext: count+=1 @@ -150,24 +135,25 @@ def checkTitleName(filename): ''' xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释" yield f"文档结构检查----结构分析中{count}/{len(gettext)}" + userLog.info(f"checkTitleName----结构分析中{count}/{len(gettext)}") strword = "\n".join(word)+prompt+xushang - # print(strword) messages = [{'role': 'user', 'content': [{'text':strword}]}] runList = [] - cishu = 0 for rsp in bot.run(messages): runList.append(rsp) # print(rsp) data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) - print(parsed_data) if(parsed_data["answer"]=="不存在"): reserr.append(text) + resInfo="文档结构存在异常:
" if(len(reserr)>0): for i in reserr: resInfo+="**"+i.replace('\n','')+"**
" - logger.info(resInfo) + userLog.info(resInfo) yield resInfo else: yield "文档结构未发现异常" + userLog.info("文档结构未发现异常") + outLog.mark_done(user_id, "checkTitleName") diff --git a/main.py b/main.py index 33d1f8d..8e89845 100644 --- a/main.py +++ b/main.py @@ -1,18 +1,21 @@ -from flask import Flask, request, jsonify,Response +from flask import Flask, request, jsonify, Response import os from checkPlaceName import checkPlaceName from checkRepeatText import checkRepeatText from checkCompanyName import checkCompanyName -from checkDocumentError import getDocumentError +from checkDocumentError import checkDocumentError from checkTitleName import checkTitleName from flask_cors import CORS import qwen_agenttext +from myLogger import outLog +import time app = Flask(__name__) cros = CORS(app) UPLOAD_FOLDER = 'uploads' -usableTag=[0,0,0,0,0,0,0,0] if not os.path.exists(UPLOAD_FOLDER): os.makedirs(UPLOAD_FOLDER) + + @app.route('/upload', methods=['POST']) def upload_file(): if 'file' not in request.files: @@ -22,11 +25,13 @@ def upload_file(): return jsonify({"error": "No selected file"}), 400 if file: filename = file.filename - file.save(os.path.join(UPLOAD_FOLDER,filename)) + file.save(os.path.join(UPLOAD_FOLDER, filename)) return jsonify({"message": "File uploaded successfully"}), 200 -@app.route('/stream' ,methods=["GET", "POST"]) + + +@app.route('/stream', methods=["GET", "POST"]) def stream_numbers(): - context= request.args.get('context') + context = request.args.get('context') # def generate_numbers(): # event_id=0 # for number in range(1, 10): @@ -50,22 +55,26 @@ def stream_numbers(): "Access-Control-Allow-Methods": "GET,POST", "Access-Control-Allow-Headers": "x-requested-with,content-type", } - return Response(qwen_agenttext.getxinx(context),headers=headers) + return Response(qwen_agenttext.getxinx(context), headers=headers) + + @app.route('/sse/checkRepeatText', methods=['GET']) def checkRepeatTextWeb(): filename = request.args.get('filename') + userId = 
request.args.get("userId") - def generate_checkRepeatText(filename): - id=0 - try: - for i in checkRepeatText(filename): - yield f"id: {id+1}\n" - yield f"event: checkRepeatText\n" - yield f"data: {i}\n\n" # 发送完成信号 - except Exception as e: - yield f"id: {id+1}\n" + def generate_checkRepeatText(filename,userId): + id = 0 + for i in checkRepeatText(filename,userId): + yield f"id: {id + 1}\n" yield f"event: checkRepeatText\n" - yield f"data: **程序出现异常**\n\n" # 发送完成信号 + yield f"data: {i}\n\n" # 发送完成信号 + # except Exception as e: + + # yield f"id: {id+1}\n" + # yield f"event: checkRepeatText\n" + # yield f"data: **程序出现异常**\n\n" # 发送完成信号 + headers = { "Content-Type": "text/event-stream", "Cache-Control": "no-cache", @@ -74,19 +83,20 @@ def checkRepeatTextWeb(): "Access-Control-Allow-Methods": "GET,POST", "Access-Control-Allow-Headers": "x-requested-with,content-type", } - return Response(generate_checkRepeatText(filename), headers=headers) + return Response(generate_checkRepeatText(filename,userId), headers=headers) @app.route('/sse/checkPlaceName', methods=['GET']) def checkPlaceNameWebSse(): filename = request.args.get('filename') - - def generate_checkPlaceName(filename): - id=0 - for i in checkPlaceName(filename): - yield f"id: {id+1}\n" + userId = request.args.get("userId") + def generate_checkPlaceName(filename,userId): + id = 0 + for i in checkPlaceName(filename,userId): + yield f"id: {id + 1}\n" yield f"event: checkPlaceName\n" yield f"data: {i}\n\n" # 发送完成信号 + headers = { "Content-Type": "text/event-stream", "Cache-Control": "no-cache", @@ -95,14 +105,16 @@ def checkPlaceNameWebSse(): "Access-Control-Allow-Methods": "GET,POST", "Access-Control-Allow-Headers": "x-requested-with,content-type", } - return Response(generate_checkPlaceName(filename), headers=headers) + return Response(generate_checkPlaceName(filename,userId), headers=headers) + + @app.route('/sse/checkCompanyName', methods=['GET']) def checkCompanyNameWebSse(): filename = request.args.get('filename') - 
- def generate_checkCompanyName(filename): + userId = request.args.get("userId") + def generate_checkCompanyName(filename,userId): id = 0 - for i in checkCompanyName(filename): + for i in checkCompanyName(filename,userId): yield f"id: {id + 1}\n" yield f"event: checkCompanyName\n" yield f"data: {i}\n\n" # 发送完成信号 @@ -115,17 +127,18 @@ def checkCompanyNameWebSse(): "Access-Control-Allow-Methods": "GET,POST", "Access-Control-Allow-Headers": "x-requested-with,content-type", } - return Response(generate_checkCompanyName(filename), headers=headers) + return Response(generate_checkCompanyName(filename,userId), headers=headers) + @app.route('/sse/checkDocumentErrorWeb', methods=['GET']) def checkDocumentErrorWebSse(): filename = request.args.get('filename') - - def generate_checkDocumentError(filename): + userId = request.args.get("userId") + def generate_checkDocumentError(filename,userId): id = 0 - for i in getDocumentError(filename): + for i in checkDocumentError(filename,userId): yield f"id: {id + 1}\n" - yield f"event: getDocumentError\n" + yield f"event: checkDocumentError\n" yield f"data: {i}\n\n" # 发送完成信号 headers = { @@ -136,14 +149,16 @@ def checkDocumentErrorWebSse(): "Access-Control-Allow-Methods": "GET,POST", "Access-Control-Allow-Headers": "x-requested-with,content-type", } - return Response(generate_checkDocumentError(filename), headers=headers) + return Response(generate_checkDocumentError(filename,userId), headers=headers) + + @app.route('/sse/checkTitleName', methods=['GET']) def checkTitleNameWebSse(): filename = request.args.get('filename') - - def generate_checkTitleName(filename): + userId = request.args.get("userId") + def generate_checkTitleName(filename,userId): id = 0 - for i in checkTitleName(filename): + for i in checkTitleName(filename,userId): yield f"id: {id + 1}\n" yield f"event: checkTitleName\n" yield f"data: {i}\n\n" # 发送完成信号 @@ -156,6 +171,36 @@ def checkTitleNameWebSse(): "Access-Control-Allow-Methods": "GET,POST", 
"Access-Control-Allow-Headers": "x-requested-with,content-type", } - return Response(generate_checkTitleName(filename), headers=headers) + return Response(generate_checkTitleName(filename,userId), headers=headers) + +@app.route('/sse/getLog', methods=['GET']) +def getlog(): + userId = request.args.get("userId") + def generate_getLog(userId): + time.sleep(1) + id = 0 + while True: + if outLog.is_done(userId): + break + q = outLog.get_queueData(userId) + if q: + id+=1 + text = q.pop(0) + yield f"id: {id}\n" + yield f"event: getlog\n" + yield f"data: {text}\n\n" # 发送完成信号 + yield f"id: {id}\n" + yield f"event: getlog\n" + yield f"data: 任务结束!!!!!\n\n" # 发送完成信号 + outLog.del_queue(userId) + headers = { + "Content-Type": "text/event-stream", + "Cache-Control": "no-cache", + "X-Accel-Buffering": "no", + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Methods": "GET,POST", + "Access-Control-Allow-Headers": "x-requested-with,content-type", + } + return Response(generate_getLog(userId), headers=headers) if __name__ == '__main__': - app.run(host="0.0.0.0",port=80) \ No newline at end of file + app.run(host="0.0.0.0", port=80) diff --git a/myLogger.py b/myLogger.py new file mode 100644 index 0000000..6ea3059 --- /dev/null +++ b/myLogger.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +""" +@author: bingyl123@163.com +@version: 1.0.0 +@file: OutLog.py +@time: 2023/2/23 20:25 +""" +# import logging +# import logging.config +# import re +# import datetime +# import queue +# +# +# class OutLog: +# _instance = None +# logger = None +# +# def __new__(cls): +# if cls._instance is None: +# cls._instance = super(OutLog, cls).__new__(cls) +# cls.logger = logging.getLogger("app") # 默认logger名称为"app" +# cls._instance.queue_dict = {} +# cls._instance.done_dict = {} +# return cls._instance +# +# def get_queue(self, user_id): +# if user_id not in self.queue_dict: +# self.queue_dict[user_id] = [] +# self.done_dict[user_id] = {} # 初始化为未完成的字典 +# return self.queue_dict[user_id] +# +# def 
import datetime
import logging
import logging.config


class OutLog:
    """Singleton that buffers log lines per user so they can be streamed out.

    Producers (long-running tasks) call :meth:`get_queue` to register
    themselves and obtain a :class:`UserLogger`; every message written through
    that logger is appended to the user's list in ``queue_dict`` and forwarded
    to the shared ``logging`` logger. A consumer reads the list via
    :meth:`get_queueData`, polls :meth:`is_done`, and finally calls
    :meth:`del_queue` to drop the user's state.
    """

    _instance = None
    logger = None

    def __new__(cls):
        # Create the single instance lazily; later calls return the same object.
        if cls._instance is None:
            cls._instance = super(OutLog, cls).__new__(cls)
            cls.logger = logging.getLogger("app")  # default logger name is "app"
            cls._instance.queue_dict = {}  # user_id -> list of formatted log lines
            cls._instance.done_dict = {}  # user_id -> {producer_name: finished?}
        return cls._instance

    def get_queue(self, user_id, producer_name):
        """Register ``producer_name`` for ``user_id`` and return a user logger.

        The producer is recorded as *not finished*, so :meth:`is_done` stays
        false until :meth:`mark_done` is called for it. (The previous check
        ``user_id not in self.done_dict`` could never be true at that point,
        so producers were never registered.)
        """
        self.queue_dict.setdefault(user_id, [])
        self.done_dict.setdefault(user_id, {})[producer_name] = False
        return self.UserLogger(user_id)

    def get_queueData(self, user_id):
        """Return the live list of buffered lines for ``user_id``, or ``None``.

        The returned list is the buffer itself, not a copy; callers consume
        entries by removing them from it.
        """
        return self.queue_dict.get(user_id)

    def del_queue(self, user_id):
        """Drop all state for ``user_id`` once every producer has finished.

        Safe to call for an unknown or already-deleted user.
        """
        if self.is_done(user_id):
            self.queue_dict.pop(user_id, None)
            self.done_dict.pop(user_id, None)

    class UserLogger:
        """Logger bound to one user; mirrors messages into that user's buffer."""

        # Level name -> name of the ``logging.Logger`` method that emits it.
        _LOGGER_METHODS = {
            "DEBUG": "debug",
            "INFO": "info",
            "WARNING": "warning",
            "ERROR": "error",
            "CRITICAL": "critical",
        }

        def __init__(self, user_id):
            self.user_id = user_id
            self.logger = OutLog._instance.logger

        def log(self, item: str, level: str):
            """Buffer ``item`` for the user and forward it to the logger."""
            dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            log_entry = f"{dtf}[{level}]: {item}"
            # The consumer may already have deleted the buffer; a late message
            # must not crash the producer, so it is only sent to the logger.
            user_queue = OutLog._instance.queue_dict.get(self.user_id)
            if user_queue is not None:
                user_queue.append(log_entry)
            self._log_to_logger(item, level)

        def _log_to_logger(self, item: str, level: str):
            """Emit ``item`` at ``level``; unknown level names are ignored."""
            method_name = self._LOGGER_METHODS.get(level)
            if method_name is not None:
                getattr(self.logger, method_name)(item)

        def info(self, item: str):
            self.log(item, "INFO")

        def warning(self, item: str):
            self.log(item, "WARNING")

        def debug(self, item: str):
            self.log(item, "DEBUG")

        def error(self, item: str):
            self.log(item, "ERROR")

        def critical(self, item: str):
            self.log(item, "CRITICAL")

    def mark_done(self, user_id, producer_name):
        """Flag ``producer_name`` as finished for ``user_id``.

        Raises:
            KeyError: if ``user_id`` has no registered producers.
        """
        self.done_dict[user_id][producer_name] = True

    def is_done(self, user_id):
        """Return True when every registered producer of ``user_id`` is done.

        A user with no registered producers counts as done (``all`` of an
        empty collection is true).
        """
        return all(self.done_dict.get(user_id, {}).values())


# Logging configuration
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.WARNING,
        },
    },
    'loggers': {
        '': {
            'handlers': ['console', 'file'],
            'level': logging.WARNING,
            'propagate': True,
        },
    }
}

logging.config.dictConfig(log_config)

outLog = OutLog()  # obtain the singleton instance
from bs4 import BeautifulSoup

# Tags treated as section headings.
HEADING_TAGS = ['h1', 'h2', 'h3', "h4", "h5", "h6"]
# Sibling tags whose text is collected as a section's content.
CONTENT_TAGS = ['p', 'div']


def extract_headings(html_content):
    """Parse ``html_content`` and return its headings in document order.

    Each entry is a dict with ``title`` (stripped heading text), ``level``
    (the digit of the ``h1``-``h6`` tag) and ``content`` (text of the ``p`` /
    ``div`` siblings that follow the heading up to the next heading sibling).
    A list is returned rather than a dict keyed by title, so sections that
    share the same title no longer overwrite each other.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    headings = []
    for element in soup.find_all(HEADING_TAGS):
        section = {
            'title': element.get_text(strip=True),
            'level': int(element.name[1]),  # second character of "h1".."h6"
            'content': [],
        }
        # Walk the following siblings until the next heading (or the end).
        sibling = element.find_next_sibling()
        while sibling is not None and sibling.name not in HEADING_TAGS:
            if sibling.name in CONTENT_TAGS:
                section['content'].append(sibling.get_text(strip=False))
            sibling = sibling.find_next_sibling()
        headings.append(section)
    return headings


def print_headings(headings):
    """Print every heading with its level followed by its content lines."""
    for heading in headings:
        print(f"标题: {heading['title']} (级别: {heading['level']})")
        print("内容:")
        for content in heading['content']:
            print(f"  - {content}")
        print()


def main(html_path='D:\\models\\1223.html'):
    """Read the HTML file at ``html_path`` and print its heading outline."""
    with open(html_path, 'r', encoding="utf-8") as file:
        html_content = file.read()
    print_headings(extract_headings(html_content))


if __name__ == '__main__':
    main()