From 6a406ec64e13dde96c7069d763883c1f5c9a3b6c Mon Sep 17 00:00:00 2001 From: zhouhaibin Date: Thu, 17 Oct 2024 16:12:25 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- checkCompanyName.py | 92 ++++---- checkDocumentError.py | 90 ++++---- checkPlaceName.py | 13 +- checkRepeatText.py | 190 ++++++++++------ checkTitleName.py | 85 ++++---- daijian方案.py | 489 +++++++++++++++++++++++++++++++++--------- main.py | 444 ++++++++++++++++++++++---------------- myLogger.py | 169 ++++----------- 8 files changed, 963 insertions(+), 609 deletions(-) diff --git a/checkCompanyName.py b/checkCompanyName.py index 4d2f1fd..1735ff2 100644 --- a/checkCompanyName.py +++ b/checkCompanyName.py @@ -8,9 +8,10 @@ import math from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml import requests -from myLogger import outLog +# from myLogger import outLog import time + def load_from_xml_v2(baseURI, rels_item_xml): """ Return |_SerializedRelationships| instance loaded with the @@ -31,9 +32,9 @@ _SerializedRelationships.load_from_xml = load_from_xml_v2 import logging -outLog.logger = logging.getLogger("checkCompanyName") -userLog=None -prompt =''' +# outLog.logger = logging.getLogger("checkCompanyName") +userLog = None +prompt = ''' .根据上述文本判断,是否为具体的公司或组织名称,你可以使用工具利用互联网查询, 你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校,行业类型,其他]选项中选择答案, 回答格式[{“companyName”:“名称”,"回答":"答案"},{“companyName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; @@ -54,8 +55,8 @@ def getDocxToTextAll(name): docxPath = name loopCount = 0 while True: - loopCount+=1 - if(loopCount>=15): + loopCount += 1 + if (loopCount >= 60): raise Exception("文档读取超时,或文档存在问题无法读取") break try: @@ -76,17 +77,16 @@ def getDocxToTextAll(name): words.append(text) # 将所有段落文本拼接成一个字符串,并用换行符分隔 text = '\n'.join(words) - # userLog.info("checkCompanyName----保存文件") # 将文本写入txt文件 with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file: txt_file.write(text) def companyNameTask(text): - yield "文档公司或组织名称检查---启动中...." - userLog.info("checkCompanyName----启动中....") - batchNum = 20 - sentences = re.split(r'[。\n]', text) + yield "文档公司或组织名称检查---文档解析中...." + userLog.info("文档公司或组织名称检查---任务开始") + batchNum = 5 + sentences = re.split(r'[、,。\n]', text) # 去掉空字符 sentences = [sentence.strip() for sentence in sentences if sentence.strip()] # 计算总字符数 @@ -101,19 +101,19 @@ def companyNameTask(text): # 打印每一份的内容 for i, chunk in enumerate(chunks): yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}" - userLog.info(f"checkCompanyName----文档解析进度:{i + 1}/{num_chunks}") try: - wenBen = ".".join(chunk) - url = "http://0.0.0.0:8191/taskflow/checkPlaceName" + # wenBen = ".".join(chunk) + url = "http://0.0.0.0:8191/taskflow/checkPlaceNameServer" headers = {"Content-Type": "application/json"} data = { "data": { - "text": wenBen, + "text": chunk, + # "text":wenBen } } r = requests.post(url=url, headers=headers, data=json.dumps(data)) res = json.loads(r.text) - # userLog.info(res) + res = res["data"] # print(res) except Exception as e: userLog.warning(chunk) @@ -121,44 +121,52 @@ def companyNameTask(text): userLog.warning(e) return isplace = False - for zuhe in res["result"]: + + # for zuhe in res: + # # 上一个的地名,这一个还是地名,就和上一个相加代替这个 + # if isplace: + # name = placeList[len(placeList) - 1] + # if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns" + # isplace = True + # new_text = zuhe[0].replace("\n", "") + # placeList[len(placeList) - 1] = name + new_text + # continue + # if zuhe[1].find("组织机构类") >= 0: + # isplace = True + # new_text = zuhe[0].replace("\n", "") + # placeList.append(new_text) + # else: + # isplace = False + ##案例[[('目前', 'TIME'), ('江北区历史文化档案馆', 'ORG')], [('宁波国研简直,并且在东软', 'ORG'), ('宁波市北仑区教育局', 'ORG'), ('国研信息', 'ORG'), ('浙江省', 'LOC'), ('宁波市金凤区', 'LOC'), ('金凤区', 'LOC')]] + for zuhe in res: # 上一个的地名,这一个还是地名,就和上一个相加代替这个 - if isplace: - name = placeList[len(placeList) - 1] - if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns" - isplace = True - new_text = zuhe[0].replace("\n", "") - placeList[len(placeList) - 1] = name + new_text - continue - if zuhe[1].find("组织机构类") >= 0: - isplace = True - new_text = zuhe[0].replace("\n", "") - placeList.append(new_text) - else: - isplace = False + for chid in zuhe: + if (chid[1] == "ORG"): + new_text = chid[0].replace("\n", "") + placeList.append(new_text) # 打印总份数 yield "文档公司或组织名称检查---文档解析完成" - userLog.info("checkCompanyName----文档解析完成") placeList = list(dict.fromkeys(placeList)) + userLog.debug(placeList) yield placeList - userLog.info(placeList) -def checkCompanyName(filename,user_id): + +def checkCompanyName(filename, user_id, outLog): yield f"文档公司或组织名称检查---开始处理文档..." global userLog - userLog=outLog.get_queue(user_id, "checkCompanyName") + userLog = outLog.get_queue(user_id, "checkCompanyName") try: getDocxToTextAll(filename) except Exception as e: userLog.warning(e) userLog.warning("文档公司或组织名称检查---文档无法打开,请检查文档内容") - yield "文档公司或组织名称检查---文档无法打开,请检查文档内容" + yield "文档公司或组织名称检查---文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。" outLog.mark_done(user_id, "checkCompanyName") return with open("checkCompanyName.txt", "r", encoding='utf-8') as f: gettext = f.read() yield f"文档公司或组织名称检查---开始解析文档..." # 每次生成一个数字就发送 - userLog.info("checkCompanyName----开始解析文档...") + final_list = "" for item in companyNameTask(gettext): if isinstance(item, str): yield item @@ -174,7 +182,6 @@ def checkCompanyName(filename,user_id): if cishu > 3: cishu = 0 yield "文档公司或组织名称检查---结果生成中" + '.' * cishu - userLog.info(f"checkCompanyName----结果生成中" + '.' * cishu) cishu += 1 data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) @@ -182,14 +189,15 @@ def checkCompanyName(filename,user_id): for place in parsed_data: try: - if place['回答'] == '非泛化的公司或组织名称': + if place['回答'] == '具体的公司或组织名称': + if (place["companyName"] == "北京国研科技咨询有限公司浙江分公司"): + continue error_places.append(place) except Exception as e: userLog.warning(place) userLog.warning(e) userLog.warning("文档公司或组织名称检查---组织提出出错") continue - userLog.info(error_places) returnInfo = "发现异常公司或组织名称
" if len(error_places) > 0: for t in error_places: @@ -199,9 +207,9 @@ def checkCompanyName(filename,user_id): t["yuanwen"] = paragraphs[0] yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "") returnInfo += "原文:" + yuanwen + "
异常公司或组织名称:**" + keyword + "**!请注意" + "
" - userLog.info(returnInfo) + userLog.info("文档公司或组织名称检查---原文:" + yuanwen + "异常公司或组织名称:" + keyword + "!请注意") yield returnInfo else: yield "**未发现异常公司或组织名称**
" - userLog.info("**未发现异常公司或组织名称**
") - outLog.mark_done(user_id, "checkCompanyName") \ No newline at end of file + userLog.info("文档公司或组织名称检查---未发现异常公司或组织名称") + outLog.mark_done(user_id, "checkCompanyName") diff --git a/checkDocumentError.py b/checkDocumentError.py index 33d7ed4..8728136 100644 --- a/checkDocumentError.py +++ b/checkDocumentError.py @@ -8,7 +8,7 @@ import math from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml import requests -from myLogger import outLog +# from myLogger import outLog import time def load_from_xml_v2(baseURI, rels_item_xml): """ @@ -27,9 +27,9 @@ def load_from_xml_v2(baseURI, rels_item_xml): _SerializedRelationships.load_from_xml = load_from_xml_v2 -import logging +# import logging -outLog.logger = logging.getLogger("checkDocumentError") +# outLog.logger = logging.getLogger("checkDocumentError") userLog=None llm_cfg = { # 'model': 'qwen1.5-72b-chat', @@ -40,7 +40,7 @@ llm_cfg = { bot = Assistant(llm=llm_cfg, name='Assistant', # description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' - + system_message="你是一个错别字分析大师" ) # prompt=''' # 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案, @@ -48,25 +48,25 @@ bot = Assistant(llm=llm_cfg, # ''' prompt = ''' 请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出详细的解析,没有错则不用给解析 -回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}],不做过多的解释,严格按回答格式作答; +回答格式请按照以下json格式[{"placeName":"序号值","回答":"答案","解析","解析内容"},{"placeName":"序号值","回答":"答案","解析","解析内容"}],不做过多的解释,严格按回答格式作答; ''' def getDocxToTextAll(name): - userLog.info("checkDocumentError----打开文档") docxPath = name loopCount = 0 - while True: - loopCount+=1 - if(loopCount>=15): - raise Exception("文档读取超时,或文档存在问题无法读取") - break - try: - document = Document(docxPath) - break - except Exception as e: - time.sleep(1) - pass + document = Document(docxPath) + # while True: + # loopCount+=1 + # if(loopCount>=60): + # raise Exception("文档读取超时,或文档存在问题无法读取") + # break + # try: + # document = Document(docxPath) + # break + # except Exception as e: + # time.sleep(1) + # pass # 逐段读取docx文档的内容 words = [] for paragraph in document.paragraphs: @@ -84,23 +84,21 @@ def getDocxToTextAll(name): txt_file.write(text) -def checkDocumentError(filename,user_id): +def checkDocumentError(filename,user_id,outLog): global userLog userLog=outLog.get_queue(user_id,"checkDocumentError") yield f"文档纠错---开始处理文档..." - userLog.info("checkDocumentError----开始处理文档...") try: getDocxToTextAll(filename) except Exception as e: userLog.warning(e) userLog.warning("文档纠错----文档无法打开,请检查文档内容") - yield "文档纠错----文档无法打开,请检查文档内容" + yield "文档纠错----文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。" outLog.mark_done(user_id, "checkDocumentError") return with open("checkDocumentError.txt", "r", encoding='utf-8') as f: gettext = f.read() yield f"文档纠错---开始解析文档..." # 每次生成一个数字就发送 - userLog.info("checkDocumentError----开始解析文档...") final_list = [] for item in documentErrorTask(gettext): if isinstance(item, str): @@ -113,12 +111,11 @@ def checkDocumentError(filename,user_id): yuanwen = i["placeName"].replace("\n", "") jianyi = i["jianyi"].replace("\n", "") resInfo += "原文:" + yuanwen + "
建议:**" + jianyi + "**
" - userLog.info(resInfo) yield resInfo else: yield "**未发现错别字**" - userLog.info("未发现错别字") + userLog.info("文档纠错---未发现错别字") outLog.mark_done(user_id,"checkDocumentError") @@ -129,27 +126,33 @@ def documentErrorTask(text): :param batch_size: 每批处理的字符数 :return: 生成器,每次返回一批文本 """ - yield "文档纠错---启动中...." - userLog.info("checkDocumentError----启动中....") + yield "文档纠错---文档解析中...." + userLog.info("文档纠错---任务开始") batchNum = 20 sentences = re.split(r'[。\n]', text) # 去掉空字符 sentences = [sentence.strip() for sentence in sentences if sentence.strip()] # 计算总字符数 total_chars = len(sentences) - # 计算有多少份 num_chunks = math.ceil(total_chars / batchNum) - # 按batchNum字为一份进行处理 chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)] # 打印每一份的内容 err = [] for i, chunk in enumerate(chunks): yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}" - userLog.info(f"checkDocumentError----文档解析进度:{i + 1}/{num_chunks}") try: - url = "http://0.0.0.0:8190/taskflow/checkDocumentError" + # url = "http://0.0.0.0:8190/taskflow/checkDocumentError" + # headers = {"Content-Type": "application/json"} + # data = { + # "data": { + # "text": chunk, + # } + # } + # r = requests.post(url=url, headers=headers, data=json.dumps(data)) + # res = json.loads(r.text) + url = "http://127.0.0.1:5001/taskflow/checkDocumentError" headers = {"Content-Type": "application/json"} data = { "data": { @@ -158,12 +161,13 @@ def documentErrorTask(text): } r = requests.post(url=url, headers=headers, data=json.dumps(data)) res = json.loads(r.text) - # print(res) except Exception as e: userLog.warning(chunk) - userLog.warning("文档纠错--错别字识别出错\n", e) + userLog.warning("文档纠错--错别字识别出错\n") + userLog.warning(e) continue - lines_with_greeting = [place for place in res["result"] if len(place['errors']) > 0] + lines_with_greeting = [place for place in res["data"] if len(place['errors']) > 0] + userLog.debug(lines_with_greeting) if len(lines_with_greeting) > 0: num = 0 wenti = [] # 记录问题的数组 @@ -173,26 +177,28 @@ def documentErrorTask(text): keyword = t['source'] keyword_list.append(keyword) for item in t["errors"]: - for key, value in item['correction'].items(): - temp_errorWords.append(key) + # for key, value in item['correction'].items(): + # temp_errorWords.append(key) + temp_errorWords.append(item[0]) wenti.append( - "序号:{},原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords))) + # "{}:原文是{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords))) + "{}:原文是{}。问题:当前原文是否存在错别字,只检查错被子,其他不做分析".format(num, keyword)) num += 1 words = "\n".join(wenti) + userLog.debug(words) messages = [{'role': 'user', 'content': [{'text': words + prompt}]}] runList = [] yield f"文档纠错---内容解析中..." # 每次生成一个数字就发送 - userLog.info(f"checkDocumentError----内容解析中...") cishu = 0 for rsp in bot.run(messages): runList.append(rsp) if cishu > 3: cishu = 0 yield "文档纠错---内容解析中" + '.' * cishu - userLog.info(f"checkDocumentError----内容解析中内容解析中" + '.' * cishu) cishu += 1 data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace("\\", "").replace('`', '')) + userLog.debug(parsed_data) resListerr = [] for place in parsed_data: try: @@ -200,14 +206,16 @@ def documentErrorTask(text): place["placeName"] = keyword_list[int(place["placeName"])] place["jianyi"] = place["解析"] resListerr.append(place) + userLog.info("文档纠错---原文:" + place["placeName"] + "
建议:" + place["jianyi"]) except Exception as e: userLog.warning(parsed_data) userLog.warning(place) - userLog.warning("文档纠错--错别字提取出错\n", e) + userLog.warning("文档纠错--错别字提取出错\n") + userLog.warning(e) continue if (len(resListerr) > 0): err.extend(resListerr) # 打印总份数 - yield "文档地名检查---文档解析完成" - userLog.info(err) - yield err + yield "文档纠错---文档解析完成" + userLog.info("文档纠错---任务结束") + yield err \ No newline at end of file diff --git a/checkPlaceName.py b/checkPlaceName.py index 851827d..5c69bc3 100644 --- a/checkPlaceName.py +++ b/checkPlaceName.py @@ -87,7 +87,6 @@ def getDocxToTextAll(docxPath): #得到全文和地名有关的内容 def placeNameTask(text): yield "文档地名检查---启动中...." - userLog.info("checkPlaceName----启动中....") batchNum=20 sentences = re.split(r'[。\n]', text) # 去掉空字符 @@ -104,7 +103,6 @@ def placeNameTask(text): # 打印每一份的内容 for i, chunk in enumerate(chunks): yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}" - userLog.info(f"checkPlaceName----文档解析进度:{i + 1}/{num_chunks}") wenBen=".".join(chunk) try: url = "http://0.0.0.0:8191/taskflow/checkPlaceName" @@ -139,7 +137,6 @@ def placeNameTask(text): isplace = False # 打印总份数 yield "文档地名检查---文档解析完成" - userLog.info("checkPlaceName---文档解析完成") placeList=list(dict.fromkeys(placeList)) yield placeList @@ -175,7 +172,6 @@ def checkPlaceName(filename,user_id): if cishu>3: cishu=0 yield "文档地名检查---结果生成中"+'.'*cishu - userLog.info("checkPlaceName---结果生成中"+'.'*cishu) cishu+=1 data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) @@ -186,12 +182,11 @@ def checkPlaceName(filename,user_id): if place['回答'] == '错误': error_places.append(place) except Exception as e: - userLog.warning(parsed_data) userLog.warning(place) + userLog.warning(parsed_data) userLog.warning("文档地名检查---组织提出出错") userLog.warning(e) continue - userLog.info(error_places) returnInfo = "发现异常地名
" if len(error_places)>0: for t in error_places: @@ -200,9 +195,9 @@ def checkPlaceName(filename,user_id): paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext) yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","") returnInfo+="原文:" + yuanwen + "
出现异常地名:**" + keyword + "**!请注意" + "
" - userLog.info(returnInfo) + userLog.info("文档地名检查---原文:" + yuanwen + "出现异常地名:" + keyword + "!请注意") yield returnInfo else: yield "**未发现发现异常地名**" - userLog.info("未发现发现异常地名") - outLog.mark_done(user_id, "checkPlaceName") \ No newline at end of file + userLog.info("文档地名检查---未发现发现异常地名") + outLog.mark_done(user_id, "checkPlaceName") \ No newline at end of file diff --git a/checkRepeatText.py b/checkRepeatText.py index c8688e7..ee5309e 100644 --- a/checkRepeatText.py +++ b/checkRepeatText.py @@ -7,6 +7,7 @@ from qwen_agent.agents import Assistant import json_repair import json embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13") +# embeddings = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese",model_kwargs={"device":"npu:5"}) device_id=0 import re import time @@ -17,9 +18,9 @@ from docx.opc.oxml import parse_xml import logging import logging.config import requests -from myLogger import outLog +# from myLogger import outLog -outLog.logger = logging.getLogger("checkRepeatText") +# outLog.logger = logging.getLogger("checkRepeatText") userLog=None def load_from_xml_v2(baseURI, rels_item_xml): """ @@ -79,11 +80,10 @@ def isTitle(paragraph): #寻找标题名称 def findTitleName(docxPath): - yield '文档相似性检查----检查是否存在详细设计方案' loopCount = 0 while True: loopCount+=1 - if(loopCount>=15): + if(loopCount>=60): raise Exception("文档读取超时,或文档存在问题无法读取") break try: @@ -95,9 +95,19 @@ def findTitleName(docxPath): # 逐段读取docx文档的内容 titleWords=[] firstTitle = 0 + firstTitleName="" secondTitle = 0 sanjiTitle = 0 + levelText="" + count = 0 + numid =0 + wordContent={} + total = len(document.paragraphs) + addStart = False#是否重新添加 + yield "文档相似性检查----文档内容解析中",str(count),str(total) for paragraph in document.paragraphs: + count+=1 + yield "文档相似性检查----文档内容解析中",str(count),str(total) # 判断该段落的标题级别 # 这里用isTitle()临时代表,具体见下文介绍的方法 text = paragraph.text @@ -109,6 +119,8 @@ def findTitleName(docxPath): if(text.find("附件")>=0): continue titleWords.append("一级标题:".format(firstTitle)+text) + addStart=True + firstTitleName=text elif level=="1": secondTitle+=1 sanjiTitle=0 @@ -118,15 +130,28 @@ def findTitleName(docxPath): sanjiTitle += 1 # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text) # titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text) + ##先判断是不是一级标题 + if addStart: + wordContent[firstTitleName]=[] + addStart=False + if level: + levelText=f"{int(level)+1}级标题-"+text + else: + if(text.startswith("图") or text.startswith("注:")): + continue + if (len(text)>30 and firstTitleName): + numid+=1 + wordContent[firstTitleName].append("{}:".format(levelText)+text) findTitleName_llm_cfg = { #'model': 'qwen1.5-72b-chat', 'model':"qwen2-72b", 'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', } + yield '文档相似性检查----检查是否存在详细设计方案' findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, name='Assistant', - # system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题' + system_message='按照要求选择最合适的,是唯一的' ) prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容 类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择 @@ -142,60 +167,78 @@ def findTitleName(docxPath): runList.append(rsp) data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) - if(parsed_data["answer"]=="存在"): - yield parsed_data["name"] - else: - yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较" + try: + if(parsed_data["answer"]=="存在"): + yield parsed_data["name"],wordContent + else: + yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较" + except Exception as e: + userLog.warning(e) + userLog.warning(data) + userLog.warning(parsed_data) + yield "文档相似性检查----检查遇到问题,请联系管理员" #获取文档中 详细设计方案 章节的所有内容 -def getDocxToText(docxPath,titleName,vector_store_path): - loopCount = 0 - while True: - loopCount+=1 - if(loopCount>=15): - raise Exception("文档读取超时,或文档存在问题无法读取") - break - try: - document = Document(docxPath) - break - except Exception as e: - time.sleep(1) - pass - # 逐段读取docx文档的内容 - levelList=[] +# def getDocxToText(docxPath,titleName,vector_store_path): +def getDocxToText(titleName,wordContent,vector_store_path): + + # loopCount = 0 + # while True: + # loopCount+=1 + # if(loopCount>=15): + # raise Exception("文档读取超时,或文档存在问题无法读取") + # break + # try: + # document = Document(docxPath) + # break + # except Exception as e: + # time.sleep(1) + # pass + # # 逐段读取docx文档的内容 + # levelList=[] words=[] - addStart = False - levelText="" - i = 0 - for paragraph in document.paragraphs: - # 判断该段落的标题级别 - # 这里用isTitle()临时代表,具体见下文介绍的方法 - text = paragraph.text - if text.strip():#非空判断 - if titleName: - level = isTitle(paragraph) - if(addStart and level=="0"): - addStart=False - if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)): - addStart=True - if level: - levelList.append("{}:".format(level)+paragraph.text) - levelText=f"{int(level)+1}级标题-"+text - else: - if addStart: - if(text.startswith("图") or text.startswith("注:")): - continue - if(len(text)>30): - i=i+1 - words.append("{}:".format(levelText)+text) + # addStart = False + # levelText="" + # i = 0 + # count = 0 + # total = len(document.paragraphs) + # yield "文档相似性检查----文档内容解析中",count,total + # for paragraph in document.paragraphs: + # count+=1 + # yield "文档相似性检查----文档内容解析中",count,total + # # 判断该段落的标题级别 + # # 这里用isTitle()临时代表,具体见下文介绍的方法 + # text = paragraph.text + # if text.strip():#非空判断 + # if titleName: + # level = isTitle(paragraph) + # if(addStart and level=="0"): + # addStart=False + # if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)): + # addStart=True + # if level: + # levelList.append("{}:".format(level)+paragraph.text) + # levelText=f"{int(level)+1}级标题-"+text + # else: + # if addStart: + # if(text.startswith("图") or text.startswith("注:")): + # continue + # if(len(text)>30): + # i=i+1 + # words.append("{}:".format(levelText)+text) # 将所有段落文本拼接成一个字符串,并用换行符分隔 + # 遍历字典,查找包含 "标题的" 的键 + for key, value in wordContent.items(): + if (titleName.find(key)>=0 or key.find(titleName)>=0): + words.extend(value) # 将对应的值添加 if len(words)==0: raise Exception("checkRepeatText,获取长度为0") text = '\n'.join(words) - + userLog.info(f"文档相似性检查----需要处理的总数是{len(words)}") # 将文本写入txt文件 with open("checkRepeatText.txt", 'w', ) as txt_file: txt_file.write(text) - time.sleep(3) + time.sleep(1) + yield "文档相似性检查----文档内容转换中",".","." loader = TextLoader(file_path='checkRepeatText.txt') docs = loader.load() # print(docs) @@ -204,44 +247,56 @@ def getDocxToText(docxPath,titleName,vector_store_path): splits = text_splitter.split_documents(docs) uuids = [] + yield "文档相似性检查----文档保存中",".","." + global embeddings + vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings) for i in range(len(splits)): - uuids.append(str(uuid.uuid4())) + uuidStr=str(uuid.uuid4()) + uuids.append(uuidStr) logging.info(f"checkRepeatTextuuidLen{len(uuids)}") - vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings) vectorstore.add_documents(documents=splits, ids=uuids) + yield "文档相似性检查----校验文档是否已经完成保存",".","." while True: time.sleep(0.3) ress = vectorstore.similarity_search(words[0]) if (len(ress) > 0): break - return words,uuids,vectorstore + yield words,uuids,vectorstore # @app.route('/checkRepeatText/', methods=['GET']) -def checkRepeatText(filename,user_id): +def checkRepeatText(filename,user_id,outLog): global userLog userLog=outLog.get_queue(user_id,"checkRepeatText") yield "文档相似性检查---启动中...." + userLog.info("文档相似性检查---任务开始") vector_store_path="vector_store"+str(uuid.uuid4()) for titleName in findTitleName(filename): - yield titleName - if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"): + if(isinstance(titleName ,tuple)): + if(len(titleName)==3): + yield titleName[0]+titleName[1]+"/"+titleName[2] + else: + yield titleName + if(isinstance(titleName ,tuple)): + # try: + yield "文档相似性检查----文档内容转换中" try: - yield "文档相似性检查----文档内容解析中" - words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path) + for words,uuids,vectorstore in getDocxToText(titleName[0],titleName[1],vector_store_path): + if isinstance(words, str): + yield words+uuids+vectorstore except Exception as e: - yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败" + yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。" userLog.warning(e) userLog.warning(f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败") outLog.mark_done(user_id, "checkRepeatText") return - # 记录程序开始的时间戳‘ + # 记录程序开始的时间戳‘ reslist = [] count = 0 for i in words: count += 1 - yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}" + yield f"文档相似性检查--对{titleName[0]}章节,进行文档内容检查中{count}/{len(words)}" result = vectorstore.similarity_search(i) textTag = i.split(":")[0] for content in result: @@ -259,6 +314,7 @@ def checkRepeatText(filename,user_id): } r = requests.post(url=url, headers=headers, data=json.dumps(data)) res = json.loads(r.text) + res=res["data"] # res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]]) except Exception as e: userLog.warning("文档相似性检查--发生异常:") @@ -266,7 +322,7 @@ def checkRepeatText(filename,user_id): userLog.warning(i) userLog.warning(text) continue - if (res["result"][0]["similarity"] > 0.90): + if (res[0]["similarity"] >= 0.96): # 判断重复内容是否被放入 if (len(reslist) > 0): isExist = False @@ -276,15 +332,15 @@ def checkRepeatText(filename,user_id): break if not isExist: # reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:],"similarity":res[0]["similarity"]}) - userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"
在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"
以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"】") - reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]}) + userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"
在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"
以上两段内容相似度:"+'{:.2f}'.format(res[0]["similarity"])+"】") + reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]}) else: - reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]}) + reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]}) # print(i.split(":")[1] + "\n" + text.split(":")[1]) - userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"
在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"
以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"】") + userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"
在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"
以上两段内容相似度:"+'{:.2f}'.format(res[0]["similarity"])+"】") # vectorstore.delete(ids=uuids) shutil.rmtree(vector_store_path) - resInfo=f"对{titleName}章节,发现相似内容:
" + resInfo=f"对{titleName[0]}章节,发现相似内容:
" if(len(reslist)>0): for res in reslist: resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find(':')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find(':') + 1:]+"
在**"+res["yuanwen2"][:res["yuanwen2"].find(':')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find(':') + 1:]+"
以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】
" diff --git a/checkTitleName.py b/checkTitleName.py index 7a0c25b..d2eee5f 100644 --- a/checkTitleName.py +++ b/checkTitleName.py @@ -8,7 +8,9 @@ import json_repair import math from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml -from myLogger import outLog + + +# from myLogger import outLog def load_from_xml_v2(baseURI, rels_item_xml): """ @@ -29,11 +31,11 @@ def load_from_xml_v2(baseURI, rels_item_xml): _SerializedRelationships.load_from_xml = load_from_xml_v2 import logging -outLog.logger = logging.getLogger("checkTitleName") -userLog=None +# outLog.logger = logging.getLogger("checkTitleName") +userLog = None llm_cfg = { - #'model': 'qwen1.5-72b-chat', - 'model':"qwen2-72b-instruct", + # 'model': 'qwen1.5-72b-chat', + 'model': "qwen2-72b-instruct", 'model_server': 'DashScope', # base_url, also known as api_base 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', } @@ -81,12 +83,13 @@ def isTitle(paragraph): # 如果在段落、样式里都没有找到大纲级别,返回None return None -#获取文档中 详细设计方案 章节的所有内容 + +# 获取文档中 详细设计方案 章节的所有内容 def getDocxToTitleName(docxPath): loopCount = 0 while True: - loopCount+=1 - if(loopCount>=15): + loopCount += 1 + if (loopCount >= 60): raise Exception("文档读取超时,或文档存在问题无法读取") break try: @@ -96,64 +99,72 @@ def getDocxToTitleName(docxPath): time.sleep(1) pass # 逐段读取docx文档的内容 - levelList=[] - words=[] + levelList = [] + words = [] addStart = False - levelText="" - i = 0 + levelText = "" + count = 0 + total = len(document.paragraphs) + yield f"文档结构检查----文档内容解析中{str(count)}/{str(total)}" for paragraph in document.paragraphs: + count += 1 + yield f"文档结构检查----文档内容解析中{str(count)}/{str(total)}" # 判断该段落的标题级别 # 这里用isTitle()临时代表,具体见下文介绍的方法 text = paragraph.text - if text.strip():#非空判断 + if text.strip(): # 非空判断 level = isTitle(paragraph) - if level=="0": + if level == "0": words.append(text) - return words + yield words -def checkTitleName(filename,user_id): + +def checkTitleName(filename, user_id, outLog): global userLog - userLog=outLog.get_queue(user_id,"checkTitleName") + userLog = outLog.get_queue(user_id, "checkTitleName") yield '文档结构检查----启动中' - userLog.info("checkTitleName----启动中") - with open("ce模板.txt", "r",encoding='utf-8') as f: + userLog.info("文档结构检查---任务开始") + with open("ce模板.txt", "r", encoding='utf-8') as f: gettext = f.readlines() - count=0 + count = 0 reserr = [] try: - word = getDocxToTitleName(filename) + for i in getDocxToTitleName(filename): + word = i + if (isinstance(word, str)): + yield word + continue except Exception as e: userLog.warning(e) - yield "文档结构检查----文档无法打开,请检查文档内容" - outLog.mark_done(user_id, "checkTitleName") + yield "文档结构检查----文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。" userLog.warning("checkTitleName----文档无法打开,请检查文档内容") + outLog.mark_done(user_id, "checkTitleName") return for text in gettext: - count+=1 + count += 1 prompt = f''' \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在 ''' - xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释" + xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释" yield f"文档结构检查----结构分析中{count}/{len(gettext)}" - userLog.info(f"checkTitleName----结构分析中{count}/{len(gettext)}") - strword = "\n".join(word)+prompt+xushang - messages = [{'role': 'user', 'content': [{'text':strword}]}] + strword = "\n".join(word) + prompt + xushang + messages = [{'role': 'user', 'content': [{'text': strword}]}] runList = [] for rsp in bot.run(messages): runList.append(rsp) # print(rsp) data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) - if(parsed_data["answer"]=="不存在"): + if (parsed_data["answer"] == "不存在"): reserr.append(text) - - resInfo="文档结构存在异常:
" - if(len(reserr)>0): + userLog.info("文档结构检查----文档结构存在异常:" + text.replace('\n', '')) + resInfo = "文档结构存在异常:
" + if (len(reserr) > 0): for i in reserr: - resInfo+="**"+i.replace('\n','')+"**
" - userLog.info(resInfo) + resInfo += "**" + i.replace('\n', '') + "**
" + yield resInfo else: - yield "文档结构未发现异常" - userLog.info("文档结构未发现异常") - outLog.mark_done(user_id, "checkTitleName") + yield "**文档结构未发现异常**" + userLog.info("文档结构检查----文档结构未发现异常") + outLog.mark_done(user_id, "checkTitleName") diff --git a/daijian方案.py b/daijian方案.py index 19badae..5210e54 100644 --- a/daijian方案.py +++ b/daijian方案.py @@ -1,11 +1,24 @@ -from docx import Document -from pprint import pprint +import uuid +from langchain_community.embeddings import DashScopeEmbeddings +from langchain_community.document_loaders import TextLoader +from langchain_text_splitters import RecursiveCharacterTextSplitter from qwen_agent.agents import Assistant -import re import json_repair -import math +import json +embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13") +device_id=0 +import re +import time +from docx import Document +import shutil from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml +import logging +import logging.config +import requests +from collections import defaultdict + +userLog=None def load_from_xml_v2(baseURI, rels_item_xml): """ Return |_SerializedRelationships| instance loaded with the @@ -23,17 +36,6 @@ def load_from_xml_v2(baseURI, rels_item_xml): _SerializedRelationships.load_from_xml = load_from_xml_v2 -llm_cfg = { - #'model': 'qwen1.5-72b-chat', - 'model':"qwen2-72b-instruct", - 'model_server': 'DashScope', # base_url, also known as api_base - 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', -} -bot = Assistant(llm=llm_cfg, - name='Assistant', - ) - - # 记录程序开始的时间戳 def getOutlineLevel(inputXml): """ @@ -73,15 +75,26 @@ def isTitle(paragraph): # 如果在段落、样式里都没有找到大纲级别,返回None return None -#获取文档中 详细设计方案 章节的所有内容 -def getDocxToTitleName(docxPath): - document = Document(docxPath) +#寻找标题名称 +def findTitleName(docxPath): + yield '文档相似性检查----检查是否存在详细设计方案' + loopCount = 0 + while True: + loopCount+=1 + if(loopCount>=15): + raise Exception("文档读取超时,或文档存在问题无法读取") + break + try: + document = Document(docxPath) + break + except Exception as e: + time.sleep(1) + pass # 逐段读取docx文档的内容 - levelList=[] - words=[] - addStart = False - levelText="" - i = 0 + titleWords=[] + firstTitle = 0 + secondTitle = 0 + sanjiTitle = 0 for paragraph in document.paragraphs: # 判断该段落的标题级别 # 这里用isTitle()临时代表,具体见下文介绍的方法 @@ -89,88 +102,360 @@ def getDocxToTitleName(docxPath): if text.strip():#非空判断 level = isTitle(paragraph) if level=="0": - words.append(text) - return words - -def checkTitleName(filename): - prompt = f''' - \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在 - ''' - xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释" - yield f"文档结构检查----结构分析中{count}/{len(gettext)}" - strword = "\n".join(word) + prompt + xushang - # print(strword) - messages = [{'role': 'user', 'content': [{'text': strword}]}] - runList = [] - cishu = 0 - for rsp in bot.run(messages): + firstTitle+=1 + secondTitle = 0 + if(text.find("附件")>=0): + continue + titleWords.append("一级标题:".format(firstTitle)+text) + elif level=="1": + secondTitle+=1 + sanjiTitle=0 + # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text) + # titleWords.append("第{}章的二级标题:".format(firstTitle,firstTitle,secondTitle)+text) + elif level=="2": + sanjiTitle += 1 + # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text) + # titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text) + findTitleName_llm_cfg = { + #'model': 'qwen1.5-72b-chat', + 'model':"qwen2-72b", + 'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base + # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', + } + findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, + name='Assistant', + # system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题' + ) + prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容 + 类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择 + 类似设计方案,服务方案,建设方案为次相关,次级选择 + 类似方案是最后选择 + 按照这样的顺序选择最合适的 + 你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答 + ''' + # print("\n".join(titleWords)+prompt) + messages = [({'role': 'user', 'content': "\n".join(titleWords)+prompt})] + runList=[] + for rsp in findTitleName_bot.run(messages): runList.append(rsp) - # print(rsp) data = runList[len(runList) - 1][0]["content"] parsed_data = json_repair.loads(data.replace('`', '')) - print(parsed_data) - # yield '文档结构检查----启动中' - # with open("ce模板.txt", "r",encoding='utf-8') as f: - # gettext = f.readlines() - # count=0 - # reserr = [] - # try: - # word = getDocxToTitleName(filename) - # except Exception as e: - # print(e) - # yield "文档无法打开,请检查文档内容" - # return - # for text in gettext: - # count+=1 - # prompt = f''' - # \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在 - # ''' - # xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释" - # yield f"文档结构检查----结构分析中{count}/{len(gettext)}" - # strword = "\n".join(word)+prompt+xushang - # # print(strword) - # messages = [{'role': 'user', 'content': [{'text':strword}]}] - # runList = [] - # cishu = 0 - # for rsp in bot.run(messages): - # runList.append(rsp) - # # print(rsp) - # data = runList[len(runList) - 1][0]["content"] - # parsed_data = json_repair.loads(data.replace('`', '')) - # print(parsed_data) - # if(parsed_data["answer"]=="不存在"): - # reserr.append(text) - # resInfo="文档结构存在异常:
" - # if(len(reserr)>0): - # for i in reserr: - # resInfo+=f"**{i}**
" - # yield resInfo - # else: - # yield "文档结构未发现异常" + if(parsed_data["answer"]=="存在"): + yield parsed_data["name"] + else: + yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较" +def merge_chapters(words): + merged_text = {} + for line in words: + if ":" in line: + key, value = line.split(":", 1) # 根据第一个冒号分割 + if key in merged_text: + merged_text[key].append(value.strip()) # 添加到列表 + else: + merged_text[key] = [value.strip()] # 初始化列表 + else: + logging.warning(f"Skipping line without key-value pair: {line}") -import logging + # 合并结果格式化为列表输出 + merged_words = [] + for key, values in merged_text.items(): + combined_value = ",".join(values) # 将内容合并 + merged_words.append(f"{key}:{combined_value}") + return merged_words +#获取文档中 详细设计方案 章节的所有内容 +def getDocxToText(docxPath, titleName, vector_store_path): + loopCount = 0 + while True: + loopCount += 1 + if loopCount >= 15: + raise Exception("文档读取超时,或文档存在问题无法读取") + break + try: + document = Document(docxPath) + break + except Exception as e: + time.sleep(1) + pass + + # 逐段读取docx文档的内容 + levelList = [] + words = [] + addStart = False + title_counter = [] # 用于存储当前标题的计数 + title_texts = [] # 用于存储当前各级标题的文本 + i = 0 + + for paragraph in document.paragraphs: + text = paragraph.text.strip() + if text: # 非空判断 + level = isTitle(paragraph) # 确保这个函数在代码中定义 + + # 当前标题的层级 + current_level = int(level) if level is not None else -1 + + if current_level >= 0: # 标题段落 + # 确保标题计数器足够长 + while len(title_counter) <= current_level: + title_counter.append(0) # 初始化新级别的标题计数 + title_texts.append('') # 初始化对应的标题文本 + + # 更新当前级别及以下的标题计数和标题文本 + title_counter[current_level] += 1 # 当前级别计数加1 + title_counter = title_counter[:current_level+1] + title_texts[current_level] = text # 保存当前级别的标题文本 + title_texts = title_texts[:current_level+1] + + # 重置更低级别的计数和标题文本 + for idx in range(current_level + 1, len(title_counter)): + title_counter[idx] = 0 + title_texts[idx] = '' + + # 检查是否与 titleName 匹配 + if current_level == 0: + addStart = titleName in text # 检查是否与 titleName 匹配 + + else: # 非标题段落 + if addStart: + if len(text) > 30: # 仅记录长度大于30的内容 + i += 1 + # 获取当前完整的标题编号和标题名称 + levelText = ".".join(map(str, title_counter)) + # 使用非空的标题名称 + current_title = title_texts[-1] if title_texts else '' + words.append(f"{levelText}-{current_title}:{text}") + + if len(words) == 0: + raise Exception("checkRepeatText,获取长度为0") + + # 使用封装的合并函数 + merged_words = merge_chapters(words) + + # 将合并后的内容写入 txt 文件 + with open("checkRepeatText.txt", 'w') as txt_file: + for line in merged_words: + txt_file.write(f"{line}\n") + + time.sleep(3) + + # 加载文本 + loader = TextLoader(file_path='checkRepeatText.txt') + docs = loader.load() + + # 创建唯一标识符 + uuids = [] + for _ in range(len(merged_words)): + uuids.append(str(uuid.uuid4())) + logging.info(f"checkRepeatTextuuidLen{len(uuids)}") + + return merged_words, uuids + + +# @app.route('/checkRepeatText/', methods=['GET']) +def checkRepeatText(filename): + yield "文档相似性检查---启动中...." + vector_store_path="vector_store"+str(uuid.uuid4()) + for titleName in findTitleName(filename): + yield titleName + if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"): + yield "文档相似性检查----文档内容解析中" + words,uuids=getDocxToText(filename,titleName,vector_store_path) + # 记录程序开始的时间戳‘ + reslist = [] + count = 0 + standard = { + "清晰性": """对软件功能描述的完整性主要体现在以下两个方面: + a. 功能描述是否简洁明了,避免使用过于复杂或专业的术语,使得用户能够轻松理解。 + b. 是否明确指出了功能的具体作用,没有模糊不清或含糊其辞的表述。 + 如果要将软件功能描述的清晰性划分为优秀、良好、一般、差四个从高到低的等级,每个等级的评判标准是什么? + 将软件功能描述的清晰性划分为优秀、良好、一般、差四个等级时,每个等级的评判标准可以如下定义: + 优秀(90~100分) + 简洁明了:功能描述极其精炼,没有多余的词汇,每个字都承载着必要的信息。 + 通俗易懂:完全避免了专业术语或行业黑话,即使是非专业用户也能轻松理解。 + 具体明确:功能的作用、范围、限制以及用户期望的结果都被清晰、准确地阐述,没有任何模糊或含糊的表述。 + 良好(70分~90分,不包含90分) + 较为简洁:功能描述相对简短,但可能包含一些必要的细节或背景信息。 + 易于理解:大部分术语都是通俗易懂的,对于少数专业术语,提供了简短的解释或上下文。 + 明确具体:功能的主要作用、范围和用户期望的结果都被明确阐述,但可能在某些细节上稍显模糊。 + 一般(60~70分,不包含70分) + 稍显冗长:功能描述可能包含一些不必要的细节或重复信息,导致用户需要花费更多时间来理解。 + 有一定难度:使用了一些专业术语或行业黑话,但没有提供足够的解释或上下文,导致非专业用户可能难以理解。 + 基本明确:功能的主要作用被阐述,但在范围、限制或用户期望的结果上可能存在一些模糊或含糊的表述。 + 差(60分以下,不包含60分) + 冗长复杂:功能描述过于详细和复杂,包含大量不必要的细节和背景信息,导致用户难以抓住重点。 + 难以理解:大量使用专业术语或行业黑话,且没有提供任何解释或上下文,使得大部分用户都难以理解。 + 模糊不清:功能的作用、范围、限制以及用户期望的结果都没有被明确阐述,存在大量的模糊和含糊表述。 + 评估的提示词举例: + 根据这些评判标准,对下面的软件功能描述的清晰性进行客观的评价,给出优秀、良好、一般、差四个等级之一的评价,并给出具体得分。并在此基础上润色和完善,使之达到优秀的等级。 + """, + "完整性": """对软件功能描述的完整性主要体现在以下两个方面: + a. 是否涵盖了功能的所有重要方面,包括输入、输出、处理过程等。 + b. 是否提供了足够的信息,以便用户能够全面了解功能的工作原理和用途。 + 如果要将软件功能描述的完整性划分为优秀、良好、一般、差四个从高到低的等级,每个等级的评判标准是什么? + 将软件功能描述的完整性划分为优秀、良好、一般、差四个等级时,每个等级的评判标准可以如下定义: + 优秀:(90~100分) + 描述全面涵盖了功能的所有重要方面,包括但不限于输入、输出、处理过程、异常处理等。 + 提供了详尽的信息,用户能够清晰地了解功能的工作原理、用途以及在不同场景下的表现。 + 包含了必要的示例、图表或流程图,以直观展示功能的工作流程和效果。 + 没有遗漏任何对用户理解和使用功能至关重要的信息。 + 良好:(70分~90分,不包含90分) + 描述基本涵盖了功能的主要方面,但可能有个别不太重要的细节未提及。 + 提供了足够的信息,用户能够较好地理解功能的工作原理和用途,但在某些复杂场景下可能需要额外说明。 + 可能包含一些示例或图表,但可能不如优秀等级那么全面或详细。 + 一般:(60~70分,不包含70分) + 描述涵盖了功能的一部分重要方面,但存在较明显的遗漏或不足。 + 提供的信息有限,用户可能只能对功能有一个大致的了解,无法深入了解其工作原理和详细用途。 + 可能缺乏示例、图表或流程图等辅助材料,导致用户难以理解功能的某些复杂部分。 + 差:(60分以下,不包含60分) + 描述严重缺失,未涵盖功能的关键方面,甚至可能误导用户。 + 提供的信息极少,用户无法全面了解功能的工作原理和用途。 + 可能存在错误或矛盾的信息,导致用户无法准确理解功能。 + 根据这些评判标准,对下面的软件功能描述的完整性进行客观的评价,给出优秀、良好、一般、差四个等级之一的评价。并在此基础上润色和完善,使之达到优秀的等级。 + """, + "可测试性": """软件功能描述的可测试性主要体现为以下方面: + a. 功能描述是否具体、明确,以便能够进行功能测试和验证。 + b. 是否提供了足够的细节,以便开发人员和测试人员能够准确理解和实现功能。 + 如果要将软件功能描述的可测试性划分为优秀、良好、一般、差四个从高到低的等级,每个等级的评判标准是什么? + 将软件功能描述的可测试性划分为优秀、良好、一般、差四个等级时,每个等级的评判标准可以如下定义: + 优秀:(90~100分) + 功能描述非常具体和明确,能够直接转化为测试用例。 + 提供了详尽的细节,包括输入、输出、边界条件、异常处理等。 + 开发人员和测试人员能够轻松理解和实现功能,无需额外澄清或假设。 + 功能描述中包含了预期的行为和非预期的行为,有助于全面覆盖测试场景。 + 良好:(70分~90分,不包含90分) + 功能描述相对具体和明确,大部分内容可以直接用于测试。 + 提供了足够的细节,但可能需要一些额外的解释或澄清才能完全理解。 + 开发人员和测试人员能够基于描述实现和测试功能,但可能需要一些额外的沟通和协调。 + 功能描述中基本涵盖了主要的行为和边界条件,但可能缺少对某些异常情况的详细描述。 + 一般:(60~70分,不包含70分) + 功能描述较为笼统,需要较多的解释和澄清才能用于测试和开发。 + 细节不够充分,可能导致开发人员和测试人员在实现和测试过程中产生误解或遗漏。 + 需要较多的沟通和协调来确保功能的正确实现和测试。 + 功能描述中可能只涵盖了主要的行为,对边界条件和异常情况的描述较为模糊或缺失。 + 差:(60分以下,不包含60分) + 功能描述非常模糊和笼统,无法直接用于测试和开发。 + 缺乏必要的细节,导致开发人员和测试人员无法准确理解和实现功能。 + 需要大量的沟通和协调,甚至可能需要重新编写功能描述才能进行有效的测试和开发。 + 功能描述中可能只提到了大致的目标或意图,没有具体的行为描述、边界条件或异常处理。 + 根据这些评判标准,对下面的软件功能描述的可测试性进行客观的评价,给出优秀、良好、一般、差四个等级之一的评价。并在此基础上润色和完善,使之达到优秀的等级。 + """, + "详细性": """软件功能详细性主要体现在: + a. 功能描述是否详细,可以根据功能描述进行功能点评价,计算出ILF、EIF、EI、EO、EQ的数量; + 如果要将软件功能描述的详细性划分为优秀、良好、一般、差四个从高到低的等级,每个等级的评判标准是什么? + 将软件功能描述的详细性划分为优秀、良好、一般、差四个等级时,每个等级的评判标准可以如下定义: + 优秀:(90~100分) + 功能描述非常详尽,包含了所有必要的信息,使得评估者能够轻松地根据描述进行功能点评价。 + ILF、EIF、EI、EO、EQ的数量可以明确且无误地计算出来,没有遗漏或模糊之处。 + 描述中不仅包含了功能的正常操作,还涵盖了异常处理、边界条件等特殊情况。 + 使用了具体的例子、流程图或伪代码来进一步阐明功能。 + 良好:(70分~90分,不包含90分) + 功能描述相对详细,提供了足够的信息来进行功能点评价。 + ILF、EIF、EI、EO、EQ的数量可以大致计算出来,但可能需要一些额外的解释或澄清。 + 描述中基本涵盖了功能的各个方面,但对某些细节或特殊情况可能描述不够充分。 + 整体而言,描述是清晰和准确的,但还有改进的空间。 + 一般:(60~70分,不包含70分) + 功能描述较为笼统,缺乏具体的细节。 + ILF、EIF、EI、EO、EQ的数量计算可能存在一定的困难或不确定性,需要较多的假设或推测。 + 描述中只涵盖了功能的主要方面,对细节和特殊情况的处理描述不足。 + 可能需要额外的沟通或澄清才能准确理解功能需求。 + 差:(60分以下,不包含60分) + 功能描述非常模糊,缺乏必要的信息和细节。 + 无法根据描述进行准确的功能点评价,ILF、EIF、EI、EO、EQ的数量无法确定。 + 描述中可能只提到了功能的大致目标或意图,没有具体的实现细节或操作步骤。 + 需要大量的额外信息或澄清才能理解功能需求,甚至可能需要重新编写功能描述。 + 根据这些评判标准,对下面的软件功能描述的详细性进行客观的评价,给出优秀、良好、一般、差四个等级之一的评价。并在此基础上润色和完善,使之达到优秀的等级。 + """, + } + weight = { + "清晰性" : 0.4, + "完整性" : 0.3, + "可测试性" : 0.2, + "详细性" : 0.1, + + } + + findTitleName_llm_cfg = { + 'model': "qwen2-72b", + 'model_server': 'http://127.0.0.1:1025/v1', + } + findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, name='Assistant') + for i in words: + count += 1 + yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}" + chapter, rest = i.split('-', 1) + title, text = rest.split(':', 1) + + # 生成字典 + example = { + "chapter": chapter.strip(), + "title": title.strip(), + "text": text.strip() + } + result = { + "title": title.strip(), + "text": text.strip() + } + # 循环提取键和值 + weighted_score = 0 + for key, value in standard.items(): + prompt_score = f"""对软件功能{key}的定义: + {value} + 模块名称:【{example['title']}】 + 模块描述:【{example['text']}】 + 回答格式为:{{"模块名称":"{example['text']}", + "等级":"优秀/良好/一般/差", + "得分":"0~100", + "理由及扣分原因":"理由及扣分原因", + }},不做过多的解释,严格按回答格式作答,只给出一个回答。 + """ + + messages = [({'role': 'user', 'content': prompt_score})] + runList = [] + for rsp in findTitleName_bot.run(messages): + runList.append(rsp) + data = runList[len(runList) - 1][0]["content"] + parsed_data = json_repair.loads(data.replace('`', '')) + if isinstance(parsed_data, list): # 检查parsed_data是否为列表 + parsed_data = parsed_data[0] # 取第一个元素 + else: + parsed_data = parsed_data + result[f"{key}等级"] = parsed_data['等级'] + result[f"{key}得分"] = parsed_data['得分'] + score = int(parsed_data['得分']) # 假设 '得分' 是字符串,需要转换为整数 + key_weight = weight.get(key, 0) # 根据键获取权重,如果没有匹配的权重,默认为 0 + # 计算加权得分并累加 + weighted_score += score * key_weight + result["加权得分"] = round(weighted_score, 2) # 保留两位小数 + answer = f"{example['text']}" + for key, value in standard.items(): + prompt_answer = f"""对软件功能{key}的定义:\n + {value}\n + 模块名称:【{example['title']}】\n + 模块描述:f【{answer}】\n + 回答格式为:{{"模块名称":"{example['text']}", + "改进后的描述":"改进后的描述", + }},不做过多的解释,严格按回答格式作答。 + """ + messages = [({'role': 'user', 'content': prompt_answer})] + runList = [] + for rsp in findTitleName_bot.run(messages): + runList.append(rsp) + data = runList[len(runList) - 1][0]["content"] + parsed_data = json_repair.loads(data.replace('`', '')) + answer = parsed_data['改进后的描述'] + result["改进后的描述"] = answer + textTag = i.split(":")[0] + breakpoint() + # vectorstore.delete(ids=uuids) + shutil.rmtree(vector_store_path) + resInfo=f"对{titleName}章节,发现相似内容:
" + if(len(reslist)>0): + for res in reslist: + resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find(':')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find(':') + 1:]+"
在**"+res["yuanwen2"][:res["yuanwen2"].find(':')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find(':') + 1:]+"
以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】
" + yield resInfo + else: + yield "**未发现相似内容**" + userLog.info("文档相似性检查----未发现相似内容**") -# 创建一个记录器 -logger = logging.getLogger('my_logger') -logger.setLevel(logging.DEBUG) - -# 创建一个处理器 -ch = logging.StreamHandler() -ch.setLevel(logging.DEBUG) - -# 创建一个格式化器并将其添加到处理器中 -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') -ch.setFormatter(formatter) - -# 将处理器添加到记录器中 -logger.addHandler(ch) -try: -# 记录一些日志消息 - logger.debug('这是一个调试消息') - logger.info('这是一个信息消息') - logger.warning('这是一个警告消息') - logger.error('这是一个错误消息') - logger.critical('这是一个致命错误消息') -except Exception as e: - logger.warning(e) \ No newline at end of file +for i in checkRepeatText("./北仑区综合行政执法局协同监管系统项目建设方案_20240824.docx"): + print(i) diff --git a/main.py b/main.py index 8e89845..9a11197 100644 --- a/main.py +++ b/main.py @@ -1,206 +1,286 @@ -from flask import Flask, request, jsonify, Response +# from flask import Flask, request, jsonify, Response import os from checkPlaceName import checkPlaceName from checkRepeatText import checkRepeatText from checkCompanyName import checkCompanyName from checkDocumentError import checkDocumentError from checkTitleName import checkTitleName -from flask_cors import CORS +# from flask_cors import CORS import qwen_agenttext from myLogger import outLog import time -app = Flask(__name__) -cros = CORS(app) +# app = Flask(__name__) +# cros = CORS(app) +import uvicorn +from fastapi import FastAPI, Request, File, UploadFile, HTTPException +from fastapi.responses import JSONResponse +from fastapi.middleware.cors import CORSMiddleware +from sse_starlette.sse import EventSourceResponse +import asyncio + +app = FastAPI() +# 允许所有来源的跨域请求 +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"] +) + UPLOAD_FOLDER = 'uploads' if not os.path.exists(UPLOAD_FOLDER): os.makedirs(UPLOAD_FOLDER) -@app.route('/upload', methods=['POST']) -def upload_file(): - if 'file' not in request.files: - return jsonify({"error": "No file part"}), 400 - file = request.files['file'] - if file.filename == '': - return jsonify({"error": "No selected file"}), 400 - if file: - filename = file.filename - file.save(os.path.join(UPLOAD_FOLDER, filename)) - return jsonify({"message": "File uploaded successfully"}), 200 - - -@app.route('/stream', methods=["GET", "POST"]) -def stream_numbers(): - context = request.args.get('context') - # def generate_numbers(): - # event_id=0 - # for number in range(1, 10): - # json_data = json.dumps({"number": number}) - # print(json_data) - # event_id += 1 - # yield f"id: {event_id}\n" - # yield f"event: time-update\n" - # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送 - # time.sleep(0.5) # 为了演示,加入短暂延迟 - # json_data = json.dumps({"number": "done"}) - # yield f"id: {1}\n" - # yield f"event: time-update\n" - # yield f"data: {json_data}\n\n" # 发送完成信号 - - headers = { - "Content-Type": "text/event-stream", - "Cache-Control": "no-cache", - "X-Accel-Buffering": "no", - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET,POST", - "Access-Control-Allow-Headers": "x-requested-with,content-type", - } - return Response(qwen_agenttext.getxinx(context), headers=headers) - - -@app.route('/sse/checkRepeatText', methods=['GET']) -def checkRepeatTextWeb(): - filename = request.args.get('filename') - userId = request.args.get("userId") - - def generate_checkRepeatText(filename,userId): +# @app.route('/upload', methods=['POST']) +# def upload_file(): +# if 'file' not in request.files: +# return jsonify({"error": "No file part"}), 400 +# file = request.files['file'] +# if file.filename == '': +# return jsonify({"error": "No selected file"}), 400 +# if file: +# filename = file.filename +# file.save(os.path.join(UPLOAD_FOLDER, filename)) +# return jsonify({"message": "File uploaded successfully"}), 200 +@app.post("/sse/upload") +async def upload_file(file: UploadFile = File(...)): + if not file.filename: + raise HTTPException(status_code=400, detail="No selected file") + + # 保存文件 + try: + file_location = os.path.join(UPLOAD_FOLDER, file.filename) + with open(file_location, "wb") as f: + content = await file.read() + f.write(content) + return JSONResponse(content={"message": "文件上传成功"}, status_code=200) + except Exception as e: + raise HTTPException(status_code=500, detail="文件上传失败,错误信息:" + str(e)) + + +@app.get("/sse") +async def root(request: Request): + async def event_generator(request: Request): + res_str = "七夕情人节即将来临,我们为您准备了精美的鲜花和美味的蛋糕" + for i in res_str: + if await request.is_disconnected(): + print("连接已中断") + break + yield { + "event": "message", + "id": "7", + "data": f"{i}" + } + + await asyncio.sleep(0.1) + + g = event_generator(request) + return EventSourceResponse(g) + + +# def stream_numbers(): +# context = request.args.get('context') +# # def generate_numbers(): +# # event_id=0 +# # for number in range(1, 10): +# # json_data = json.dumps({"number": number}) +# # print(json_data) +# # event_id += 1 +# # yield f"id: {event_id}\n" +# # yield f"event: time-update\n" +# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送 +# # time.sleep(0.5) # 为了演示,加入短暂延迟 +# # json_data = json.dumps({"number": "done"}) +# # yield f"id: {1}\n" +# # yield f"event: time-update\n" +# # yield f"data: {json_data}\n\n" # 发送完成信号 + +# headers = { +# "Content-Type": "text/event-stream", +# "Cache-Control": "no-cache", +# "X-Accel-Buffering": "no", +# "Access-Control-Allow-Origin": "*", +# "Access-Control-Allow-Methods": "GET,POST", +# "Access-Control-Allow-Headers": "x-requested-with,content-type", +# } +# return Response(qwen_agenttext.getxinx(context), headers=headers) + +@app.get("/sse/checkRepeatText") +async def checkRepeatTextWeb(filename, userId, request: Request): + async def generate_checkRepeatText(filename, userId, request: Request): + global outLog id = 0 - for i in checkRepeatText(filename,userId): - yield f"id: {id + 1}\n" - yield f"event: checkRepeatText\n" - yield f"data: {i}\n\n" # 发送完成信号 - # except Exception as e: - - # yield f"id: {id+1}\n" - # yield f"event: checkRepeatText\n" - # yield f"data: **程序出现异常**\n\n" # 发送完成信号 - - headers = { - "Content-Type": "text/event-stream", - "Cache-Control": "no-cache", - "X-Accel-Buffering": "no", - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET,POST", - "Access-Control-Allow-Headers": "x-requested-with,content-type", - } - return Response(generate_checkRepeatText(filename,userId), headers=headers) - - -@app.route('/sse/checkPlaceName', methods=['GET']) -def checkPlaceNameWebSse(): - filename = request.args.get('filename') - userId = request.args.get("userId") - def generate_checkPlaceName(filename,userId): + for i in checkRepeatText(filename, userId, outLog): + id += 1 + if await request.is_disconnected(): + yield { + "id": f"{id}", + "event": "checkRepeatText", + "data": "checkRepeatText连接已中断" + } + break + yield { + "id": f"{id}", + "event": "checkRepeatText", + "data": i + } + + g = generate_checkRepeatText(filename, userId, request) + return EventSourceResponse(g) + + +@app.get('/sse/checkPlaceName') +def checkPlaceNameWebSse(filename, userId, request: Request): + async def generate_checkPlaceName(filename, userId, request: Request): id = 0 - for i in checkPlaceName(filename,userId): - yield f"id: {id + 1}\n" - yield f"event: checkPlaceName\n" - yield f"data: {i}\n\n" # 发送完成信号 - - headers = { - "Content-Type": "text/event-stream", - "Cache-Control": "no-cache", - "X-Accel-Buffering": "no", - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET,POST", - "Access-Control-Allow-Headers": "x-requested-with,content-type", - } - return Response(generate_checkPlaceName(filename,userId), headers=headers) - - -@app.route('/sse/checkCompanyName', methods=['GET']) -def checkCompanyNameWebSse(): - filename = request.args.get('filename') - userId = request.args.get("userId") - def generate_checkCompanyName(filename,userId): + global outLog + for i in checkPlaceName(filename, userId, outLog): + id += 1 + if await request.is_disconnected(): + yield { + "id": f"{id}", + "event": "checkPlaceName", + "data": "checkPlaceName连接已中断" + } + break + yield { + "id": f"{id}", + "event": "checkPlaceName", + "data": i + } + + g = generate_checkPlaceName(filename, userId, request) + return EventSourceResponse(g) + + +@app.get('/sse/checkCompanyName') +def checkCompanyNameWebSse(filename, userId, request: Request): + async def generate_checkCompanyName(filename, userId, request: Request): id = 0 - for i in checkCompanyName(filename,userId): - yield f"id: {id + 1}\n" - yield f"event: checkCompanyName\n" - yield f"data: {i}\n\n" # 发送完成信号 - - headers = { - "Content-Type": "text/event-stream", - "Cache-Control": "no-cache", - "X-Accel-Buffering": "no", - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET,POST", - "Access-Control-Allow-Headers": "x-requested-with,content-type", - } - return Response(generate_checkCompanyName(filename,userId), headers=headers) - - -@app.route('/sse/checkDocumentErrorWeb', methods=['GET']) -def checkDocumentErrorWebSse(): - filename = request.args.get('filename') - userId = request.args.get("userId") - def generate_checkDocumentError(filename,userId): + global outLog + for i in checkCompanyName(filename, userId, outLog): + id += 1 + if await request.is_disconnected(): + yield { + "id": f"{id}", + "event": "checkCompanyName", + "data": "checkCompanyName连接已中断" + } + break + yield { + "id": f"{id}", + "event": "checkCompanyName", + "data": i + } + + g = generate_checkCompanyName(filename, userId, request) + return EventSourceResponse(g) + + +@app.get('/sse/checkDocumentErrorWeb') +def checkDocumentErrorWebSse(filename, userId, request: Request): + async def generate_checkDocumentError(filename, userId, request: Request): id = 0 - for i in checkDocumentError(filename,userId): - yield f"id: {id + 1}\n" - yield f"event: checkDocumentError\n" - yield f"data: {i}\n\n" # 发送完成信号 - - headers = { - "Content-Type": "text/event-stream", - "Cache-Control": "no-cache", - "X-Accel-Buffering": "no", - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET,POST", - "Access-Control-Allow-Headers": "x-requested-with,content-type", - } - return Response(generate_checkDocumentError(filename,userId), headers=headers) - - -@app.route('/sse/checkTitleName', methods=['GET']) -def checkTitleNameWebSse(): - filename = request.args.get('filename') - userId = request.args.get("userId") - def generate_checkTitleName(filename,userId): + global outLog + for i in checkDocumentError(filename, userId, outLog): + id += 1 + if await request.is_disconnected(): + yield { + "id": f"{id}", + "event": "checkDocumentError", + "data": "checkDocumentError连接已中断" + } + break + yield { + "id": f"{id}", + "event": "checkDocumentError", + "data": i + } + + g = generate_checkDocumentError(filename, userId, request) + return EventSourceResponse(g) + + +@app.get('/sse/checkTitleName') +def checkTitleNameWebSse(filename, userId, request: Request): + async def generate_checkTitleName(filename, userId, request: Request): id = 0 - for i in checkTitleName(filename,userId): - yield f"id: {id + 1}\n" - yield f"event: checkTitleName\n" - yield f"data: {i}\n\n" # 发送完成信号 - - headers = { - "Content-Type": "text/event-stream", - "Cache-Control": "no-cache", - "X-Accel-Buffering": "no", - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET,POST", - "Access-Control-Allow-Headers": "x-requested-with,content-type", - } - return Response(generate_checkTitleName(filename,userId), headers=headers) - -@app.route('/sse/getLog', methods=['GET']) -def getlog(): - userId = request.args.get("userId") - def generate_getLog(userId): - time.sleep(1) + global outLog + for i in checkTitleName(filename, userId, outLog): + id += 1 + if await request.is_disconnected(): + yield { + "id": f"{id}", + "event": "checkTitleName", + "data": "checkTitleName连接已中断" + } + break + yield { + "id": f"{id}", + "event": "checkTitleName", + "data": i + } + + g = generate_checkTitleName(filename, userId, request) + return EventSourceResponse(g) + + +@app.get("/sse/getLog") +# @app.route('/sse/getLog', methods=['GET']) +async def getlog(userId, request: Request): + # userId = request.args.get("userId") + async def generate_getLog(userId): id = 0 + global outLog + await asyncio.sleep(5) while True: - if outLog.is_done(userId): + isbreak = outLog.is_done(userId) + if isbreak: + break # 完成了 + text = outLog.get_queueData(userId) + if await request.is_disconnected(): + yield { + "id": f"{id}", + "event": "checkTitleName", + "data": "checkTitleName连接已中断" + } break - q = outLog.get_queueData(userId) - if q: - id+=1 - text = q.pop(0) - yield f"id: {id}\n" - yield f"event: getlog\n" - yield f"data: {text}\n\n" # 发送完成信号 - yield f"id: {id}\n" - yield f"event: getlog\n" - yield f"data: 任务结束!!!!!\n\n" # 发送完成信号 + if text: + id += 1 + yield { + "id": id, + "event": "getlog", + "data": text + } + # yield f"id: {id}\n" + # yield f"event: getlog\n" + # yield f"data: {text}\n\n" # 发送完成信号 + # yield f"id: {id}\n" + # yield f"event: getlog\n" + # yield f"data: 任务结束!!!!!\n\n" # 发送完成信号 + yield { + "id": id, + "event": "getlog", + "data": "任务结束!!!!" + } outLog.del_queue(userId) - headers = { - "Content-Type": "text/event-stream", - "Cache-Control": "no-cache", - "X-Accel-Buffering": "no", - "Access-Control-Allow-Origin": "*", - "Access-Control-Allow-Methods": "GET,POST", - "Access-Control-Allow-Headers": "x-requested-with,content-type", - } - return Response(generate_getLog(userId), headers=headers) + + # headers = { + # "Content-Type": "text/event-stream", + # "Cache-Control": "no-cache", + # "X-Accel-Buffering": "no", + # "Access-Control-Allow-Origin": "*", + # "Access-Control-Allow-Methods": "GET,POST", + # "Access-Control-Allow-Headers": "x-requested-with,content-type", + # } + g = generate_getLog(userId) + return EventSourceResponse(g) + # return Response(generate_getLog(userId), headers=headers) + + if __name__ == '__main__': - app.run(host="0.0.0.0", port=80) + # app.run(host="0.0.0.0", port=80,threaded=True) + # uvicorn.run(app='main:app', host="0.0.0.0", port=80,workers=1) + app.run() diff --git a/myLogger.py b/myLogger.py index 6ea3059..7244d53 100644 --- a/myLogger.py +++ b/myLogger.py @@ -1,117 +1,8 @@ # -*- coding: utf-8 -*- -""" -@author: bingyl123@163.com -@version: 1.0.0 -@file: OutLog.py -@time: 2023/2/23 20:25 -""" -# import logging -# import logging.config -# import re -# import datetime -# import queue -# -# -# class OutLog: -# _instance = None -# logger = None -# -# def __new__(cls): -# if cls._instance is None: -# cls._instance = super(OutLog, cls).__new__(cls) -# cls.logger = logging.getLogger("app") # 默认logger名称为"app" -# cls._instance.queue_dict = {} -# cls._instance.done_dict = {} -# return cls._instance -# -# def get_queue(self, user_id): -# if user_id not in self.queue_dict: -# self.queue_dict[user_id] = [] -# self.done_dict[user_id] = {} # 初始化为未完成的字典 -# return self.queue_dict[user_id] -# -# def mark_done(self, user_id, producer_name): -# self.done_dict[user_id][producer_name] = True -# -# def is_done(self, user_id): -# return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成 -# @staticmethod -# def put(item: str, level="INFO"): -# dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") -# mq.put(f"{dtf}[{level}]: {item}") -# -# @staticmethod -# def debug(item, log=True): -# OutLog.put(item, level="DEBUG") -# if log: -# OutLog._instance.logger.debug(item) -# -# @staticmethod -# def info(item, log=True): -# OutLog.put(item, level="INFO") -# if log: -# OutLog._instance.logger.info(item) -# -# @staticmethod -# def warning(item, log=True): -# OutLog.put(item, level="WARNING") -# if log: -# OutLog._instance.logger.warning(item) -# -# @staticmethod -# def error(item, log=True): -# OutLog.put(item, level="ERROR") -# if log: -# OutLog._instance.logger.error(item) -# -# @staticmethod -# def critical(item, log=True): -# OutLog.put(item, level="CRITICAL") -# if log: -# OutLog._instance.logger.critical(item) -# -# -# -# # 日志配置 -# log_config = { -# 'version': 1, -# 'disable_existing_loggers': False, -# 'formatters': { -# 'standard': { -# 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', -# }, -# }, -# 'handlers': { -# 'console': { -# 'class': 'logging.StreamHandler', -# 'formatter': 'standard', -# 'level': logging.INFO, -# }, -# 'file': { -# 'class': 'logging.FileHandler', -# 'filename': 'Logger.log', -# 'formatter': 'standard', -# 'level': logging.WARNING, -# }, -# }, -# 'loggers': { -# '': { -# 'handlers': ['console', 'file'], -# 'level': logging.WARNING, -# 'propagate': True, -# }, -# } -# } -# -# logging.config.dictConfig(log_config) -# -# outLog = OutLog() # 获取单例实例 - - - import logging import logging.config import datetime +import redis class OutLog: _instance = None @@ -121,35 +12,49 @@ class OutLog: if cls._instance is None: cls._instance = super(OutLog, cls).__new__(cls) cls.logger = logging.getLogger("app") # 默认logger名称为"app" - cls._instance.queue_dict = {} - cls._instance.done_dict = {} + # cls._instance.queue_dict = {} + # cls._instance.done_dict = {} + # 初始化 Redis 连接 + cls._instance.redis_client = redis.StrictRedis(host='localhost', port=6379, password="root",db=0, decode_responses=True) return cls._instance - def get_queue(self, user_id,producer_name): - if user_id not in self.queue_dict: - self.queue_dict[user_id] = [] - self.done_dict[user_id] = {} # 初始化为未完成的字典 - if user_id not in self.done_dict: - self.done_dict[user_id][producer_name] = False + def get_queue(self,user_id,producer_name): + # if user_id not in self.queue_dict: + # self.queue_dict[user_id] = [] + # self.done_dict[user_id]={} + # self.done_dict[user_id][producer_name] = False # 初始化为未完成的字典 + # 使用 Redis 进行存储和查询 + if not self.redis_client.exists(f"queue:{user_id}"): + # self.redis_client.rpush(f"queue:{user_id}") + self.logger.info(f"queue:{user_id}") + self.redis_client.hset(f"done:{user_id}", producer_name, "0") # 初始化为未完成 return self.UserLogger(user_id) def get_queueData(self, user_id): - if user_id in self.queue_dict: - return OutLog._instance.queue_dict[self.user_id] + # if user_id in self.queue_dict: + # return self.queue_dict[user_id] + if self.redis_client.exists(f"queue:{user_id}"): + return self.redis_client.lpop(f"queue:{user_id}") # 获取队列首个并删除数据 def del_queue(self,user_id): + # if self.is_done(user_id): + # del self.queue_dict[user_id] + # del self.done_dict[user_id] if self.is_done(user_id): - del self.queue_dict[user_id] - del self.done_dict[user_id] + self.redis_client.delete(f"queue:{user_id}") + self.redis_client.delete(f"done:{user_id}") class UserLogger: def __init__(self, user_id): self.user_id = user_id self.logger = OutLog._instance.logger def log(self, item: str, level: str): + self._log_to_logger(item, level) + if(level != "INFO"): + return dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") log_entry = f"{dtf}[{level}]: {item}" - OutLog._instance.queue_dict[self.user_id].append(log_entry) # 保存到对应用户的队列 - self._log_to_logger(item, level) - + # print(log_entry) + # OutLog._instance.queue_dict[self.user_id].append(log_entry) # 保存到对应用户的队列 + OutLog._instance.redis_client.rpush(f"queue:{self.user_id}", log_entry) # 保存到对应用户的队列 def _log_to_logger(self, item: str, level: str): if level == "DEBUG": self.logger.debug(item) @@ -177,11 +82,17 @@ class OutLog: def critical(self, item: str): self.log(item, "CRITICAL") + # def mark_done(self, user_id, producer_name): + # self.done_dict[user_id][producer_name] = True + # def is_done(self, user_id): + # # print(self.done_dict.get(user_id, {}),self.done_dict.get(user_id, {}).values()) + # return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成 def mark_done(self, user_id, producer_name): - self.done_dict[user_id][producer_name] = True + self.redis_client.hset(f"done:{user_id}", producer_name, "1") def is_done(self, user_id): - return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成 + done_dict = self.redis_client.hgetall(f"done:{user_id}") + return all(value == "1" for value in done_dict.values()) if done_dict else False # 检查所有生产者是否完成 # 日志配置 @@ -203,13 +114,13 @@ log_config = { 'class': 'logging.FileHandler', 'filename': 'Logger.log', 'formatter': 'standard', - 'level': logging.WARNING, + 'level': logging.INFO, }, }, 'loggers': { '': { 'handlers': ['console', 'file'], - 'level': logging.WARNING, + 'level': logging.INFO, 'propagate': True, }, }