diff --git a/checkCompanyName.py b/checkCompanyName.py
index 4d2f1fd..1735ff2 100644
--- a/checkCompanyName.py
+++ b/checkCompanyName.py
@@ -8,9 +8,10 @@ import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import requests
-from myLogger import outLog
+# from myLogger import outLog
import time
+
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
@@ -31,9 +32,9 @@ _SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
-outLog.logger = logging.getLogger("checkCompanyName")
-userLog=None
-prompt ='''
+# outLog.logger = logging.getLogger("checkCompanyName")
+userLog = None
+prompt = '''
.根据上述文本判断,是否为具体的公司或组织名称,你可以使用工具利用互联网查询,
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校,行业类型,其他]选项中选择答案,
回答格式[{“companyName”:“名称”,"回答":"答案"},{“companyName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
@@ -54,8 +55,8 @@ def getDocxToTextAll(name):
docxPath = name
loopCount = 0
while True:
- loopCount+=1
- if(loopCount>=15):
+ loopCount += 1
+ if (loopCount >= 60):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
@@ -76,17 +77,16 @@ def getDocxToTextAll(name):
words.append(text)
# 将所有段落文本拼接成一个字符串,并用换行符分隔
text = '\n'.join(words)
- # userLog.info("checkCompanyName----保存文件")
# 将文本写入txt文件
with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
txt_file.write(text)
def companyNameTask(text):
- yield "文档公司或组织名称检查---启动中...."
- userLog.info("checkCompanyName----启动中....")
- batchNum = 20
- sentences = re.split(r'[。\n]', text)
+ yield "文档公司或组织名称检查---文档解析中...."
+ userLog.info("文档公司或组织名称检查---任务开始")
+ batchNum = 5
+ sentences = re.split(r'[、,。\n]', text)
# 去掉空字符
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
# 计算总字符数
@@ -101,19 +101,19 @@ def companyNameTask(text):
# 打印每一份的内容
for i, chunk in enumerate(chunks):
yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"
- userLog.info(f"checkCompanyName----文档解析进度:{i + 1}/{num_chunks}")
try:
- wenBen = ".".join(chunk)
- url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
+ # wenBen = ".".join(chunk)
+ url = "http://0.0.0.0:8191/taskflow/checkPlaceNameServer"
headers = {"Content-Type": "application/json"}
data = {
"data": {
- "text": wenBen,
+ "text": chunk,
+ # "text":wenBen
}
}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text)
- # userLog.info(res)
+ res = res["data"]
# print(res)
except Exception as e:
userLog.warning(chunk)
@@ -121,44 +121,52 @@ def companyNameTask(text):
userLog.warning(e)
return
isplace = False
- for zuhe in res["result"]:
+
+ # for zuhe in res:
+ # # 上一个的地名,这一个还是地名,就和上一个相加代替这个
+ # if isplace:
+ # name = placeList[len(placeList) - 1]
+ # if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns"
+ # isplace = True
+ # new_text = zuhe[0].replace("\n", "")
+ # placeList[len(placeList) - 1] = name + new_text
+ # continue
+ # if zuhe[1].find("组织机构类") >= 0:
+ # isplace = True
+ # new_text = zuhe[0].replace("\n", "")
+ # placeList.append(new_text)
+ # else:
+ # isplace = False
+ ##案例[[('目前', 'TIME'), ('江北区历史文化档案馆', 'ORG')], [('宁波国研简直,并且在东软', 'ORG'), ('宁波市北仑区教育局', 'ORG'), ('国研信息', 'ORG'), ('浙江省', 'LOC'), ('宁波市金凤区', 'LOC'), ('金凤区', 'LOC')]]
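+    # res is a list per sentence of (text, label) tuples (see the example above); only ORG entities are kept.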
+ for zuhe in res:
# 上一个的地名,这一个还是地名,就和上一个相加代替这个
- if isplace:
- name = placeList[len(placeList) - 1]
- if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns"
- isplace = True
- new_text = zuhe[0].replace("\n", "")
- placeList[len(placeList) - 1] = name + new_text
- continue
- if zuhe[1].find("组织机构类") >= 0:
- isplace = True
- new_text = zuhe[0].replace("\n", "")
- placeList.append(new_text)
- else:
- isplace = False
+ for chid in zuhe:
+ if (chid[1] == "ORG"):
+ new_text = chid[0].replace("\n", "")
+ placeList.append(new_text)
# 打印总份数
yield "文档公司或组织名称检查---文档解析完成"
- userLog.info("checkCompanyName----文档解析完成")
placeList = list(dict.fromkeys(placeList))
+ userLog.debug(placeList)
yield placeList
- userLog.info(placeList)
-def checkCompanyName(filename,user_id):
+
+def checkCompanyName(filename, user_id, outLog):
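+    # outLog is now injected by the caller (see main.py) rather than imported at module level.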
yield f"文档公司或组织名称检查---开始处理文档..."
global userLog
- userLog=outLog.get_queue(user_id, "checkCompanyName")
+ userLog = outLog.get_queue(user_id, "checkCompanyName")
try:
getDocxToTextAll(filename)
except Exception as e:
userLog.warning(e)
userLog.warning("文档公司或组织名称检查---文档无法打开,请检查文档内容")
- yield "文档公司或组织名称检查---文档无法打开,请检查文档内容"
+ yield "文档公司或组织名称检查---文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。"
outLog.mark_done(user_id, "checkCompanyName")
return
with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
gettext = f.read()
yield f"文档公司或组织名称检查---开始解析文档..." # 每次生成一个数字就发送
- userLog.info("checkCompanyName----开始解析文档...")
+ final_list = ""
for item in companyNameTask(gettext):
if isinstance(item, str):
yield item
@@ -174,7 +182,6 @@ def checkCompanyName(filename,user_id):
if cishu > 3:
cishu = 0
yield "文档公司或组织名称检查---结果生成中" + '.' * cishu
- userLog.info(f"checkCompanyName----结果生成中" + '.' * cishu)
cishu += 1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
@@ -182,14 +189,15 @@ def checkCompanyName(filename,user_id):
for place in parsed_data:
try:
- if place['回答'] == '非泛化的公司或组织名称':
+ if place['回答'] == '具体的公司或组织名称':
+ if (place["companyName"] == "北京国研科技咨询有限公司浙江分公司"):
+ continue
error_places.append(place)
except Exception as e:
userLog.warning(place)
userLog.warning(e)
userLog.warning("文档公司或组织名称检查---组织提出出错")
continue
- userLog.info(error_places)
returnInfo = "发现异常公司或组织名称
"
if len(error_places) > 0:
for t in error_places:
@@ -199,9 +207,9 @@ def checkCompanyName(filename,user_id):
t["yuanwen"] = paragraphs[0]
yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
returnInfo += "原文:" + yuanwen + "
异常公司或组织名称:**" + keyword + "**!请注意" + "
"
- userLog.info(returnInfo)
+ userLog.info("文档公司或组织名称检查---原文:" + yuanwen + "异常公司或组织名称:" + keyword + "!请注意")
yield returnInfo
else:
yield "**未发现异常公司或组织名称**
"
- userLog.info("**未发现异常公司或组织名称**
")
- outLog.mark_done(user_id, "checkCompanyName")
\ No newline at end of file
+ userLog.info("文档公司或组织名称检查---未发现异常公司或组织名称")
+ outLog.mark_done(user_id, "checkCompanyName")
diff --git a/checkDocumentError.py b/checkDocumentError.py
index 33d7ed4..8728136 100644
--- a/checkDocumentError.py
+++ b/checkDocumentError.py
@@ -8,7 +8,7 @@ import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import requests
-from myLogger import outLog
+# from myLogger import outLog
import time
def load_from_xml_v2(baseURI, rels_item_xml):
"""
@@ -27,9 +27,9 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
-import logging
+# import logging
-outLog.logger = logging.getLogger("checkDocumentError")
+# outLog.logger = logging.getLogger("checkDocumentError")
userLog=None
llm_cfg = {
# 'model': 'qwen1.5-72b-chat',
@@ -40,7 +40,7 @@ llm_cfg = {
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
-
+ system_message="你是一个错别字分析大师"
)
# prompt='''
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案,
@@ -48,25 +48,25 @@ bot = Assistant(llm=llm_cfg,
# '''
prompt = '''
请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出详细的解析,没有错则不用给解析
-回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}],不做过多的解释,严格按回答格式作答;
+回答格式请按照以下json格式[{"placeName":"序号值","回答":"答案","解析":"解析内容"},{"placeName":"序号值","回答":"答案","解析":"解析内容"}],不做过多的解释,严格按回答格式作答;
'''
def getDocxToTextAll(name):
- userLog.info("checkDocumentError----打开文档")
docxPath = name
loopCount = 0
- while True:
- loopCount+=1
- if(loopCount>=15):
- raise Exception("文档读取超时,或文档存在问题无法读取")
- break
- try:
- document = Document(docxPath)
- break
- except Exception as e:
- time.sleep(1)
- pass
+ document = Document(docxPath)
+ # while True:
+ # loopCount+=1
+ # if(loopCount>=60):
+ # raise Exception("文档读取超时,或文档存在问题无法读取")
+ # break
+ # try:
+ # document = Document(docxPath)
+ # break
+ # except Exception as e:
+ # time.sleep(1)
+ # pass
# 逐段读取docx文档的内容
words = []
for paragraph in document.paragraphs:
@@ -84,23 +84,21 @@ def getDocxToTextAll(name):
txt_file.write(text)
-def checkDocumentError(filename,user_id):
+def checkDocumentError(filename,user_id,outLog):
global userLog
userLog=outLog.get_queue(user_id,"checkDocumentError")
yield f"文档纠错---开始处理文档..."
- userLog.info("checkDocumentError----开始处理文档...")
try:
getDocxToTextAll(filename)
except Exception as e:
userLog.warning(e)
userLog.warning("文档纠错----文档无法打开,请检查文档内容")
- yield "文档纠错----文档无法打开,请检查文档内容"
+ yield "文档纠错----文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。"
outLog.mark_done(user_id, "checkDocumentError")
return
with open("checkDocumentError.txt", "r", encoding='utf-8') as f:
gettext = f.read()
yield f"文档纠错---开始解析文档..." # 每次生成一个数字就发送
- userLog.info("checkDocumentError----开始解析文档...")
final_list = []
for item in documentErrorTask(gettext):
if isinstance(item, str):
@@ -113,12 +111,11 @@ def checkDocumentError(filename,user_id):
yuanwen = i["placeName"].replace("\n", "")
jianyi = i["jianyi"].replace("\n", "")
resInfo += "原文:" + yuanwen + "
建议:**" + jianyi + "**
"
- userLog.info(resInfo)
yield resInfo
else:
yield "**未发现错别字**"
- userLog.info("未发现错别字")
+ userLog.info("文档纠错---未发现错别字")
outLog.mark_done(user_id,"checkDocumentError")
@@ -129,27 +126,33 @@ def documentErrorTask(text):
:param batch_size: 每批处理的字符数
:return: 生成器,每次返回一批文本
"""
- yield "文档纠错---启动中...."
- userLog.info("checkDocumentError----启动中....")
+ yield "文档纠错---文档解析中...."
+ userLog.info("文档纠错---任务开始")
batchNum = 20
sentences = re.split(r'[。\n]', text)
# 去掉空字符
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
# 计算总字符数
total_chars = len(sentences)
-
# 计算有多少份
num_chunks = math.ceil(total_chars / batchNum)
-
# 按batchNum字为一份进行处理
chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
# 打印每一份的内容
err = []
for i, chunk in enumerate(chunks):
yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
- userLog.info(f"checkDocumentError----文档解析进度:{i + 1}/{num_chunks}")
try:
- url = "http://0.0.0.0:8190/taskflow/checkDocumentError"
+ # url = "http://0.0.0.0:8190/taskflow/checkDocumentError"
+ # headers = {"Content-Type": "application/json"}
+ # data = {
+ # "data": {
+ # "text": chunk,
+ # }
+ # }
+ # r = requests.post(url=url, headers=headers, data=json.dumps(data))
+ # res = json.loads(r.text)
+ url = "http://127.0.0.1:5001/taskflow/checkDocumentError"
headers = {"Content-Type": "application/json"}
data = {
"data": {
@@ -158,12 +161,13 @@ def documentErrorTask(text):
}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text)
- # print(res)
except Exception as e:
userLog.warning(chunk)
- userLog.warning("文档纠错--错别字识别出错\n", e)
+ userLog.warning("文档纠错--错别字识别出错\n")
+ userLog.warning(e)
continue
- lines_with_greeting = [place for place in res["result"] if len(place['errors']) > 0]
+ lines_with_greeting = [place for place in res["data"] if len(place['errors']) > 0]
+ userLog.debug(lines_with_greeting)
if len(lines_with_greeting) > 0:
num = 0
wenti = [] # 记录问题的数组
@@ -173,26 +177,28 @@ def documentErrorTask(text):
keyword = t['source']
keyword_list.append(keyword)
for item in t["errors"]:
- for key, value in item['correction'].items():
- temp_errorWords.append(key)
+ # for key, value in item['correction'].items():
+ # temp_errorWords.append(key)
+ temp_errorWords.append(item[0])
wenti.append(
- "序号:{},原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
+ # "{}:原文是{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
+                    "{}:原文是{}。问题:当前原文是否存在错别字,只检查错别字,其他不做分析".format(num, keyword))
num += 1
words = "\n".join(wenti)
+ userLog.debug(words)
messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
runList = []
yield f"文档纠错---内容解析中..." # 每次生成一个数字就发送
- userLog.info(f"checkDocumentError----内容解析中...")
cishu = 0
for rsp in bot.run(messages):
runList.append(rsp)
if cishu > 3:
cishu = 0
yield "文档纠错---内容解析中" + '.' * cishu
- userLog.info(f"checkDocumentError----内容解析中内容解析中" + '.' * cishu)
cishu += 1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
+ userLog.debug(parsed_data)
resListerr = []
for place in parsed_data:
try:
@@ -200,14 +206,16 @@ def documentErrorTask(text):
place["placeName"] = keyword_list[int(place["placeName"])]
place["jianyi"] = place["解析"]
resListerr.append(place)
+                userLog.info("文档纠错---原文:" + place["placeName"] + "<br>建议:" + place["jianyi"])
except Exception as e:
userLog.warning(parsed_data)
userLog.warning(place)
- userLog.warning("文档纠错--错别字提取出错\n", e)
+ userLog.warning("文档纠错--错别字提取出错\n")
+ userLog.warning(e)
continue
if (len(resListerr) > 0):
err.extend(resListerr)
# 打印总份数
- yield "文档地名检查---文档解析完成"
- userLog.info(err)
- yield err
+ yield "文档纠错---文档解析完成"
+ userLog.info("文档纠错---任务结束")
+ yield err
\ No newline at end of file
diff --git a/checkPlaceName.py b/checkPlaceName.py
index 851827d..5c69bc3 100644
--- a/checkPlaceName.py
+++ b/checkPlaceName.py
@@ -87,7 +87,6 @@ def getDocxToTextAll(docxPath):
#得到全文和地名有关的内容
def placeNameTask(text):
yield "文档地名检查---启动中...."
- userLog.info("checkPlaceName----启动中....")
batchNum=20
sentences = re.split(r'[。\n]', text)
# 去掉空字符
@@ -104,7 +103,6 @@ def placeNameTask(text):
# 打印每一份的内容
for i, chunk in enumerate(chunks):
yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"
- userLog.info(f"checkPlaceName----文档解析进度:{i + 1}/{num_chunks}")
wenBen=".".join(chunk)
try:
url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
@@ -139,7 +137,6 @@ def placeNameTask(text):
isplace = False
# 打印总份数
yield "文档地名检查---文档解析完成"
- userLog.info("checkPlaceName---文档解析完成")
placeList=list(dict.fromkeys(placeList))
yield placeList
@@ -175,7 +172,6 @@ def checkPlaceName(filename,user_id):
if cishu>3:
cishu=0
yield "文档地名检查---结果生成中"+'.'*cishu
- userLog.info("checkPlaceName---结果生成中"+'.'*cishu)
cishu+=1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
@@ -186,12 +182,11 @@ def checkPlaceName(filename,user_id):
if place['回答'] == '错误':
error_places.append(place)
except Exception as e:
- userLog.warning(parsed_data)
userLog.warning(place)
+ userLog.warning(parsed_data)
userLog.warning("文档地名检查---组织提出出错")
userLog.warning(e)
continue
- userLog.info(error_places)
returnInfo = "发现异常地名
"
if len(error_places)>0:
for t in error_places:
@@ -200,9 +195,9 @@ def checkPlaceName(filename,user_id):
paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","")
returnInfo+="原文:" + yuanwen + "
出现异常地名:**" + keyword + "**!请注意" + "
"
- userLog.info(returnInfo)
+ userLog.info("文档地名检查---原文:" + yuanwen + "出现异常地名:" + keyword + "!请注意")
yield returnInfo
else:
yield "**未发现发现异常地名**"
- userLog.info("未发现发现异常地名")
- outLog.mark_done(user_id, "checkPlaceName")
\ No newline at end of file
+ userLog.info("文档地名检查---未发现发现异常地名")
+ outLog.mark_done(user_id, "checkPlaceName")
\ No newline at end of file
diff --git a/checkRepeatText.py b/checkRepeatText.py
index c8688e7..ee5309e 100644
--- a/checkRepeatText.py
+++ b/checkRepeatText.py
@@ -7,6 +7,7 @@ from qwen_agent.agents import Assistant
import json_repair
import json
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
+# embeddings = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese",model_kwargs={"device":"npu:5"})
device_id=0
import re
import time
@@ -17,9 +18,9 @@ from docx.opc.oxml import parse_xml
import logging
import logging.config
import requests
-from myLogger import outLog
+# from myLogger import outLog
-outLog.logger = logging.getLogger("checkRepeatText")
+# outLog.logger = logging.getLogger("checkRepeatText")
userLog=None
def load_from_xml_v2(baseURI, rels_item_xml):
"""
@@ -79,11 +80,10 @@ def isTitle(paragraph):
#寻找标题名称
def findTitleName(docxPath):
- yield '文档相似性检查----检查是否存在详细设计方案'
loopCount = 0
while True:
loopCount+=1
- if(loopCount>=15):
+ if(loopCount>=60):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
@@ -95,9 +95,19 @@ def findTitleName(docxPath):
# 逐段读取docx文档的内容
titleWords=[]
firstTitle = 0
+ firstTitleName=""
secondTitle = 0
sanjiTitle = 0
+ levelText=""
+ count = 0
+ numid =0
+ wordContent={}
+ total = len(document.paragraphs)
+ addStart = False#是否重新添加
+ yield "文档相似性检查----文档内容解析中",str(count),str(total)
for paragraph in document.paragraphs:
+ count+=1
+ yield "文档相似性检查----文档内容解析中",str(count),str(total)
# 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法
text = paragraph.text
@@ -109,6 +119,8 @@ def findTitleName(docxPath):
if(text.find("附件")>=0):
continue
titleWords.append("一级标题:".format(firstTitle)+text)
+ addStart=True
+ firstTitleName=text
elif level=="1":
secondTitle+=1
sanjiTitle=0
@@ -118,15 +130,28 @@ def findTitleName(docxPath):
sanjiTitle += 1
# words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
# titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text)
+ ##先判断是不是一级标题
+ if addStart:
+ wordContent[firstTitleName]=[]
+ addStart=False
+ if level:
+ levelText=f"{int(level)+1}级标题-"+text
+ else:
+ if(text.startswith("图") or text.startswith("注:")):
+ continue
+ if (len(text)>30 and firstTitleName):
+ numid+=1
+ wordContent[firstTitleName].append("{}:".format(levelText)+text)
findTitleName_llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
+ yield '文档相似性检查----检查是否存在详细设计方案'
findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
name='Assistant',
- # system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题'
+ system_message='按照要求选择最合适的,是唯一的'
)
prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择
@@ -142,60 +167,78 @@ def findTitleName(docxPath):
runList.append(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
- if(parsed_data["answer"]=="存在"):
- yield parsed_data["name"]
- else:
- yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
+ try:
+ if(parsed_data["answer"]=="存在"):
+ yield parsed_data["name"],wordContent
+ else:
+ yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
+ except Exception as e:
+ userLog.warning(e)
+ userLog.warning(data)
+ userLog.warning(parsed_data)
+ yield "文档相似性检查----检查遇到问题,请联系管理员"
#获取文档中 详细设计方案 章节的所有内容
-def getDocxToText(docxPath,titleName,vector_store_path):
- loopCount = 0
- while True:
- loopCount+=1
- if(loopCount>=15):
- raise Exception("文档读取超时,或文档存在问题无法读取")
- break
- try:
- document = Document(docxPath)
- break
- except Exception as e:
- time.sleep(1)
- pass
- # 逐段读取docx文档的内容
- levelList=[]
+# def getDocxToText(docxPath,titleName,vector_store_path):
+def getDocxToText(titleName,wordContent,vector_store_path):
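+    # Rewritten as a generator: yields ("status", ".", ".") progress tuples, then finally (words, uuids, vectorstore).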
+
+ # loopCount = 0
+ # while True:
+ # loopCount+=1
+ # if(loopCount>=15):
+ # raise Exception("文档读取超时,或文档存在问题无法读取")
+ # break
+ # try:
+ # document = Document(docxPath)
+ # break
+ # except Exception as e:
+ # time.sleep(1)
+ # pass
+ # # 逐段读取docx文档的内容
+ # levelList=[]
words=[]
- addStart = False
- levelText=""
- i = 0
- for paragraph in document.paragraphs:
- # 判断该段落的标题级别
- # 这里用isTitle()临时代表,具体见下文介绍的方法
- text = paragraph.text
- if text.strip():#非空判断
- if titleName:
- level = isTitle(paragraph)
- if(addStart and level=="0"):
- addStart=False
- if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)):
- addStart=True
- if level:
- levelList.append("{}:".format(level)+paragraph.text)
- levelText=f"{int(level)+1}级标题-"+text
- else:
- if addStart:
- if(text.startswith("图") or text.startswith("注:")):
- continue
- if(len(text)>30):
- i=i+1
- words.append("{}:".format(levelText)+text)
+ # addStart = False
+ # levelText=""
+ # i = 0
+ # count = 0
+ # total = len(document.paragraphs)
+ # yield "文档相似性检查----文档内容解析中",count,total
+ # for paragraph in document.paragraphs:
+ # count+=1
+ # yield "文档相似性检查----文档内容解析中",count,total
+ # # 判断该段落的标题级别
+ # # 这里用isTitle()临时代表,具体见下文介绍的方法
+ # text = paragraph.text
+ # if text.strip():#非空判断
+ # if titleName:
+ # level = isTitle(paragraph)
+ # if(addStart and level=="0"):
+ # addStart=False
+ # if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)):
+ # addStart=True
+ # if level:
+ # levelList.append("{}:".format(level)+paragraph.text)
+ # levelText=f"{int(level)+1}级标题-"+text
+ # else:
+ # if addStart:
+ # if(text.startswith("图") or text.startswith("注:")):
+ # continue
+ # if(len(text)>30):
+ # i=i+1
+ # words.append("{}:".format(levelText)+text)
# 将所有段落文本拼接成一个字符串,并用换行符分隔
+ # 遍历字典,查找包含 "标题的" 的键
+ for key, value in wordContent.items():
+ if (titleName.find(key)>=0 or key.find(titleName)>=0):
+ words.extend(value) # 将对应的值添加
if len(words)==0:
raise Exception("checkRepeatText,获取长度为0")
text = '\n'.join(words)
-
+ userLog.info(f"文档相似性检查----需要处理的总数是{len(words)}")
# 将文本写入txt文件
with open("checkRepeatText.txt", 'w', ) as txt_file:
txt_file.write(text)
- time.sleep(3)
+ time.sleep(1)
+ yield "文档相似性检查----文档内容转换中",".","."
loader = TextLoader(file_path='checkRepeatText.txt')
docs = loader.load()
# print(docs)
@@ -204,44 +247,56 @@ def getDocxToText(docxPath,titleName,vector_store_path):
splits = text_splitter.split_documents(docs)
uuids = []
+ yield "文档相似性检查----文档保存中",".","."
+ global embeddings
+ vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
for i in range(len(splits)):
- uuids.append(str(uuid.uuid4()))
+ uuidStr=str(uuid.uuid4())
+ uuids.append(uuidStr)
logging.info(f"checkRepeatTextuuidLen{len(uuids)}")
- vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
vectorstore.add_documents(documents=splits, ids=uuids)
+ yield "文档相似性检查----校验文档是否已经完成保存",".","."
while True:
time.sleep(0.3)
ress = vectorstore.similarity_search(words[0])
if (len(ress) > 0):
break
- return words,uuids,vectorstore
+ yield words,uuids,vectorstore
# @app.route('/checkRepeatText/', methods=['GET'])
-def checkRepeatText(filename,user_id):
+def checkRepeatText(filename,user_id,outLog):
global userLog
userLog=outLog.get_queue(user_id,"checkRepeatText")
yield "文档相似性检查---启动中...."
+ userLog.info("文档相似性检查---任务开始")
vector_store_path="vector_store"+str(uuid.uuid4())
for titleName in findTitleName(filename):
- yield titleName
- if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"):
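+        # findTitleName now yields progress 3-tuples, then either a (titleName, wordContent) pair or a plain error string.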
+ if(isinstance(titleName ,tuple)):
+ if(len(titleName)==3):
+ yield titleName[0]+titleName[1]+"/"+titleName[2]
+ else:
+ yield titleName
+ if(isinstance(titleName ,tuple)):
+ # try:
+ yield "文档相似性检查----文档内容转换中"
try:
- yield "文档相似性检查----文档内容解析中"
- words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path)
+ for words,uuids,vectorstore in getDocxToText(titleName[0],titleName[1],vector_store_path):
+ if isinstance(words, str):
+ yield words+uuids+vectorstore
except Exception as e:
- yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败"
+ yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。"
userLog.warning(e)
userLog.warning(f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败")
outLog.mark_done(user_id, "checkRepeatText")
return
- # 记录程序开始的时间戳‘
+ # 记录程序开始的时间戳‘
reslist = []
count = 0
for i in words:
count += 1
- yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}"
+ yield f"文档相似性检查--对{titleName[0]}章节,进行文档内容检查中{count}/{len(words)}"
result = vectorstore.similarity_search(i)
textTag = i.split(":")[0]
for content in result:
@@ -259,6 +314,7 @@ def checkRepeatText(filename,user_id):
}
r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text)
+ res=res["data"]
# res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]])
except Exception as e:
userLog.warning("文档相似性检查--发生异常:")
@@ -266,7 +322,7 @@ def checkRepeatText(filename,user_id):
userLog.warning(i)
userLog.warning(text)
continue
- if (res["result"][0]["similarity"] > 0.90):
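+                    # Only pairs with similarity >= 0.96 are treated as repeated content.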
+ if (res[0]["similarity"] >= 0.96):
# 判断重复内容是否被放入
if (len(reslist) > 0):
isExist = False
@@ -276,15 +332,15 @@ def checkRepeatText(filename,user_id):
break
if not isExist:
# reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:],"similarity":res[0]["similarity"]})
-                        userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"<br>在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"】")
- reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]})
+                        userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"<br>在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res[0]["similarity"])+"】")
+ reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
else:
- reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]})
+ reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
# print(i.split(":")[1] + "\n" + text.split(":")[1])
-                    userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"<br>在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"】")
+                    userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"<br>在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res[0]["similarity"])+"】")
# vectorstore.delete(ids=uuids)
shutil.rmtree(vector_store_path)
-        resInfo=f"对{titleName}章节,发现相似内容:<br>"
+        resInfo=f"对{titleName[0]}章节,发现相似内容:<br>"
if(len(reslist)>0):
for res in reslist:
resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find(':')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find(':') + 1:]+"
在**"+res["yuanwen2"][:res["yuanwen2"].find(':')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find(':') + 1:]+"
以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】
"
diff --git a/checkTitleName.py b/checkTitleName.py
index 7a0c25b..d2eee5f 100644
--- a/checkTitleName.py
+++ b/checkTitleName.py
@@ -8,7 +8,9 @@ import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
-from myLogger import outLog
+
+
+# from myLogger import outLog
def load_from_xml_v2(baseURI, rels_item_xml):
"""
@@ -29,11 +31,11 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
-outLog.logger = logging.getLogger("checkTitleName")
-userLog=None
+# outLog.logger = logging.getLogger("checkTitleName")
+userLog = None
llm_cfg = {
- #'model': 'qwen1.5-72b-chat',
- 'model':"qwen2-72b-instruct",
+ # 'model': 'qwen1.5-72b-chat',
+ 'model': "qwen2-72b-instruct",
'model_server': 'DashScope', # base_url, also known as api_base
'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
@@ -81,12 +83,13 @@ def isTitle(paragraph):
# 如果在段落、样式里都没有找到大纲级别,返回None
return None
-#获取文档中 详细设计方案 章节的所有内容
+
+# 获取文档中 详细设计方案 章节的所有内容
def getDocxToTitleName(docxPath):
loopCount = 0
while True:
- loopCount+=1
- if(loopCount>=15):
+ loopCount += 1
+ if (loopCount >= 60):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
@@ -96,64 +99,72 @@ def getDocxToTitleName(docxPath):
time.sleep(1)
pass
# 逐段读取docx文档的内容
- levelList=[]
- words=[]
+ levelList = []
+ words = []
addStart = False
- levelText=""
- i = 0
+ levelText = ""
+ count = 0
+ total = len(document.paragraphs)
+ yield f"文档结构检查----文档内容解析中{str(count)}/{str(total)}"
for paragraph in document.paragraphs:
+ count += 1
+ yield f"文档结构检查----文档内容解析中{str(count)}/{str(total)}"
# 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法
text = paragraph.text
- if text.strip():#非空判断
+ if text.strip(): # 非空判断
level = isTitle(paragraph)
- if level=="0":
+ if level == "0":
words.append(text)
- return words
+ yield words
-def checkTitleName(filename,user_id):
+
+def checkTitleName(filename, user_id, outLog):
global userLog
- userLog=outLog.get_queue(user_id,"checkTitleName")
+ userLog = outLog.get_queue(user_id, "checkTitleName")
yield '文档结构检查----启动中'
- userLog.info("checkTitleName----启动中")
- with open("ce模板.txt", "r",encoding='utf-8') as f:
+ userLog.info("文档结构检查---任务开始")
+ with open("ce模板.txt", "r", encoding='utf-8') as f:
gettext = f.readlines()
- count=0
+ count = 0
reserr = []
try:
- word = getDocxToTitleName(filename)
+ for i in getDocxToTitleName(filename):
+ word = i
+ if (isinstance(word, str)):
+ yield word
+ continue
except Exception as e:
userLog.warning(e)
- yield "文档结构检查----文档无法打开,请检查文档内容"
- outLog.mark_done(user_id, "checkTitleName")
+ yield "文档结构检查----文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。"
userLog.warning("checkTitleName----文档无法打开,请检查文档内容")
+ outLog.mark_done(user_id, "checkTitleName")
return
for text in gettext:
- count+=1
+ count += 1
prompt = f'''
\n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
'''
- xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
+ xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
- userLog.info(f"checkTitleName----结构分析中{count}/{len(gettext)}")
- strword = "\n".join(word)+prompt+xushang
- messages = [{'role': 'user', 'content': [{'text':strword}]}]
+ strword = "\n".join(word) + prompt + xushang
+ messages = [{'role': 'user', 'content': [{'text': strword}]}]
runList = []
for rsp in bot.run(messages):
runList.append(rsp)
# print(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
- if(parsed_data["answer"]=="不存在"):
+ if (parsed_data["answer"] == "不存在"):
reserr.append(text)
-
- resInfo="文档结构存在异常:
"
- if(len(reserr)>0):
+ userLog.info("文档结构检查----文档结构存在异常:" + text.replace('\n', ''))
+ resInfo = "文档结构存在异常:
"
+ if (len(reserr) > 0):
for i in reserr:
-            resInfo+="**"+i.replace('\n','')+"**<br>"
- userLog.info(resInfo)
+            resInfo += "**" + i.replace('\n', '') + "**<br>"
+
yield resInfo
else:
- yield "文档结构未发现异常"
- userLog.info("文档结构未发现异常")
- outLog.mark_done(user_id, "checkTitleName")
+ yield "**文档结构未发现异常**"
+ userLog.info("文档结构检查----文档结构未发现异常")
+ outLog.mark_done(user_id, "checkTitleName")
diff --git a/daijian方案.py b/daijian方案.py
index 19badae..5210e54 100644
--- a/daijian方案.py
+++ b/daijian方案.py
@@ -1,11 +1,24 @@
-from docx import Document
-from pprint import pprint
+import uuid
+from langchain_community.embeddings import DashScopeEmbeddings
+from langchain_community.document_loaders import TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
from qwen_agent.agents import Assistant
-import re
import json_repair
-import math
+import json
+embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
+device_id=0
+import re
+import time
+from docx import Document
+import shutil
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
+import logging
+import logging.config
+import requests
+from collections import defaultdict
+
+userLog=None
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
@@ -23,17 +36,6 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
-llm_cfg = {
- #'model': 'qwen1.5-72b-chat',
- 'model':"qwen2-72b-instruct",
- 'model_server': 'DashScope', # base_url, also known as api_base
- 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
-}
-bot = Assistant(llm=llm_cfg,
- name='Assistant',
- )
-
-
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
"""
@@ -73,15 +75,26 @@ def isTitle(paragraph):
# 如果在段落、样式里都没有找到大纲级别,返回None
return None
-#获取文档中 详细设计方案 章节的所有内容
-def getDocxToTitleName(docxPath):
- document = Document(docxPath)
+#寻找标题名称
+def findTitleName(docxPath):
+ yield '文档相似性检查----检查是否存在详细设计方案'
+ loopCount = 0
+ while True:
+ loopCount+=1
+ if(loopCount>=15):
+ raise Exception("文档读取超时,或文档存在问题无法读取")
+ break
+ try:
+ document = Document(docxPath)
+ break
+ except Exception as e:
+ time.sleep(1)
+ pass
# 逐段读取docx文档的内容
- levelList=[]
- words=[]
- addStart = False
- levelText=""
- i = 0
+ titleWords=[]
+ firstTitle = 0
+ secondTitle = 0
+ sanjiTitle = 0
for paragraph in document.paragraphs:
# 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法
@@ -89,88 +102,360 @@ def getDocxToTitleName(docxPath):
if text.strip():#非空判断
level = isTitle(paragraph)
if level=="0":
- words.append(text)
- return words
-
-def checkTitleName(filename):
- prompt = f'''
- \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
- '''
- xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
- yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
- strword = "\n".join(word) + prompt + xushang
- # print(strword)
- messages = [{'role': 'user', 'content': [{'text': strword}]}]
- runList = []
- cishu = 0
- for rsp in bot.run(messages):
+ firstTitle+=1
+ secondTitle = 0
+ if(text.find("附件")>=0):
+ continue
+ titleWords.append("一级标题:".format(firstTitle)+text)
+ elif level=="1":
+ secondTitle+=1
+ sanjiTitle=0
+ # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
+ # titleWords.append("第{}章的二级标题:".format(firstTitle,firstTitle,secondTitle)+text)
+ elif level=="2":
+ sanjiTitle += 1
+ # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
+ # titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text)
+ findTitleName_llm_cfg = {
+ #'model': 'qwen1.5-72b-chat',
+ 'model':"qwen2-72b",
+ 'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
+ # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
+ }
+ findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
+ name='Assistant',
+ # system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题'
+ )
+ prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
+ 类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择
+ 类似设计方案,服务方案,建设方案为次相关,次级选择
+ 类似方案是最后选择
+ 按照这样的顺序选择最合适的
+ 你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答
+ '''
+ # print("\n".join(titleWords)+prompt)
+ messages = [({'role': 'user', 'content': "\n".join(titleWords)+prompt})]
+ runList=[]
+ for rsp in findTitleName_bot.run(messages):
runList.append(rsp)
- # print(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
- print(parsed_data)
- # yield '文档结构检查----启动中'
- # with open("ce模板.txt", "r",encoding='utf-8') as f:
- # gettext = f.readlines()
- # count=0
- # reserr = []
- # try:
- # word = getDocxToTitleName(filename)
- # except Exception as e:
- # print(e)
- # yield "文档无法打开,请检查文档内容"
- # return
- # for text in gettext:
- # count+=1
- # prompt = f'''
- # \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
- # '''
- # xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
- # yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
- # strword = "\n".join(word)+prompt+xushang
- # # print(strword)
- # messages = [{'role': 'user', 'content': [{'text':strword}]}]
- # runList = []
- # cishu = 0
- # for rsp in bot.run(messages):
- # runList.append(rsp)
- # # print(rsp)
- # data = runList[len(runList) - 1][0]["content"]
- # parsed_data = json_repair.loads(data.replace('`', ''))
- # print(parsed_data)
- # if(parsed_data["answer"]=="不存在"):
- # reserr.append(text)
-    # resInfo="文档结构存在异常:<br>"
- # if(len(reserr)>0):
- # for i in reserr:
-    #         resInfo+=f"**{i}**<br>"
- # yield resInfo
- # else:
- # yield "文档结构未发现异常"
+ if(parsed_data["answer"]=="存在"):
+ yield parsed_data["name"]
+ else:
+ yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
+def merge_chapters(words):
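+    # Group body lines by the heading text before the first ":" and join each group's content into a single line.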
+ merged_text = {}
+ for line in words:
+ if ":" in line:
+ key, value = line.split(":", 1) # 根据第一个冒号分割
+ if key in merged_text:
+ merged_text[key].append(value.strip()) # 添加到列表
+ else:
+ merged_text[key] = [value.strip()] # 初始化列表
+ else:
+ logging.warning(f"Skipping line without key-value pair: {line}")
-import logging
+ # 合并结果格式化为列表输出
+ merged_words = []
+ for key, values in merged_text.items():
+ combined_value = ",".join(values) # 将内容合并
+ merged_words.append(f"{key}:{combined_value}")
+ return merged_words
+#获取文档中 详细设计方案 章节的所有内容
+def getDocxToText(docxPath, titleName, vector_store_path):
+ loopCount = 0
+ while True:
+ loopCount += 1
+ if loopCount >= 15:
+ raise Exception("文档读取超时,或文档存在问题无法读取")
+ break
+ try:
+ document = Document(docxPath)
+ break
+ except Exception as e:
+ time.sleep(1)
+ pass
+
+ # 逐段读取docx文档的内容
+ levelList = []
+ words = []
+ addStart = False
+ title_counter = [] # 用于存储当前标题的计数
+ title_texts = [] # 用于存储当前各级标题的文本
+ i = 0
+
+ for paragraph in document.paragraphs:
+ text = paragraph.text.strip()
+ if text: # 非空判断
+ level = isTitle(paragraph) # 确保这个函数在代码中定义
+
+ # 当前标题的层级
+ current_level = int(level) if level is not None else -1
+
+ if current_level >= 0: # 标题段落
+ # 确保标题计数器足够长
+ while len(title_counter) <= current_level:
+ title_counter.append(0) # 初始化新级别的标题计数
+ title_texts.append('') # 初始化对应的标题文本
+
+ # 更新当前级别及以下的标题计数和标题文本
+ title_counter[current_level] += 1 # 当前级别计数加1
+ title_counter = title_counter[:current_level+1]
+ title_texts[current_level] = text # 保存当前级别的标题文本
+ title_texts = title_texts[:current_level+1]
+
+ # 重置更低级别的计数和标题文本
+ for idx in range(current_level + 1, len(title_counter)):
+ title_counter[idx] = 0
+ title_texts[idx] = ''
+
+ # 检查是否与 titleName 匹配
+ if current_level == 0:
+ addStart = titleName in text # 检查是否与 titleName 匹配
+
+ else: # 非标题段落
+ if addStart:
+ if len(text) > 30: # 仅记录长度大于30的内容
+ i += 1
+ # 获取当前完整的标题编号和标题名称
+ levelText = ".".join(map(str, title_counter))
+ # 使用非空的标题名称
+ current_title = title_texts[-1] if title_texts else ''
+ words.append(f"{levelText}-{current_title}:{text}")
+
+ if len(words) == 0:
+ raise Exception("checkRepeatText,获取长度为0")
+
+ # 使用封装的合并函数
+ merged_words = merge_chapters(words)
+
+ # 将合并后的内容写入 txt 文件
+ with open("checkRepeatText.txt", 'w') as txt_file:
+ for line in merged_words:
+ txt_file.write(f"{line}\n")
+
+ time.sleep(3)
+
+ # 加载文本
+ loader = TextLoader(file_path='checkRepeatText.txt')
+ docs = loader.load()
+
+ # 创建唯一标识符
+ uuids = []
+ for _ in range(len(merged_words)):
+ uuids.append(str(uuid.uuid4()))
+ logging.info(f"checkRepeatTextuuidLen{len(uuids)}")
+
+ return merged_words, uuids
+
+
+# @app.route('/checkRepeatText/', methods=['GET'])
+def checkRepeatText(filename):
+ yield "文档相似性检查---启动中...."
+ vector_store_path="vector_store"+str(uuid.uuid4())
+ for titleName in findTitleName(filename):
+ yield titleName
+ if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"):
+ yield "文档相似性检查----文档内容解析中"
+ words,uuids=getDocxToText(filename,titleName,vector_store_path)
+ # 记录程序开始的时间戳‘
+ reslist = []
+ count = 0
+ standard = {
+ "清晰性": """对软件功能描述的完整性主要体现在以下两个方面:
+ a. 功能描述是否简洁明了,避免使用过于复杂或专业的术语,使得用户能够轻松理解。
+ b. 是否明确指出了功能的具体作用,没有模糊不清或含糊其辞的表述。
+ 如果要将软件功能描述的清晰性划分为优秀、良好、一般、差四个从高到低的等级,每个等级的评判标准是什么?
+ 将软件功能描述的清晰性划分为优秀、良好、一般、差四个等级时,每个等级的评判标准可以如下定义:
+ 优秀(90~100分)
+ 简洁明了:功能描述极其精炼,没有多余的词汇,每个字都承载着必要的信息。
+ 通俗易懂:完全避免了专业术语或行业黑话,即使是非专业用户也能轻松理解。
+ 具体明确:功能的作用、范围、限制以及用户期望的结果都被清晰、准确地阐述,没有任何模糊或含糊的表述。
+ 良好(70分~90分,不包含90分)
+ 较为简洁:功能描述相对简短,但可能包含一些必要的细节或背景信息。
+ 易于理解:大部分术语都是通俗易懂的,对于少数专业术语,提供了简短的解释或上下文。
+ 明确具体:功能的主要作用、范围和用户期望的结果都被明确阐述,但可能在某些细节上稍显模糊。
+ 一般(60~70分,不包含70分)
+ 稍显冗长:功能描述可能包含一些不必要的细节或重复信息,导致用户需要花费更多时间来理解。
+ 有一定难度:使用了一些专业术语或行业黑话,但没有提供足够的解释或上下文,导致非专业用户可能难以理解。
+ 基本明确:功能的主要作用被阐述,但在范围、限制或用户期望的结果上可能存在一些模糊或含糊的表述。
+ 差(60分以下,不包含60分)
+ 冗长复杂:功能描述过于详细和复杂,包含大量不必要的细节和背景信息,导致用户难以抓住重点。
+ 难以理解:大量使用专业术语或行业黑话,且没有提供任何解释或上下文,使得大部分用户都难以理解。
+ 模糊不清:功能的作用、范围、限制以及用户期望的结果都没有被明确阐述,存在大量的模糊和含糊表述。
+ 评估的提示词举例:
+ 根据这些评判标准,对下面的软件功能描述的清晰性进行客观的评价,给出优秀、良好、一般、差四个等级之一的评价,并给出具体得分。并在此基础上润色和完善,使之达到优秀的等级。
+ """,
+ "完整性": """对软件功能描述的完整性主要体现在以下两个方面:
+ a. 是否涵盖了功能的所有重要方面,包括输入、输出、处理过程等。
+ b. 是否提供了足够的信息,以便用户能够全面了解功能的工作原理和用途。
+ 如果要将软件功能描述的完整性划分为优秀、良好、一般、差四个从高到低的等级,每个等级的评判标准是什么?
+ 将软件功能描述的完整性划分为优秀、良好、一般、差四个等级时,每个等级的评判标准可以如下定义:
+ 优秀:(90~100分)
+ 描述全面涵盖了功能的所有重要方面,包括但不限于输入、输出、处理过程、异常处理等。
+ 提供了详尽的信息,用户能够清晰地了解功能的工作原理、用途以及在不同场景下的表现。
+ 包含了必要的示例、图表或流程图,以直观展示功能的工作流程和效果。
+ 没有遗漏任何对用户理解和使用功能至关重要的信息。
+ 良好:(70分~90分,不包含90分)
+ 描述基本涵盖了功能的主要方面,但可能有个别不太重要的细节未提及。
+ 提供了足够的信息,用户能够较好地理解功能的工作原理和用途,但在某些复杂场景下可能需要额外说明。
+ 可能包含一些示例或图表,但可能不如优秀等级那么全面或详细。
+ 一般:(60~70分,不包含70分)
+ 描述涵盖了功能的一部分重要方面,但存在较明显的遗漏或不足。
+ 提供的信息有限,用户可能只能对功能有一个大致的了解,无法深入了解其工作原理和详细用途。
+ 可能缺乏示例、图表或流程图等辅助材料,导致用户难以理解功能的某些复杂部分。
+ 差:(60分以下,不包含60分)
+ 描述严重缺失,未涵盖功能的关键方面,甚至可能误导用户。
+ 提供的信息极少,用户无法全面了解功能的工作原理和用途。
+ 可能存在错误或矛盾的信息,导致用户无法准确理解功能。
+ 根据这些评判标准,对下面的软件功能描述的完整性进行客观的评价,给出优秀、良好、一般、差四个等级之一的评价。并在此基础上润色和完善,使之达到优秀的等级。
+ """,
+ "可测试性": """软件功能描述的可测试性主要体现为以下方面:
+ a. 功能描述是否具体、明确,以便能够进行功能测试和验证。
+ b. 是否提供了足够的细节,以便开发人员和测试人员能够准确理解和实现功能。
+ 如果要将软件功能描述的可测试性划分为优秀、良好、一般、差四个从高到低的等级,每个等级的评判标准是什么?
+ 将软件功能描述的可测试性划分为优秀、良好、一般、差四个等级时,每个等级的评判标准可以如下定义:
+ 优秀:(90~100分)
+ 功能描述非常具体和明确,能够直接转化为测试用例。
+ 提供了详尽的细节,包括输入、输出、边界条件、异常处理等。
+ 开发人员和测试人员能够轻松理解和实现功能,无需额外澄清或假设。
+ 功能描述中包含了预期的行为和非预期的行为,有助于全面覆盖测试场景。
+ 良好:(70分~90分,不包含90分)
+ 功能描述相对具体和明确,大部分内容可以直接用于测试。
+ 提供了足够的细节,但可能需要一些额外的解释或澄清才能完全理解。
+ 开发人员和测试人员能够基于描述实现和测试功能,但可能需要一些额外的沟通和协调。
+ 功能描述中基本涵盖了主要的行为和边界条件,但可能缺少对某些异常情况的详细描述。
+ 一般:(60~70分,不包含70分)
+ 功能描述较为笼统,需要较多的解释和澄清才能用于测试和开发。
+ 细节不够充分,可能导致开发人员和测试人员在实现和测试过程中产生误解或遗漏。
+ 需要较多的沟通和协调来确保功能的正确实现和测试。
+ 功能描述中可能只涵盖了主要的行为,对边界条件和异常情况的描述较为模糊或缺失。
+ 差:(60分以下,不包含60分)
+ 功能描述非常模糊和笼统,无法直接用于测试和开发。
+ 缺乏必要的细节,导致开发人员和测试人员无法准确理解和实现功能。
+ 需要大量的沟通和协调,甚至可能需要重新编写功能描述才能进行有效的测试和开发。
+ 功能描述中可能只提到了大致的目标或意图,没有具体的行为描述、边界条件或异常处理。
+ 根据这些评判标准,对下面的软件功能描述的可测试性进行客观的评价,给出优秀、良好、一般、差四个等级之一的评价。并在此基础上润色和完善,使之达到优秀的等级。
+ """,
+ "详细性": """软件功能详细性主要体现在:
+ a. 功能描述是否详细,可以根据功能描述进行功能点评价,计算出ILF、EIF、EI、EO、EQ的数量;
+ 如果要将软件功能描述的详细性划分为优秀、良好、一般、差四个从高到低的等级,每个等级的评判标准是什么?
+ 将软件功能描述的详细性划分为优秀、良好、一般、差四个等级时,每个等级的评判标准可以如下定义:
+ 优秀:(90~100分)
+ 功能描述非常详尽,包含了所有必要的信息,使得评估者能够轻松地根据描述进行功能点评价。
+ ILF、EIF、EI、EO、EQ的数量可以明确且无误地计算出来,没有遗漏或模糊之处。
+ 描述中不仅包含了功能的正常操作,还涵盖了异常处理、边界条件等特殊情况。
+ 使用了具体的例子、流程图或伪代码来进一步阐明功能。
+ 良好:(70分~90分,不包含90分)
+ 功能描述相对详细,提供了足够的信息来进行功能点评价。
+ ILF、EIF、EI、EO、EQ的数量可以大致计算出来,但可能需要一些额外的解释或澄清。
+ 描述中基本涵盖了功能的各个方面,但对某些细节或特殊情况可能描述不够充分。
+ 整体而言,描述是清晰和准确的,但还有改进的空间。
+ 一般:(60~70分,不包含70分)
+ 功能描述较为笼统,缺乏具体的细节。
+ ILF、EIF、EI、EO、EQ的数量计算可能存在一定的困难或不确定性,需要较多的假设或推测。
+ 描述中只涵盖了功能的主要方面,对细节和特殊情况的处理描述不足。
+ 可能需要额外的沟通或澄清才能准确理解功能需求。
+ 差:(60分以下,不包含60分)
+ 功能描述非常模糊,缺乏必要的信息和细节。
+ 无法根据描述进行准确的功能点评价,ILF、EIF、EI、EO、EQ的数量无法确定。
+ 描述中可能只提到了功能的大致目标或意图,没有具体的实现细节或操作步骤。
+ 需要大量的额外信息或澄清才能理解功能需求,甚至可能需要重新编写功能描述。
+ 根据这些评判标准,对下面的软件功能描述的详细性进行客观的评价,给出优秀、良好、一般、差四个等级之一的评价。并在此基础上润色和完善,使之达到优秀的等级。
+ """,
+ }
+ weight = {
+ "清晰性" : 0.4,
+ "完整性" : 0.3,
+ "可测试性" : 0.2,
+ "详细性" : 0.1,
+
+ }
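+    # The four weights sum to 1.0; weighted_score below is the weighted sum of the per-criterion scores.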
+
+ findTitleName_llm_cfg = {
+ 'model': "qwen2-72b",
+ 'model_server': 'http://127.0.0.1:1025/v1',
+ }
+ findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, name='Assistant')
+ for i in words:
+ count += 1
+ yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}"
+ chapter, rest = i.split('-', 1)
+ title, text = rest.split(':', 1)
+
+ # 生成字典
+ example = {
+ "chapter": chapter.strip(),
+ "title": title.strip(),
+ "text": text.strip()
+ }
+ result = {
+ "title": title.strip(),
+ "text": text.strip()
+ }
+ # 循环提取键和值
+ weighted_score = 0
+ for key, value in standard.items():
+ prompt_score = f"""对软件功能{key}的定义:
+ {value}
+ 模块名称:【{example['title']}】
+ 模块描述:【{example['text']}】
+ 回答格式为:{{"模块名称":"{example['text']}",
+ "等级":"优秀/良好/一般/差",
+ "得分":"0~100",
+ "理由及扣分原因":"理由及扣分原因",
+ }},不做过多的解释,严格按回答格式作答,只给出一个回答。
+ """
+
+ messages = [({'role': 'user', 'content': prompt_score})]
+ runList = []
+ for rsp in findTitleName_bot.run(messages):
+ runList.append(rsp)
+ data = runList[len(runList) - 1][0]["content"]
+ parsed_data = json_repair.loads(data.replace('`', ''))
+ if isinstance(parsed_data, list): # 检查parsed_data是否为列表
+ parsed_data = parsed_data[0] # 取第一个元素
+ else:
+ parsed_data = parsed_data
+ result[f"{key}等级"] = parsed_data['等级']
+ result[f"{key}得分"] = parsed_data['得分']
+ score = int(parsed_data['得分']) # 假设 '得分' 是字符串,需要转换为整数
+ key_weight = weight.get(key, 0) # 根据键获取权重,如果没有匹配的权重,默认为 0
+ # 计算加权得分并累加
+ weighted_score += score * key_weight
+ result["加权得分"] = round(weighted_score, 2) # 保留两位小数
+ answer = f"{example['text']}"
+ for key, value in standard.items():
+ prompt_answer = f"""对软件功能{key}的定义:\n
+ {value}\n
+ 模块名称:【{example['title']}】\n
+            模块描述:【{answer}】\n
+ 回答格式为:{{"模块名称":"{example['text']}",
+ "改进后的描述":"改进后的描述",
+ }},不做过多的解释,严格按回答格式作答。
+ """
+ messages = [({'role': 'user', 'content': prompt_answer})]
+ runList = []
+ for rsp in findTitleName_bot.run(messages):
+ runList.append(rsp)
+ data = runList[len(runList) - 1][0]["content"]
+ parsed_data = json_repair.loads(data.replace('`', ''))
+ answer = parsed_data['改进后的描述']
+ result["改进后的描述"] = answer
+ textTag = i.split(":")[0]
+ breakpoint()
+ # vectorstore.delete(ids=uuids)
+ shutil.rmtree(vector_store_path)
+ resInfo=f"对{titleName}章节,发现相似内容:
"
+ if(len(reslist)>0):
+ for res in reslist:
+            resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find(':')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find(':') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find(':')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find(':') + 1:]+"<br>以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】<br>"
+ yield resInfo
+ else:
+ yield "**未发现相似内容**"
+ userLog.info("文档相似性检查----未发现相似内容**")
-# 创建一个记录器
-logger = logging.getLogger('my_logger')
-logger.setLevel(logging.DEBUG)
-
-# 创建一个处理器
-ch = logging.StreamHandler()
-ch.setLevel(logging.DEBUG)
-
-# 创建一个格式化器并将其添加到处理器中
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-ch.setFormatter(formatter)
-
-# 将处理器添加到记录器中
-logger.addHandler(ch)
-try:
-# 记录一些日志消息
- logger.debug('这是一个调试消息')
- logger.info('这是一个信息消息')
- logger.warning('这是一个警告消息')
- logger.error('这是一个错误消息')
- logger.critical('这是一个致命错误消息')
-except Exception as e:
- logger.warning(e)
\ No newline at end of file
+for i in checkRepeatText("./北仑区综合行政执法局协同监管系统项目建设方案_20240824.docx"):
+ print(i)
diff --git a/main.py b/main.py
index 8e89845..9a11197 100644
--- a/main.py
+++ b/main.py
@@ -1,206 +1,286 @@
-from flask import Flask, request, jsonify, Response
+# from flask import Flask, request, jsonify, Response
import os
from checkPlaceName import checkPlaceName
from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName
from checkDocumentError import checkDocumentError
from checkTitleName import checkTitleName
-from flask_cors import CORS
+# from flask_cors import CORS
import qwen_agenttext
from myLogger import outLog
import time
-app = Flask(__name__)
-cros = CORS(app)
+# app = Flask(__name__)
+# cros = CORS(app)
+import uvicorn
+from fastapi import FastAPI, Request, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from sse_starlette.sse import EventSourceResponse
+import asyncio
+
+app = FastAPI()
+# 允许所有来源的跨域请求
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"]
+)
+
UPLOAD_FOLDER = 'uploads'
if not os.path.exists(UPLOAD_FOLDER):
os.makedirs(UPLOAD_FOLDER)
-@app.route('/upload', methods=['POST'])
-def upload_file():
- if 'file' not in request.files:
- return jsonify({"error": "No file part"}), 400
- file = request.files['file']
- if file.filename == '':
- return jsonify({"error": "No selected file"}), 400
- if file:
- filename = file.filename
- file.save(os.path.join(UPLOAD_FOLDER, filename))
- return jsonify({"message": "File uploaded successfully"}), 200
-
-
-@app.route('/stream', methods=["GET", "POST"])
-def stream_numbers():
- context = request.args.get('context')
- # def generate_numbers():
- # event_id=0
- # for number in range(1, 10):
- # json_data = json.dumps({"number": number})
- # print(json_data)
- # event_id += 1
- # yield f"id: {event_id}\n"
- # yield f"event: time-update\n"
- # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送
- # time.sleep(0.5) # 为了演示,加入短暂延迟
- # json_data = json.dumps({"number": "done"})
- # yield f"id: {1}\n"
- # yield f"event: time-update\n"
- # yield f"data: {json_data}\n\n" # 发送完成信号
-
- headers = {
- "Content-Type": "text/event-stream",
- "Cache-Control": "no-cache",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "GET,POST",
- "Access-Control-Allow-Headers": "x-requested-with,content-type",
- }
- return Response(qwen_agenttext.getxinx(context), headers=headers)
-
-
-@app.route('/sse/checkRepeatText', methods=['GET'])
-def checkRepeatTextWeb():
- filename = request.args.get('filename')
- userId = request.args.get("userId")
-
- def generate_checkRepeatText(filename,userId):
+# @app.route('/upload', methods=['POST'])
+# def upload_file():
+# if 'file' not in request.files:
+# return jsonify({"error": "No file part"}), 400
+# file = request.files['file']
+# if file.filename == '':
+# return jsonify({"error": "No selected file"}), 400
+# if file:
+# filename = file.filename
+# file.save(os.path.join(UPLOAD_FOLDER, filename))
+# return jsonify({"message": "File uploaded successfully"}), 200
+@app.post("/sse/upload")
+async def upload_file(file: UploadFile = File(...)):
+ if not file.filename:
+ raise HTTPException(status_code=400, detail="No selected file")
+
+ # 保存文件
+ try:
+ file_location = os.path.join(UPLOAD_FOLDER, file.filename)
+ with open(file_location, "wb") as f:
+ content = await file.read()
+ f.write(content)
+ return JSONResponse(content={"message": "文件上传成功"}, status_code=200)
+ except Exception as e:
+ raise HTTPException(status_code=500, detail="文件上传失败,错误信息:" + str(e))
+
+
+@app.get("/sse")
+async def root(request: Request):
+ async def event_generator(request: Request):
+ res_str = "七夕情人节即将来临,我们为您准备了精美的鲜花和美味的蛋糕"
+ for i in res_str:
+ if await request.is_disconnected():
+ print("连接已中断")
+ break
+ yield {
+ "event": "message",
+ "id": "7",
+ "data": f"{i}"
+ }
+
+ await asyncio.sleep(0.1)
+
+ g = event_generator(request)
+ return EventSourceResponse(g)
+
+
+# def stream_numbers():
+# context = request.args.get('context')
+# # def generate_numbers():
+# # event_id=0
+# # for number in range(1, 10):
+# # json_data = json.dumps({"number": number})
+# # print(json_data)
+# # event_id += 1
+# # yield f"id: {event_id}\n"
+# # yield f"event: time-update\n"
+# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送
+# # time.sleep(0.5) # 为了演示,加入短暂延迟
+# # json_data = json.dumps({"number": "done"})
+# # yield f"id: {1}\n"
+# # yield f"event: time-update\n"
+# # yield f"data: {json_data}\n\n" # 发送完成信号
+
+# headers = {
+# "Content-Type": "text/event-stream",
+# "Cache-Control": "no-cache",
+# "X-Accel-Buffering": "no",
+# "Access-Control-Allow-Origin": "*",
+# "Access-Control-Allow-Methods": "GET,POST",
+# "Access-Control-Allow-Headers": "x-requested-with,content-type",
+# }
+# return Response(qwen_agenttext.getxinx(context), headers=headers)
+
+@app.get("/sse/checkRepeatText")
+async def checkRepeatTextWeb(filename, userId, request: Request):
+ async def generate_checkRepeatText(filename, userId, request: Request):
+ global outLog
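+        # Relay each message from the synchronous checkRepeatText generator as an SSE event, stopping if the client disconnects.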
id = 0
- for i in checkRepeatText(filename,userId):
- yield f"id: {id + 1}\n"
- yield f"event: checkRepeatText\n"
- yield f"data: {i}\n\n" # 发送完成信号
- # except Exception as e:
-
- # yield f"id: {id+1}\n"
- # yield f"event: checkRepeatText\n"
- # yield f"data: **程序出现异常**\n\n" # 发送完成信号
-
- headers = {
- "Content-Type": "text/event-stream",
- "Cache-Control": "no-cache",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "GET,POST",
- "Access-Control-Allow-Headers": "x-requested-with,content-type",
- }
- return Response(generate_checkRepeatText(filename,userId), headers=headers)
-
-
-@app.route('/sse/checkPlaceName', methods=['GET'])
-def checkPlaceNameWebSse():
- filename = request.args.get('filename')
- userId = request.args.get("userId")
- def generate_checkPlaceName(filename,userId):
+ for i in checkRepeatText(filename, userId, outLog):
+ id += 1
+ if await request.is_disconnected():
+ yield {
+ "id": f"{id}",
+ "event": "checkRepeatText",
+ "data": "checkRepeatText连接已中断"
+ }
+ break
+ yield {
+ "id": f"{id}",
+ "event": "checkRepeatText",
+ "data": i
+ }
+
+ g = generate_checkRepeatText(filename, userId, request)
+ return EventSourceResponse(g)
+
+
+@app.get('/sse/checkPlaceName')
+def checkPlaceNameWebSse(filename, userId, request: Request):
+ async def generate_checkPlaceName(filename, userId, request: Request):
id = 0
- for i in checkPlaceName(filename,userId):
- yield f"id: {id + 1}\n"
- yield f"event: checkPlaceName\n"
- yield f"data: {i}\n\n" # 发送完成信号
-
- headers = {
- "Content-Type": "text/event-stream",
- "Cache-Control": "no-cache",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "GET,POST",
- "Access-Control-Allow-Headers": "x-requested-with,content-type",
- }
- return Response(generate_checkPlaceName(filename,userId), headers=headers)
-
-
-@app.route('/sse/checkCompanyName', methods=['GET'])
-def checkCompanyNameWebSse():
- filename = request.args.get('filename')
- userId = request.args.get("userId")
- def generate_checkCompanyName(filename,userId):
+ global outLog
+ for i in checkPlaceName(filename, userId, outLog):
+ id += 1
+ if await request.is_disconnected():
+ yield {
+ "id": f"{id}",
+ "event": "checkPlaceName",
+ "data": "checkPlaceName连接已中断"
+ }
+ break
+ yield {
+ "id": f"{id}",
+ "event": "checkPlaceName",
+ "data": i
+ }
+
+ g = generate_checkPlaceName(filename, userId, request)
+ return EventSourceResponse(g)
+
+
+@app.get('/sse/checkCompanyName')
+def checkCompanyNameWebSse(filename, userId, request: Request):
+ async def generate_checkCompanyName(filename, userId, request: Request):
id = 0
- for i in checkCompanyName(filename,userId):
- yield f"id: {id + 1}\n"
- yield f"event: checkCompanyName\n"
- yield f"data: {i}\n\n" # 发送完成信号
-
- headers = {
- "Content-Type": "text/event-stream",
- "Cache-Control": "no-cache",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "GET,POST",
- "Access-Control-Allow-Headers": "x-requested-with,content-type",
- }
- return Response(generate_checkCompanyName(filename,userId), headers=headers)
-
-
-@app.route('/sse/checkDocumentErrorWeb', methods=['GET'])
-def checkDocumentErrorWebSse():
- filename = request.args.get('filename')
- userId = request.args.get("userId")
- def generate_checkDocumentError(filename,userId):
+ global outLog
+ for i in checkCompanyName(filename, userId, outLog):
+ id += 1
+ if await request.is_disconnected():
+ yield {
+ "id": f"{id}",
+ "event": "checkCompanyName",
+ "data": "checkCompanyName连接已中断"
+ }
+ break
+ yield {
+ "id": f"{id}",
+ "event": "checkCompanyName",
+ "data": i
+ }
+
+ g = generate_checkCompanyName(filename, userId, request)
+ return EventSourceResponse(g)
+
+
+@app.get('/sse/checkDocumentErrorWeb')
+def checkDocumentErrorWebSse(filename, userId, request: Request):
+ async def generate_checkDocumentError(filename, userId, request: Request):
id = 0
- for i in checkDocumentError(filename,userId):
- yield f"id: {id + 1}\n"
- yield f"event: checkDocumentError\n"
- yield f"data: {i}\n\n" # 发送完成信号
-
- headers = {
- "Content-Type": "text/event-stream",
- "Cache-Control": "no-cache",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "GET,POST",
- "Access-Control-Allow-Headers": "x-requested-with,content-type",
- }
- return Response(generate_checkDocumentError(filename,userId), headers=headers)
-
-
-@app.route('/sse/checkTitleName', methods=['GET'])
-def checkTitleNameWebSse():
- filename = request.args.get('filename')
- userId = request.args.get("userId")
- def generate_checkTitleName(filename,userId):
+ global outLog
+ for i in checkDocumentError(filename, userId, outLog):
+ id += 1
+ if await request.is_disconnected():
+ yield {
+ "id": f"{id}",
+ "event": "checkDocumentError",
+ "data": "checkDocumentError连接已中断"
+ }
+ break
+ yield {
+ "id": f"{id}",
+ "event": "checkDocumentError",
+ "data": i
+ }
+
+ g = generate_checkDocumentError(filename, userId, request)
+ return EventSourceResponse(g)
+
+
+@app.get('/sse/checkTitleName')
+def checkTitleNameWebSse(filename, userId, request: Request):
+ async def generate_checkTitleName(filename, userId, request: Request):
id = 0
- for i in checkTitleName(filename,userId):
- yield f"id: {id + 1}\n"
- yield f"event: checkTitleName\n"
- yield f"data: {i}\n\n" # 发送完成信号
-
- headers = {
- "Content-Type": "text/event-stream",
- "Cache-Control": "no-cache",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "GET,POST",
- "Access-Control-Allow-Headers": "x-requested-with,content-type",
- }
- return Response(generate_checkTitleName(filename,userId), headers=headers)
-
-@app.route('/sse/getLog', methods=['GET'])
-def getlog():
- userId = request.args.get("userId")
- def generate_getLog(userId):
- time.sleep(1)
+ global outLog
+ for i in checkTitleName(filename, userId, outLog):
+ id += 1
+ if await request.is_disconnected():
+ yield {
+ "id": f"{id}",
+ "event": "checkTitleName",
+ "data": "checkTitleName连接已中断"
+ }
+ break
+ yield {
+ "id": f"{id}",
+ "event": "checkTitleName",
+ "data": i
+ }
+
+ g = generate_checkTitleName(filename, userId, request)
+ return EventSourceResponse(g)
+
+
+@app.get("/sse/getLog")
+# @app.route('/sse/getLog', methods=['GET'])
+async def getlog(userId, request: Request):
+ # userId = request.args.get("userId")
+ async def generate_getLog(userId):
id = 0
+ global outLog
+ await asyncio.sleep(5)
while True:
- if outLog.is_done(userId):
+            isbreak = outLog.is_done(userId)
+            if isbreak:
+                break  # every producer has marked this user's task as done
+            text = outLog.get_queueData(userId)
+            if await request.is_disconnected():
+                yield {
+                    "id": f"{id}",
+                    "event": "getlog",
+                    "data": "getlog连接已中断"
+                }
break
- q = outLog.get_queueData(userId)
- if q:
- id+=1
- text = q.pop(0)
- yield f"id: {id}\n"
- yield f"event: getlog\n"
- yield f"data: {text}\n\n" # 发送完成信号
- yield f"id: {id}\n"
- yield f"event: getlog\n"
- yield f"data: 任务结束!!!!!\n\n" # 发送完成信号
+            if text:
+                id += 1
+                yield {
+                    "id": f"{id}",
+                    "event": "getlog",
+                    "data": text
+                }
+            else:
+                # nothing new queued yet; back off briefly instead of busy-polling Redis
+                await asyncio.sleep(0.5)
+ # yield f"id: {id}\n"
+ # yield f"event: getlog\n"
+ # yield f"data: {text}\n\n" # 发送完成信号
+ # yield f"id: {id}\n"
+ # yield f"event: getlog\n"
+ # yield f"data: 任务结束!!!!!\n\n" # 发送完成信号
+        # emit the completion signal once, after the polling loop has exited
+        yield {
+            "id": f"{id}",
+            "event": "getlog",
+            "data": "任务结束!!!!"
+        }
outLog.del_queue(userId)
- headers = {
- "Content-Type": "text/event-stream",
- "Cache-Control": "no-cache",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "GET,POST",
- "Access-Control-Allow-Headers": "x-requested-with,content-type",
- }
- return Response(generate_getLog(userId), headers=headers)
+
+ # headers = {
+ # "Content-Type": "text/event-stream",
+ # "Cache-Control": "no-cache",
+ # "X-Accel-Buffering": "no",
+ # "Access-Control-Allow-Origin": "*",
+ # "Access-Control-Allow-Methods": "GET,POST",
+ # "Access-Control-Allow-Headers": "x-requested-with,content-type",
+ # }
+ g = generate_getLog(userId)
+ return EventSourceResponse(g)
+ # return Response(generate_getLog(userId), headers=headers)
+
+
if __name__ == '__main__':
- app.run(host="0.0.0.0", port=80)
+    # app.run(host="0.0.0.0", port=80, threaded=True)   # old Flask entry point
+    # uvicorn.run(app='main:app', host="0.0.0.0", port=80,workers=1)
+    # FastAPI apps have no .run(); start the ASGI server directly (assumes uvicorn is imported at the top of this module)
+    uvicorn.run(app, host="0.0.0.0", port=80)
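
Note on consuming the migrated endpoints: sse-starlette's EventSourceResponse serialises every yielded dict into standard "id:", "event:" and "data:" fields, so any plain SSE client can read the stream. The consumer below is not part of the patch; it is a minimal smoke-test sketch that assumes the service listens on http://localhost:80 and that the requests package is installed.

# Minimal SSE consumer sketch for /sse/getLog (assumed host/port, not part of the patch).
import requests

def follow_log(user_id: str, base_url: str = "http://localhost:80"):
    """Yield parsed SSE events as dicts: {'id': ..., 'event': ..., 'data': ...}."""
    with requests.get(f"{base_url}/sse/getLog", params={"userId": user_id}, stream=True) as resp:
        resp.raise_for_status()
        event = {}
        for raw in resp.iter_lines(decode_unicode=True):
            if raw and raw.startswith(":"):
                continue                              # skip SSE comments / keep-alive pings
            if raw:
                field, _, value = raw.partition(":")  # one "field: value" pair per line
                event[field.strip()] = value.strip()
            elif event:
                yield event                           # a blank line terminates one event
                event = {}

if __name__ == "__main__":
    for evt in follow_log("demo-user"):
        print(evt.get("event"), evt.get("data"))
        if evt.get("data") == "任务结束!!!!":
            break
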
diff --git a/myLogger.py b/myLogger.py
index 6ea3059..7244d53 100644
--- a/myLogger.py
+++ b/myLogger.py
@@ -1,117 +1,8 @@
# -*- coding: utf-8 -*-
-"""
-@author: bingyl123@163.com
-@version: 1.0.0
-@file: OutLog.py
-@time: 2023/2/23 20:25
-"""
-# import logging
-# import logging.config
-# import re
-# import datetime
-# import queue
-#
-#
-# class OutLog:
-# _instance = None
-# logger = None
-#
-# def __new__(cls):
-# if cls._instance is None:
-# cls._instance = super(OutLog, cls).__new__(cls)
-# cls.logger = logging.getLogger("app") # 默认logger名称为"app"
-# cls._instance.queue_dict = {}
-# cls._instance.done_dict = {}
-# return cls._instance
-#
-# def get_queue(self, user_id):
-# if user_id not in self.queue_dict:
-# self.queue_dict[user_id] = []
-# self.done_dict[user_id] = {} # 初始化为未完成的字典
-# return self.queue_dict[user_id]
-#
-# def mark_done(self, user_id, producer_name):
-# self.done_dict[user_id][producer_name] = True
-#
-# def is_done(self, user_id):
-# return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成
-# @staticmethod
-# def put(item: str, level="INFO"):
-# dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-# mq.put(f"{dtf}[{level}]: {item}")
-#
-# @staticmethod
-# def debug(item, log=True):
-# OutLog.put(item, level="DEBUG")
-# if log:
-# OutLog._instance.logger.debug(item)
-#
-# @staticmethod
-# def info(item, log=True):
-# OutLog.put(item, level="INFO")
-# if log:
-# OutLog._instance.logger.info(item)
-#
-# @staticmethod
-# def warning(item, log=True):
-# OutLog.put(item, level="WARNING")
-# if log:
-# OutLog._instance.logger.warning(item)
-#
-# @staticmethod
-# def error(item, log=True):
-# OutLog.put(item, level="ERROR")
-# if log:
-# OutLog._instance.logger.error(item)
-#
-# @staticmethod
-# def critical(item, log=True):
-# OutLog.put(item, level="CRITICAL")
-# if log:
-# OutLog._instance.logger.critical(item)
-#
-#
-#
-# # 日志配置
-# log_config = {
-# 'version': 1,
-# 'disable_existing_loggers': False,
-# 'formatters': {
-# 'standard': {
-# 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-# },
-# },
-# 'handlers': {
-# 'console': {
-# 'class': 'logging.StreamHandler',
-# 'formatter': 'standard',
-# 'level': logging.INFO,
-# },
-# 'file': {
-# 'class': 'logging.FileHandler',
-# 'filename': 'Logger.log',
-# 'formatter': 'standard',
-# 'level': logging.WARNING,
-# },
-# },
-# 'loggers': {
-# '': {
-# 'handlers': ['console', 'file'],
-# 'level': logging.WARNING,
-# 'propagate': True,
-# },
-# }
-# }
-#
-# logging.config.dictConfig(log_config)
-#
-# outLog = OutLog() # 获取单例实例
-
-
-
import logging
import logging.config
import datetime
+import redis
class OutLog:
_instance = None
@@ -121,35 +12,49 @@ class OutLog:
if cls._instance is None:
cls._instance = super(OutLog, cls).__new__(cls)
cls.logger = logging.getLogger("app") # 默认logger名称为"app"
- cls._instance.queue_dict = {}
- cls._instance.done_dict = {}
+ # cls._instance.queue_dict = {}
+ # cls._instance.done_dict = {}
+            # initialise the shared Redis connection used for the per-user log queues
+            cls._instance.redis_client = redis.StrictRedis(host='localhost', port=6379, password="root", db=0, decode_responses=True)
return cls._instance
- def get_queue(self, user_id,producer_name):
- if user_id not in self.queue_dict:
- self.queue_dict[user_id] = []
- self.done_dict[user_id] = {} # 初始化为未完成的字典
- if user_id not in self.done_dict:
- self.done_dict[user_id][producer_name] = False
+    def get_queue(self, user_id, producer_name):
+ # if user_id not in self.queue_dict:
+ # self.queue_dict[user_id] = []
+ # self.done_dict[user_id]={}
+ # self.done_dict[user_id][producer_name] = False # 初始化为未完成的字典
+        # store and look up the per-user queue and completion flags in Redis
+        if not self.redis_client.exists(f"queue:{user_id}"):
+            # self.redis_client.rpush(f"queue:{user_id}")
+            self.logger.info(f"queue:{user_id}")
+        self.redis_client.hset(f"done:{user_id}", producer_name, "0")  # register this producer as not yet done
return self.UserLogger(user_id)
def get_queueData(self, user_id):
- if user_id in self.queue_dict:
- return OutLog._instance.queue_dict[self.user_id]
+ # if user_id in self.queue_dict:
+ # return self.queue_dict[user_id]
+ if self.redis_client.exists(f"queue:{user_id}"):
+            return self.redis_client.lpop(f"queue:{user_id}")  # pop and return the oldest queued entry
def del_queue(self,user_id):
+ # if self.is_done(user_id):
+ # del self.queue_dict[user_id]
+ # del self.done_dict[user_id]
if self.is_done(user_id):
- del self.queue_dict[user_id]
- del self.done_dict[user_id]
+ self.redis_client.delete(f"queue:{user_id}")
+ self.redis_client.delete(f"done:{user_id}")
class UserLogger:
def __init__(self, user_id):
self.user_id = user_id
self.logger = OutLog._instance.logger
def log(self, item: str, level: str):
+            self._log_to_logger(item, level)
+            if level != "INFO":
+                return  # only INFO-level entries are pushed to the user-facing queue
dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_entry = f"{dtf}[{level}]: {item}"
- OutLog._instance.queue_dict[self.user_id].append(log_entry) # 保存到对应用户的队列
- self._log_to_logger(item, level)
-
+ # print(log_entry)
+ # OutLog._instance.queue_dict[self.user_id].append(log_entry) # 保存到对应用户的队列
+            OutLog._instance.redis_client.rpush(f"queue:{self.user_id}", log_entry)  # append to this user's log queue
def _log_to_logger(self, item: str, level: str):
if level == "DEBUG":
self.logger.debug(item)
@@ -177,11 +82,17 @@ class OutLog:
def critical(self, item: str):
self.log(item, "CRITICAL")
+ # def mark_done(self, user_id, producer_name):
+ # self.done_dict[user_id][producer_name] = True
+ # def is_done(self, user_id):
+ # # print(self.done_dict.get(user_id, {}),self.done_dict.get(user_id, {}).values())
+ # return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成
def mark_done(self, user_id, producer_name):
- self.done_dict[user_id][producer_name] = True
+ self.redis_client.hset(f"done:{user_id}", producer_name, "1")
def is_done(self, user_id):
- return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成
+ done_dict = self.redis_client.hgetall(f"done:{user_id}")
+        return all(value == "1" for value in done_dict.values()) if done_dict else False  # done only when every registered producer has finished
# 日志配置
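
The rewritten OutLog keeps each user's pending log lines in a Redis list (queue:<userId>) and each producer's completion flag in a Redis hash (done:<userId>). The snippet below is a standalone sketch of that pattern, not part of the patch; it reuses the connection parameters hard-coded above (a local Redis with password "root" is an assumption) and is only meant for verifying the Redis setup in isolation.

# Standalone sketch of the queue:<userId> / done:<userId> pattern used by OutLog (not part of the patch).
import redis

r = redis.StrictRedis(host="localhost", port=6379, password="root", db=0, decode_responses=True)

def produce(user_id: str, producer: str, lines):
    r.hset(f"done:{user_id}", producer, "0")       # register the producer as not done
    for line in lines:
        r.rpush(f"queue:{user_id}", line)          # append log lines in arrival order
    r.hset(f"done:{user_id}", producer, "1")       # mark this producer finished

def consume(user_id: str):
    while True:
        line = r.lpop(f"queue:{user_id}")          # None when the list is empty
        if line is not None:
            yield line
            continue
        flags = r.hgetall(f"done:{user_id}")
        if flags and all(v == "1" for v in flags.values()):
            break                                  # queue drained and every producer done

if __name__ == "__main__":
    produce("demo-user", "checkCompanyName", ["progress 1/2", "progress 2/2"])
    print(list(consume("demo-user")))
    r.delete("queue:demo-user", "done:demo-user")  # clean up, mirroring del_queue()
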
@@ -203,13 +114,13 @@ log_config = {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
- 'level': logging.WARNING,
+ 'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
- 'level': logging.WARNING,
+ 'level': logging.INFO,
'propagate': True,
},
}
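
For completeness, a producer-side sketch of how a check task is expected to obtain its per-user logger and signal completion so that /sse/getLog can drain the queue and end the stream. The task name demoTask and the function below are hypothetical, and a reachable Redis instance (as configured in OutLog) is assumed; this is illustrative only, not part of the patch.

# Hypothetical producer-side usage of OutLog (illustrative only; assumes Redis is reachable).
from myLogger import OutLog

outLog = OutLog()  # OutLog is a singleton; repeated calls return the same instance

def demo_task(user_id: str):
    # register producer "demoTask" for this user and get a per-user logger
    userLog = outLog.get_queue(user_id, "demoTask")
    try:
        userLog.info("demoTask---started")
        # ... the real document checks would run here ...
        userLog.info("demoTask---finished")
    finally:
        # flip the done flag so is_done()/getLog can terminate the SSE stream
        outLog.mark_done(user_id, "demoTask")

if __name__ == "__main__":
    demo_task("demo-user")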