Browse Source

更新文件

master
zhouhaibin 4 months ago
parent
commit
6a406ec64e
  1. 88
      checkCompanyName.py
  2. 86
      checkDocumentError.py
  3. 11
      checkPlaceName.py
  4. 180
      checkRepeatText.py
  5. 83
      checkTitleName.py
  6. 485
      daijian方案.py
  7. 408
      main.py
  8. 169
      myLogger.py

88
checkCompanyName.py

@ -8,9 +8,10 @@ import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml from docx.opc.oxml import parse_xml
import requests import requests
from myLogger import outLog # from myLogger import outLog
import time import time
def load_from_xml_v2(baseURI, rels_item_xml): def load_from_xml_v2(baseURI, rels_item_xml):
""" """
Return |_SerializedRelationships| instance loaded with the Return |_SerializedRelationships| instance loaded with the
@ -31,9 +32,9 @@ _SerializedRelationships.load_from_xml = load_from_xml_v2
import logging import logging
outLog.logger = logging.getLogger("checkCompanyName") # outLog.logger = logging.getLogger("checkCompanyName")
userLog=None userLog = None
prompt =''' prompt = '''
.根据上述文本判断是否为具体的公司或组织名称你可以使用工具利用互联网查询 .根据上述文本判断是否为具体的公司或组织名称你可以使用工具利用互联网查询
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校行业类型其他]选项中选择答案, 你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校行业类型其他]选项中选择答案,
回答格式[{companyName名称,"回答":"答案"}{companyName名称,"回答":"答案"}]不做过多的解释,严格按回答格式作答; 回答格式[{companyName名称,"回答":"答案"}{companyName名称,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
@ -54,8 +55,8 @@ def getDocxToTextAll(name):
docxPath = name docxPath = name
loopCount = 0 loopCount = 0
while True: while True:
loopCount+=1 loopCount += 1
if(loopCount>=15): if (loopCount >= 60):
raise Exception("文档读取超时,或文档存在问题无法读取") raise Exception("文档读取超时,或文档存在问题无法读取")
break break
try: try:
@ -76,17 +77,16 @@ def getDocxToTextAll(name):
words.append(text) words.append(text)
# 将所有段落文本拼接成一个字符串,并用换行符分隔 # 将所有段落文本拼接成一个字符串,并用换行符分隔
text = '\n'.join(words) text = '\n'.join(words)
# userLog.info("checkCompanyName----保存文件")
# 将文本写入txt文件 # 将文本写入txt文件
with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file: with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
txt_file.write(text) txt_file.write(text)
def companyNameTask(text): def companyNameTask(text):
yield "文档公司或组织名称检查---启动中...." yield "文档公司或组织名称检查---文档解析中...."
userLog.info("checkCompanyName----启动中....") userLog.info("文档公司或组织名称检查---任务开始")
batchNum = 20 batchNum = 5
sentences = re.split(r'[。\n]', text) sentences = re.split(r'[、,\n]', text)
# 去掉空字符 # 去掉空字符
sentences = [sentence.strip() for sentence in sentences if sentence.strip()] sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
# 计算总字符数 # 计算总字符数
@ -101,19 +101,19 @@ def companyNameTask(text):
# 打印每一份的内容 # 打印每一份的内容
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}" yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"
userLog.info(f"checkCompanyName----文档解析进度:{i + 1}/{num_chunks}")
try: try:
wenBen = ".".join(chunk) # wenBen = ".".join(chunk)
url = "http://0.0.0.0:8191/taskflow/checkPlaceName" url = "http://0.0.0.0:8191/taskflow/checkPlaceNameServer"
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
data = { data = {
"data": { "data": {
"text": wenBen, "text": chunk,
# "text":wenBen
} }
} }
r = requests.post(url=url, headers=headers, data=json.dumps(data)) r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text) res = json.loads(r.text)
# userLog.info(res) res = res["data"]
# print(res) # print(res)
except Exception as e: except Exception as e:
userLog.warning(chunk) userLog.warning(chunk)
@ -121,44 +121,52 @@ def companyNameTask(text):
userLog.warning(e) userLog.warning(e)
return return
isplace = False isplace = False
for zuhe in res["result"]:
# for zuhe in res:
# # 上一个的地名,这一个还是地名,就和上一个相加代替这个
# if isplace:
# name = placeList[len(placeList) - 1]
# if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns"
# isplace = True
# new_text = zuhe[0].replace("\n", "")
# placeList[len(placeList) - 1] = name + new_text
# continue
# if zuhe[1].find("组织机构类") >= 0:
# isplace = True
# new_text = zuhe[0].replace("\n", "")
# placeList.append(new_text)
# else:
# isplace = False
##案例[[('目前', 'TIME'), ('江北区历史文化档案馆', 'ORG')], [('宁波国研简直,并且在东软', 'ORG'), ('宁波市北仑区教育局', 'ORG'), ('国研信息', 'ORG'), ('浙江省', 'LOC'), ('宁波市金凤区', 'LOC'), ('金凤区', 'LOC')]]
for zuhe in res:
# 上一个的地名,这一个还是地名,就和上一个相加代替这个 # 上一个的地名,这一个还是地名,就和上一个相加代替这个
if isplace: for chid in zuhe:
name = placeList[len(placeList) - 1] if (chid[1] == "ORG"):
if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns" new_text = chid[0].replace("\n", "")
isplace = True
new_text = zuhe[0].replace("\n", "")
placeList[len(placeList) - 1] = name + new_text
continue
if zuhe[1].find("组织机构类") >= 0:
isplace = True
new_text = zuhe[0].replace("\n", "")
placeList.append(new_text) placeList.append(new_text)
else:
isplace = False
# 打印总份数 # 打印总份数
yield "文档公司或组织名称检查---文档解析完成" yield "文档公司或组织名称检查---文档解析完成"
userLog.info("checkCompanyName----文档解析完成")
placeList = list(dict.fromkeys(placeList)) placeList = list(dict.fromkeys(placeList))
userLog.debug(placeList)
yield placeList yield placeList
userLog.info(placeList)
def checkCompanyName(filename,user_id):
def checkCompanyName(filename, user_id, outLog):
yield f"文档公司或组织名称检查---开始处理文档..." yield f"文档公司或组织名称检查---开始处理文档..."
global userLog global userLog
userLog=outLog.get_queue(user_id, "checkCompanyName") userLog = outLog.get_queue(user_id, "checkCompanyName")
try: try:
getDocxToTextAll(filename) getDocxToTextAll(filename)
except Exception as e: except Exception as e:
userLog.warning(e) userLog.warning(e)
userLog.warning("文档公司或组织名称检查---文档无法打开,请检查文档内容") userLog.warning("文档公司或组织名称检查---文档无法打开,请检查文档内容")
yield "文档公司或组织名称检查---文档无法打开,请检查文档内容" yield "文档公司或组织名称检查---文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。"
outLog.mark_done(user_id, "checkCompanyName") outLog.mark_done(user_id, "checkCompanyName")
return return
with open("checkCompanyName.txt", "r", encoding='utf-8') as f: with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
gettext = f.read() gettext = f.read()
yield f"文档公司或组织名称检查---开始解析文档..." # 每次生成一个数字就发送 yield f"文档公司或组织名称检查---开始解析文档..." # 每次生成一个数字就发送
userLog.info("checkCompanyName----开始解析文档...") final_list = ""
for item in companyNameTask(gettext): for item in companyNameTask(gettext):
if isinstance(item, str): if isinstance(item, str):
yield item yield item
@ -174,7 +182,6 @@ def checkCompanyName(filename,user_id):
if cishu > 3: if cishu > 3:
cishu = 0 cishu = 0
yield "文档公司或组织名称检查---结果生成中" + '.' * cishu yield "文档公司或组织名称检查---结果生成中" + '.' * cishu
userLog.info(f"checkCompanyName----结果生成中" + '.' * cishu)
cishu += 1 cishu += 1
data = runList[len(runList) - 1][0]["content"] data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', '')) parsed_data = json_repair.loads(data.replace('`', ''))
@ -182,14 +189,15 @@ def checkCompanyName(filename,user_id):
for place in parsed_data: for place in parsed_data:
try: try:
if place['回答'] == '非泛化的公司或组织名称': if place['回答'] == '具体的公司或组织名称':
if (place["companyName"] == "北京国研科技咨询有限公司浙江分公司"):
continue
error_places.append(place) error_places.append(place)
except Exception as e: except Exception as e:
userLog.warning(place) userLog.warning(place)
userLog.warning(e) userLog.warning(e)
userLog.warning("文档公司或组织名称检查---组织提出出错") userLog.warning("文档公司或组织名称检查---组织提出出错")
continue continue
userLog.info(error_places)
returnInfo = "发现异常公司或组织名称<br>" returnInfo = "发现异常公司或组织名称<br>"
if len(error_places) > 0: if len(error_places) > 0:
for t in error_places: for t in error_places:
@ -199,9 +207,9 @@ def checkCompanyName(filename,user_id):
t["yuanwen"] = paragraphs[0] t["yuanwen"] = paragraphs[0]
yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "") yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
returnInfo += "原文:" + yuanwen + "<br>异常公司或组织名称:**" + keyword + "**!请注意" + "<br>" returnInfo += "原文:" + yuanwen + "<br>异常公司或组织名称:**" + keyword + "**!请注意" + "<br>"
userLog.info(returnInfo) userLog.info("文档公司或组织名称检查---原文:" + yuanwen + "异常公司或组织名称:" + keyword + "!请注意")
yield returnInfo yield returnInfo
else: else:
yield "**未发现异常公司或组织名称**<br>" yield "**未发现异常公司或组织名称**<br>"
userLog.info("**未发现异常公司或组织名称**<br>") userLog.info("文档公司或组织名称检查---未发现异常公司或组织名称")
outLog.mark_done(user_id, "checkCompanyName") outLog.mark_done(user_id, "checkCompanyName")

86
checkDocumentError.py

@ -8,7 +8,7 @@ import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml from docx.opc.oxml import parse_xml
import requests import requests
from myLogger import outLog # from myLogger import outLog
import time import time
def load_from_xml_v2(baseURI, rels_item_xml): def load_from_xml_v2(baseURI, rels_item_xml):
""" """
@ -27,9 +27,9 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2 _SerializedRelationships.load_from_xml = load_from_xml_v2
import logging # import logging
outLog.logger = logging.getLogger("checkDocumentError") # outLog.logger = logging.getLogger("checkDocumentError")
userLog=None userLog=None
llm_cfg = { llm_cfg = {
# 'model': 'qwen1.5-72b-chat', # 'model': 'qwen1.5-72b-chat',
@ -40,7 +40,7 @@ llm_cfg = {
bot = Assistant(llm=llm_cfg, bot = Assistant(llm=llm_cfg,
name='Assistant', name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' # description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
system_message="你是一个错别字分析大师"
) )
# prompt=''' # prompt='''
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案, # 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案,
@ -48,25 +48,25 @@ bot = Assistant(llm=llm_cfg,
# ''' # '''
prompt = ''' prompt = '''
请回答以上问题[]选项中选择答案,原文内容标点符号保持不变如果有错请给出详细的解析没有错则不用给解析 请回答以上问题[]选项中选择答案,原文内容标点符号保持不变如果有错请给出详细的解析没有错则不用给解析
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}]不做过多的解释,严格按回答格式作答; 回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}]不做过多的解释,严格按回答格式作答;
''' '''
def getDocxToTextAll(name): def getDocxToTextAll(name):
userLog.info("checkDocumentError----打开文档")
docxPath = name docxPath = name
loopCount = 0 loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath) document = Document(docxPath)
break # while True:
except Exception as e: # loopCount+=1
time.sleep(1) # if(loopCount>=60):
pass # raise Exception("文档读取超时,或文档存在问题无法读取")
# break
# try:
# document = Document(docxPath)
# break
# except Exception as e:
# time.sleep(1)
# pass
# 逐段读取docx文档的内容 # 逐段读取docx文档的内容
words = [] words = []
for paragraph in document.paragraphs: for paragraph in document.paragraphs:
@ -84,23 +84,21 @@ def getDocxToTextAll(name):
txt_file.write(text) txt_file.write(text)
def checkDocumentError(filename,user_id): def checkDocumentError(filename,user_id,outLog):
global userLog global userLog
userLog=outLog.get_queue(user_id,"checkDocumentError") userLog=outLog.get_queue(user_id,"checkDocumentError")
yield f"文档纠错---开始处理文档..." yield f"文档纠错---开始处理文档..."
userLog.info("checkDocumentError----开始处理文档...")
try: try:
getDocxToTextAll(filename) getDocxToTextAll(filename)
except Exception as e: except Exception as e:
userLog.warning(e) userLog.warning(e)
userLog.warning("文档纠错----文档无法打开,请检查文档内容") userLog.warning("文档纠错----文档无法打开,请检查文档内容")
yield "文档纠错----文档无法打开,请检查文档内容" yield "文档纠错----文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。"
outLog.mark_done(user_id, "checkDocumentError") outLog.mark_done(user_id, "checkDocumentError")
return return
with open("checkDocumentError.txt", "r", encoding='utf-8') as f: with open("checkDocumentError.txt", "r", encoding='utf-8') as f:
gettext = f.read() gettext = f.read()
yield f"文档纠错---开始解析文档..." # 每次生成一个数字就发送 yield f"文档纠错---开始解析文档..." # 每次生成一个数字就发送
userLog.info("checkDocumentError----开始解析文档...")
final_list = [] final_list = []
for item in documentErrorTask(gettext): for item in documentErrorTask(gettext):
if isinstance(item, str): if isinstance(item, str):
@ -113,12 +111,11 @@ def checkDocumentError(filename,user_id):
yuanwen = i["placeName"].replace("\n", "") yuanwen = i["placeName"].replace("\n", "")
jianyi = i["jianyi"].replace("\n", "") jianyi = i["jianyi"].replace("\n", "")
resInfo += "原文:" + yuanwen + "<br>建议:**" + jianyi + "**<br>" resInfo += "原文:" + yuanwen + "<br>建议:**" + jianyi + "**<br>"
userLog.info(resInfo)
yield resInfo yield resInfo
else: else:
yield "**未发现错别字**" yield "**未发现错别字**"
userLog.info("未发现错别字") userLog.info("文档纠错---未发现错别字")
outLog.mark_done(user_id,"checkDocumentError") outLog.mark_done(user_id,"checkDocumentError")
@ -129,27 +126,33 @@ def documentErrorTask(text):
:param batch_size: 每批处理的字符数 :param batch_size: 每批处理的字符数
:return: 生成器每次返回一批文本 :return: 生成器每次返回一批文本
""" """
yield "文档纠错---启动中...." yield "文档纠错---文档解析中...."
userLog.info("checkDocumentError----启动中....") userLog.info("文档纠错---任务开始")
batchNum = 20 batchNum = 20
sentences = re.split(r'[。\n]', text) sentences = re.split(r'[。\n]', text)
# 去掉空字符 # 去掉空字符
sentences = [sentence.strip() for sentence in sentences if sentence.strip()] sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
# 计算总字符数 # 计算总字符数
total_chars = len(sentences) total_chars = len(sentences)
# 计算有多少份 # 计算有多少份
num_chunks = math.ceil(total_chars / batchNum) num_chunks = math.ceil(total_chars / batchNum)
# 按batchNum字为一份进行处理 # 按batchNum字为一份进行处理
chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)] chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
# 打印每一份的内容 # 打印每一份的内容
err = [] err = []
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}" yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
userLog.info(f"checkDocumentError----文档解析进度:{i + 1}/{num_chunks}")
try: try:
url = "http://0.0.0.0:8190/taskflow/checkDocumentError" # url = "http://0.0.0.0:8190/taskflow/checkDocumentError"
# headers = {"Content-Type": "application/json"}
# data = {
# "data": {
# "text": chunk,
# }
# }
# r = requests.post(url=url, headers=headers, data=json.dumps(data))
# res = json.loads(r.text)
url = "http://127.0.0.1:5001/taskflow/checkDocumentError"
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
data = { data = {
"data": { "data": {
@ -158,12 +161,13 @@ def documentErrorTask(text):
} }
r = requests.post(url=url, headers=headers, data=json.dumps(data)) r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text) res = json.loads(r.text)
# print(res)
except Exception as e: except Exception as e:
userLog.warning(chunk) userLog.warning(chunk)
userLog.warning("文档纠错--错别字识别出错\n", e) userLog.warning("文档纠错--错别字识别出错\n")
userLog.warning(e)
continue continue
lines_with_greeting = [place for place in res["result"] if len(place['errors']) > 0] lines_with_greeting = [place for place in res["data"] if len(place['errors']) > 0]
userLog.debug(lines_with_greeting)
if len(lines_with_greeting) > 0: if len(lines_with_greeting) > 0:
num = 0 num = 0
wenti = [] # 记录问题的数组 wenti = [] # 记录问题的数组
@ -173,26 +177,28 @@ def documentErrorTask(text):
keyword = t['source'] keyword = t['source']
keyword_list.append(keyword) keyword_list.append(keyword)
for item in t["errors"]: for item in t["errors"]:
for key, value in item['correction'].items(): # for key, value in item['correction'].items():
temp_errorWords.append(key) # temp_errorWords.append(key)
temp_errorWords.append(item[0])
wenti.append( wenti.append(
"序号:{},原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords))) # "{}:原文是{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
"{}:原文是{}。问题:当前原文是否存在错别字,只检查错被子,其他不做分析".format(num, keyword))
num += 1 num += 1
words = "\n".join(wenti) words = "\n".join(wenti)
userLog.debug(words)
messages = [{'role': 'user', 'content': [{'text': words + prompt}]}] messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
runList = [] runList = []
yield f"文档纠错---内容解析中..." # 每次生成一个数字就发送 yield f"文档纠错---内容解析中..." # 每次生成一个数字就发送
userLog.info(f"checkDocumentError----内容解析中...")
cishu = 0 cishu = 0
for rsp in bot.run(messages): for rsp in bot.run(messages):
runList.append(rsp) runList.append(rsp)
if cishu > 3: if cishu > 3:
cishu = 0 cishu = 0
yield "文档纠错---内容解析中" + '.' * cishu yield "文档纠错---内容解析中" + '.' * cishu
userLog.info(f"checkDocumentError----内容解析中内容解析中" + '.' * cishu)
cishu += 1 cishu += 1
data = runList[len(runList) - 1][0]["content"] data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace("\\", "").replace('`', '')) parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
userLog.debug(parsed_data)
resListerr = [] resListerr = []
for place in parsed_data: for place in parsed_data:
try: try:
@ -200,14 +206,16 @@ def documentErrorTask(text):
place["placeName"] = keyword_list[int(place["placeName"])] place["placeName"] = keyword_list[int(place["placeName"])]
place["jianyi"] = place["解析"] place["jianyi"] = place["解析"]
resListerr.append(place) resListerr.append(place)
userLog.info("文档纠错---原文:" + place["placeName"] + "<br>建议:" + place["jianyi"])
except Exception as e: except Exception as e:
userLog.warning(parsed_data) userLog.warning(parsed_data)
userLog.warning(place) userLog.warning(place)
userLog.warning("文档纠错--错别字提取出错\n", e) userLog.warning("文档纠错--错别字提取出错\n")
userLog.warning(e)
continue continue
if (len(resListerr) > 0): if (len(resListerr) > 0):
err.extend(resListerr) err.extend(resListerr)
# 打印总份数 # 打印总份数
yield "文档地名检查---文档解析完成" yield "文档纠错---文档解析完成"
userLog.info(err) userLog.info("文档纠错---任务结束")
yield err yield err

11
checkPlaceName.py

@ -87,7 +87,6 @@ def getDocxToTextAll(docxPath):
#得到全文和地名有关的内容 #得到全文和地名有关的内容
def placeNameTask(text): def placeNameTask(text):
yield "文档地名检查---启动中...." yield "文档地名检查---启动中...."
userLog.info("checkPlaceName----启动中....")
batchNum=20 batchNum=20
sentences = re.split(r'[。\n]', text) sentences = re.split(r'[。\n]', text)
# 去掉空字符 # 去掉空字符
@ -104,7 +103,6 @@ def placeNameTask(text):
# 打印每一份的内容 # 打印每一份的内容
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}" yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"
userLog.info(f"checkPlaceName----文档解析进度:{i + 1}/{num_chunks}")
wenBen=".".join(chunk) wenBen=".".join(chunk)
try: try:
url = "http://0.0.0.0:8191/taskflow/checkPlaceName" url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
@ -139,7 +137,6 @@ def placeNameTask(text):
isplace = False isplace = False
# 打印总份数 # 打印总份数
yield "文档地名检查---文档解析完成" yield "文档地名检查---文档解析完成"
userLog.info("checkPlaceName---文档解析完成")
placeList=list(dict.fromkeys(placeList)) placeList=list(dict.fromkeys(placeList))
yield placeList yield placeList
@ -175,7 +172,6 @@ def checkPlaceName(filename,user_id):
if cishu>3: if cishu>3:
cishu=0 cishu=0
yield "文档地名检查---结果生成中"+'.'*cishu yield "文档地名检查---结果生成中"+'.'*cishu
userLog.info("checkPlaceName---结果生成中"+'.'*cishu)
cishu+=1 cishu+=1
data = runList[len(runList) - 1][0]["content"] data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', '')) parsed_data = json_repair.loads(data.replace('`', ''))
@ -186,12 +182,11 @@ def checkPlaceName(filename,user_id):
if place['回答'] == '错误': if place['回答'] == '错误':
error_places.append(place) error_places.append(place)
except Exception as e: except Exception as e:
userLog.warning(parsed_data)
userLog.warning(place) userLog.warning(place)
userLog.warning(parsed_data)
userLog.warning("文档地名检查---组织提出出错") userLog.warning("文档地名检查---组织提出出错")
userLog.warning(e) userLog.warning(e)
continue continue
userLog.info(error_places)
returnInfo = "发现异常地名<br>" returnInfo = "发现异常地名<br>"
if len(error_places)>0: if len(error_places)>0:
for t in error_places: for t in error_places:
@ -200,9 +195,9 @@ def checkPlaceName(filename,user_id):
paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext) paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","") yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","")
returnInfo+="原文:" + yuanwen + "<br>出现异常地名:**" + keyword + "**!请注意" + "<br>" returnInfo+="原文:" + yuanwen + "<br>出现异常地名:**" + keyword + "**!请注意" + "<br>"
userLog.info(returnInfo) userLog.info("文档地名检查---原文:" + yuanwen + "出现异常地名:" + keyword + "!请注意")
yield returnInfo yield returnInfo
else: else:
yield "**未发现发现异常地名**" yield "**未发现发现异常地名**"
userLog.info("未发现发现异常地名") userLog.info("文档地名检查---未发现发现异常地名")
outLog.mark_done(user_id, "checkPlaceName") outLog.mark_done(user_id, "checkPlaceName")

180
checkRepeatText.py

@ -7,6 +7,7 @@ from qwen_agent.agents import Assistant
import json_repair import json_repair
import json import json
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13") embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
# embeddings = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese",model_kwargs={"device":"npu:5"})
device_id=0 device_id=0
import re import re
import time import time
@ -17,9 +18,9 @@ from docx.opc.oxml import parse_xml
import logging import logging
import logging.config import logging.config
import requests import requests
from myLogger import outLog # from myLogger import outLog
outLog.logger = logging.getLogger("checkRepeatText") # outLog.logger = logging.getLogger("checkRepeatText")
userLog=None userLog=None
def load_from_xml_v2(baseURI, rels_item_xml): def load_from_xml_v2(baseURI, rels_item_xml):
""" """
@ -79,11 +80,10 @@ def isTitle(paragraph):
#寻找标题名称 #寻找标题名称
def findTitleName(docxPath): def findTitleName(docxPath):
yield '文档相似性检查----检查是否存在详细设计方案'
loopCount = 0 loopCount = 0
while True: while True:
loopCount+=1 loopCount+=1
if(loopCount>=15): if(loopCount>=60):
raise Exception("文档读取超时,或文档存在问题无法读取") raise Exception("文档读取超时,或文档存在问题无法读取")
break break
try: try:
@ -95,9 +95,19 @@ def findTitleName(docxPath):
# 逐段读取docx文档的内容 # 逐段读取docx文档的内容
titleWords=[] titleWords=[]
firstTitle = 0 firstTitle = 0
firstTitleName=""
secondTitle = 0 secondTitle = 0
sanjiTitle = 0 sanjiTitle = 0
levelText=""
count = 0
numid =0
wordContent={}
total = len(document.paragraphs)
addStart = False#是否重新添加
yield "文档相似性检查----文档内容解析中",str(count),str(total)
for paragraph in document.paragraphs: for paragraph in document.paragraphs:
count+=1
yield "文档相似性检查----文档内容解析中",str(count),str(total)
# 判断该段落的标题级别 # 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法 # 这里用isTitle()临时代表,具体见下文介绍的方法
text = paragraph.text text = paragraph.text
@ -109,6 +119,8 @@ def findTitleName(docxPath):
if(text.find("附件")>=0): if(text.find("附件")>=0):
continue continue
titleWords.append("一级标题:".format(firstTitle)+text) titleWords.append("一级标题:".format(firstTitle)+text)
addStart=True
firstTitleName=text
elif level=="1": elif level=="1":
secondTitle+=1 secondTitle+=1
sanjiTitle=0 sanjiTitle=0
@ -118,15 +130,28 @@ def findTitleName(docxPath):
sanjiTitle += 1 sanjiTitle += 1
# words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text) # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
# titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text) # titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text)
##先判断是不是一级标题
if addStart:
wordContent[firstTitleName]=[]
addStart=False
if level:
levelText=f"{int(level)+1}级标题-"+text
else:
if(text.startswith("") or text.startswith("注:")):
continue
if (len(text)>30 and firstTitleName):
numid+=1
wordContent[firstTitleName].append("{}".format(levelText)+text)
findTitleName_llm_cfg = { findTitleName_llm_cfg = {
#'model': 'qwen1.5-72b-chat', #'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b", 'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base 'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
} }
yield '文档相似性检查----检查是否存在详细设计方案'
findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
name='Assistant', name='Assistant',
# system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题' system_message='按照要求选择最合适的,是唯一的'
) )
prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容 prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案详细建设方案为最相关的优先选择 类似详细设计方案,详细服务方案详细建设方案为最相关的优先选择
@ -142,60 +167,78 @@ def findTitleName(docxPath):
runList.append(rsp) runList.append(rsp)
data = runList[len(runList) - 1][0]["content"] data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', '')) parsed_data = json_repair.loads(data.replace('`', ''))
try:
if(parsed_data["answer"]=="存在"): if(parsed_data["answer"]=="存在"):
yield parsed_data["name"] yield parsed_data["name"],wordContent
else: else:
yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较" yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
#获取文档中 详细设计方案 章节的所有内容
def getDocxToText(docxPath,titleName,vector_store_path):
loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath)
break
except Exception as e: except Exception as e:
time.sleep(1) userLog.warning(e)
pass userLog.warning(data)
# 逐段读取docx文档的内容 userLog.warning(parsed_data)
levelList=[] yield "文档相似性检查----检查遇到问题,请联系管理员"
#获取文档中 详细设计方案 章节的所有内容
# def getDocxToText(docxPath,titleName,vector_store_path):
def getDocxToText(titleName,wordContent,vector_store_path):
# loopCount = 0
# while True:
# loopCount+=1
# if(loopCount>=15):
# raise Exception("文档读取超时,或文档存在问题无法读取")
# break
# try:
# document = Document(docxPath)
# break
# except Exception as e:
# time.sleep(1)
# pass
# # 逐段读取docx文档的内容
# levelList=[]
words=[] words=[]
addStart = False # addStart = False
levelText="" # levelText=""
i = 0 # i = 0
for paragraph in document.paragraphs: # count = 0
# 判断该段落的标题级别 # total = len(document.paragraphs)
# 这里用isTitle()临时代表,具体见下文介绍的方法 # yield "文档相似性检查----文档内容解析中",count,total
text = paragraph.text # for paragraph in document.paragraphs:
if text.strip():#非空判断 # count+=1
if titleName: # yield "文档相似性检查----文档内容解析中",count,total
level = isTitle(paragraph) # # 判断该段落的标题级别
if(addStart and level=="0"): # # 这里用isTitle()临时代表,具体见下文介绍的方法
addStart=False # text = paragraph.text
if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)): # if text.strip():#非空判断
addStart=True # if titleName:
if level: # level = isTitle(paragraph)
levelList.append("{}".format(level)+paragraph.text) # if(addStart and level=="0"):
levelText=f"{int(level)+1}级标题-"+text # addStart=False
else: # if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)):
if addStart: # addStart=True
if(text.startswith("") or text.startswith("注:")): # if level:
continue # levelList.append("{}:".format(level)+paragraph.text)
if(len(text)>30): # levelText=f"{int(level)+1}级标题-"+text
i=i+1 # else:
words.append("{}".format(levelText)+text) # if addStart:
# if(text.startswith("图") or text.startswith("注:")):
# continue
# if(len(text)>30):
# i=i+1
# words.append("{}:".format(levelText)+text)
# 将所有段落文本拼接成一个字符串,并用换行符分隔 # 将所有段落文本拼接成一个字符串,并用换行符分隔
# 遍历字典,查找包含 "标题的" 的键
for key, value in wordContent.items():
if (titleName.find(key)>=0 or key.find(titleName)>=0):
words.extend(value) # 将对应的值添加
if len(words)==0: if len(words)==0:
raise Exception("checkRepeatText,获取长度为0") raise Exception("checkRepeatText,获取长度为0")
text = '\n'.join(words) text = '\n'.join(words)
userLog.info(f"文档相似性检查----需要处理的总数是{len(words)}")
# 将文本写入txt文件 # 将文本写入txt文件
with open("checkRepeatText.txt", 'w', ) as txt_file: with open("checkRepeatText.txt", 'w', ) as txt_file:
txt_file.write(text) txt_file.write(text)
time.sleep(3) time.sleep(1)
yield "文档相似性检查----文档内容转换中",".","."
loader = TextLoader(file_path='checkRepeatText.txt') loader = TextLoader(file_path='checkRepeatText.txt')
docs = loader.load() docs = loader.load()
# print(docs) # print(docs)
@ -204,34 +247,46 @@ def getDocxToText(docxPath,titleName,vector_store_path):
splits = text_splitter.split_documents(docs) splits = text_splitter.split_documents(docs)
uuids = [] uuids = []
yield "文档相似性检查----文档保存中",".","."
global embeddings
vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
for i in range(len(splits)): for i in range(len(splits)):
uuids.append(str(uuid.uuid4())) uuidStr=str(uuid.uuid4())
uuids.append(uuidStr)
logging.info(f"checkRepeatTextuuidLen{len(uuids)}") logging.info(f"checkRepeatTextuuidLen{len(uuids)}")
vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
vectorstore.add_documents(documents=splits, ids=uuids) vectorstore.add_documents(documents=splits, ids=uuids)
yield "文档相似性检查----校验文档是否已经完成保存",".","."
while True: while True:
time.sleep(0.3) time.sleep(0.3)
ress = vectorstore.similarity_search(words[0]) ress = vectorstore.similarity_search(words[0])
if (len(ress) > 0): if (len(ress) > 0):
break break
return words,uuids,vectorstore yield words,uuids,vectorstore
# @app.route('/checkRepeatText/<filename>', methods=['GET']) # @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename,user_id): def checkRepeatText(filename,user_id,outLog):
global userLog global userLog
userLog=outLog.get_queue(user_id,"checkRepeatText") userLog=outLog.get_queue(user_id,"checkRepeatText")
yield "文档相似性检查---启动中...." yield "文档相似性检查---启动中...."
userLog.info("文档相似性检查---任务开始")
vector_store_path="vector_store"+str(uuid.uuid4()) vector_store_path="vector_store"+str(uuid.uuid4())
for titleName in findTitleName(filename): for titleName in findTitleName(filename):
if(isinstance(titleName ,tuple)):
if(len(titleName)==3):
yield titleName[0]+titleName[1]+"/"+titleName[2]
else:
yield titleName yield titleName
if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"): if(isinstance(titleName ,tuple)):
# try:
yield "文档相似性检查----文档内容转换中"
try: try:
yield "文档相似性检查----文档内容解析中" for words,uuids,vectorstore in getDocxToText(titleName[0],titleName[1],vector_store_path):
words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path) if isinstance(words, str):
yield words+uuids+vectorstore
except Exception as e: except Exception as e:
yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败" yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。"
userLog.warning(e) userLog.warning(e)
userLog.warning(f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败") userLog.warning(f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败")
outLog.mark_done(user_id, "checkRepeatText") outLog.mark_done(user_id, "checkRepeatText")
@ -241,7 +296,7 @@ def checkRepeatText(filename,user_id):
count = 0 count = 0
for i in words: for i in words:
count += 1 count += 1
yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}" yield f"文档相似性检查--对{titleName[0]}章节,进行文档内容检查中{count}/{len(words)}"
result = vectorstore.similarity_search(i) result = vectorstore.similarity_search(i)
textTag = i.split("")[0] textTag = i.split("")[0]
for content in result: for content in result:
@ -259,6 +314,7 @@ def checkRepeatText(filename,user_id):
} }
r = requests.post(url=url, headers=headers, data=json.dumps(data)) r = requests.post(url=url, headers=headers, data=json.dumps(data))
res = json.loads(r.text) res = json.loads(r.text)
res=res["data"]
# res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]]) # res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]])
except Exception as e: except Exception as e:
userLog.warning("文档相似性检查--发生异常:") userLog.warning("文档相似性检查--发生异常:")
@ -266,7 +322,7 @@ def checkRepeatText(filename,user_id):
userLog.warning(i) userLog.warning(i)
userLog.warning(text) userLog.warning(text)
continue continue
if (res["result"][0]["similarity"] > 0.90): if (res[0]["similarity"] >= 0.96):
# 判断重复内容是否被放入 # 判断重复内容是否被放入
if (len(reslist) > 0): if (len(reslist) > 0):
isExist = False isExist = False
@ -276,15 +332,15 @@ def checkRepeatText(filename,user_id):
break break
if not isExist: if not isExist:
# reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:],"similarity":res[0]["similarity"]}) # reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:],"similarity":res[0]["similarity"]})
userLog.info("【在"+i[:i.find('')].replace("\n","")+"下包含:"+i[i.find('') + 1:].replace("\n","")+"<br>在"+text[:text.find('')].replace("\n","")+"**下包含:"+text[text.find('') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"") userLog.info("【在"+i[:i.find('')].replace("\n","")+"下包含:"+i[i.find('') + 1:].replace("\n","")+"<br>在"+text[:text.find('')].replace("\n","")+"**下包含:"+text[text.find('') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res[0]["similarity"])+"")
reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]}) reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
else: else:
reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]}) reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
# print(i.split(":")[1] + "\n" + text.split(":")[1]) # print(i.split(":")[1] + "\n" + text.split(":")[1])
userLog.info("【在"+i[:i.find('')].replace("\n","")+"下包含:"+i[i.find('') + 1:].replace("\n","")+"<br>在"+text[:text.find('')].replace("\n","")+"**下包含:"+text[text.find('') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"") userLog.info("【在"+i[:i.find('')].replace("\n","")+"下包含:"+i[i.find('') + 1:].replace("\n","")+"<br>在"+text[:text.find('')].replace("\n","")+"**下包含:"+text[text.find('') + 1:].replace("\n","")+"<br>以上两段内容相似度:"+'{:.2f}'.format(res[0]["similarity"])+"")
# vectorstore.delete(ids=uuids) # vectorstore.delete(ids=uuids)
shutil.rmtree(vector_store_path) shutil.rmtree(vector_store_path)
resInfo=f"{titleName}章节,发现相似内容:<br>" resInfo=f"{titleName[0]}章节,发现相似内容:<br>"
if(len(reslist)>0): if(len(reslist)>0):
for res in reslist: for res in reslist:
resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find('')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find('') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find('')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find('') + 1:]+"<br>以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】<br>" resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find('')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find('') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find('')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find('') + 1:]+"<br>以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】<br>"

83
checkTitleName.py

@ -8,7 +8,9 @@ import json_repair
import math import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml from docx.opc.oxml import parse_xml
from myLogger import outLog
# from myLogger import outLog
def load_from_xml_v2(baseURI, rels_item_xml): def load_from_xml_v2(baseURI, rels_item_xml):
""" """
@ -29,11 +31,11 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2 _SerializedRelationships.load_from_xml = load_from_xml_v2
import logging import logging
outLog.logger = logging.getLogger("checkTitleName") # outLog.logger = logging.getLogger("checkTitleName")
userLog=None userLog = None
llm_cfg = { llm_cfg = {
#'model': 'qwen1.5-72b-chat', # 'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b-instruct", 'model': "qwen2-72b-instruct",
'model_server': 'DashScope', # base_url, also known as api_base 'model_server': 'DashScope', # base_url, also known as api_base
'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
} }
@ -81,12 +83,13 @@ def isTitle(paragraph):
# 如果在段落、样式里都没有找到大纲级别,返回None # 如果在段落、样式里都没有找到大纲级别,返回None
return None return None
#获取文档中 详细设计方案 章节的所有内容
# 获取文档中 详细设计方案 章节的所有内容
def getDocxToTitleName(docxPath): def getDocxToTitleName(docxPath):
loopCount = 0 loopCount = 0
while True: while True:
loopCount+=1 loopCount += 1
if(loopCount>=15): if (loopCount >= 60):
raise Exception("文档读取超时,或文档存在问题无法读取") raise Exception("文档读取超时,或文档存在问题无法读取")
break break
try: try:
@ -96,64 +99,72 @@ def getDocxToTitleName(docxPath):
time.sleep(1) time.sleep(1)
pass pass
# 逐段读取docx文档的内容 # 逐段读取docx文档的内容
levelList=[] levelList = []
words=[] words = []
addStart = False addStart = False
levelText="" levelText = ""
i = 0 count = 0
total = len(document.paragraphs)
yield f"文档结构检查----文档内容解析中{str(count)}/{str(total)}"
for paragraph in document.paragraphs: for paragraph in document.paragraphs:
count += 1
yield f"文档结构检查----文档内容解析中{str(count)}/{str(total)}"
# 判断该段落的标题级别 # 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法 # 这里用isTitle()临时代表,具体见下文介绍的方法
text = paragraph.text text = paragraph.text
if text.strip():#非空判断 if text.strip(): # 非空判断
level = isTitle(paragraph) level = isTitle(paragraph)
if level=="0": if level == "0":
words.append(text) words.append(text)
return words yield words
def checkTitleName(filename,user_id):
def checkTitleName(filename, user_id, outLog):
global userLog global userLog
userLog=outLog.get_queue(user_id,"checkTitleName") userLog = outLog.get_queue(user_id, "checkTitleName")
yield '文档结构检查----启动中' yield '文档结构检查----启动中'
userLog.info("checkTitleName----启动中") userLog.info("文档结构检查---任务开始")
with open("ce模板.txt", "r",encoding='utf-8') as f: with open("ce模板.txt", "r", encoding='utf-8') as f:
gettext = f.readlines() gettext = f.readlines()
count=0 count = 0
reserr = [] reserr = []
try: try:
word = getDocxToTitleName(filename) for i in getDocxToTitleName(filename):
word = i
if (isinstance(word, str)):
yield word
continue
except Exception as e: except Exception as e:
userLog.warning(e) userLog.warning(e)
yield "文档结构检查----文档无法打开,请检查文档内容" yield "文档结构检查----文件无法正常打开。可以尝试用WORD或WPS打开文件,进行修复并另存,用另存的文件再做一次尝试。"
outLog.mark_done(user_id, "checkTitleName")
userLog.warning("checkTitleName----文档无法打开,请检查文档内容") userLog.warning("checkTitleName----文档无法打开,请检查文档内容")
outLog.mark_done(user_id, "checkTitleName")
return return
for text in gettext: for text in gettext:
count+=1 count += 1
prompt = f''' prompt = f'''
\n 这些是文章的标题请问{text}在标题中是否可以配对的若有请指出是哪个标题若没有请回到不存在 \n 这些是文章的标题请问{text}在标题中是否可以配对的若有请指出是哪个标题若没有请回到不存在
''' '''
xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释" xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
yield f"文档结构检查----结构分析中{count}/{len(gettext)}" yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
userLog.info(f"checkTitleName----结构分析中{count}/{len(gettext)}") strword = "\n".join(word) + prompt + xushang
strword = "\n".join(word)+prompt+xushang messages = [{'role': 'user', 'content': [{'text': strword}]}]
messages = [{'role': 'user', 'content': [{'text':strword}]}]
runList = [] runList = []
for rsp in bot.run(messages): for rsp in bot.run(messages):
runList.append(rsp) runList.append(rsp)
# print(rsp) # print(rsp)
data = runList[len(runList) - 1][0]["content"] data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', '')) parsed_data = json_repair.loads(data.replace('`', ''))
if(parsed_data["answer"]=="不存在"): if (parsed_data["answer"] == "不存在"):
reserr.append(text) reserr.append(text)
userLog.info("文档结构检查----文档结构存在异常:" + text.replace('\n', ''))
resInfo="文档结构存在异常:<br>" resInfo = "文档结构存在异常:<br>"
if(len(reserr)>0): if (len(reserr) > 0):
for i in reserr: for i in reserr:
resInfo+="**"+i.replace('\n','')+"**<br>" resInfo += "**" + i.replace('\n', '') + "**<br>"
userLog.info(resInfo)
yield resInfo yield resInfo
else: else:
yield "文档结构未发现异常" yield "**文档结构未发现异常**"
userLog.info("文档结构未发现异常") userLog.info("文档结构检查----文档结构未发现异常")
outLog.mark_done(user_id, "checkTitleName") outLog.mark_done(user_id, "checkTitleName")

485
daijian方案.py

@ -1,11 +1,24 @@
from docx import Document import uuid
from pprint import pprint from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qwen_agent.agents import Assistant from qwen_agent.agents import Assistant
import re
import json_repair import json_repair
import math import json
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
device_id=0
import re
import time
from docx import Document
import shutil
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml from docx.opc.oxml import parse_xml
import logging
import logging.config
import requests
from collections import defaultdict
userLog=None
def load_from_xml_v2(baseURI, rels_item_xml): def load_from_xml_v2(baseURI, rels_item_xml):
""" """
Return |_SerializedRelationships| instance loaded with the Return |_SerializedRelationships| instance loaded with the
@ -23,17 +36,6 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2 _SerializedRelationships.load_from_xml = load_from_xml_v2
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b-instruct",
'model_server': 'DashScope', # base_url, also known as api_base
'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
)
# 记录程序开始的时间戳 # 记录程序开始的时间戳
def getOutlineLevel(inputXml): def getOutlineLevel(inputXml):
""" """
@ -73,15 +75,26 @@ def isTitle(paragraph):
# 如果在段落、样式里都没有找到大纲级别,返回None # 如果在段落、样式里都没有找到大纲级别,返回None
return None return None
#获取文档中 详细设计方案 章节的所有内容 #寻找标题名称
def getDocxToTitleName(docxPath): def findTitleName(docxPath):
yield '文档相似性检查----检查是否存在详细设计方案'
loopCount = 0
while True:
loopCount+=1
if(loopCount>=15):
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath) document = Document(docxPath)
break
except Exception as e:
time.sleep(1)
pass
# 逐段读取docx文档的内容 # 逐段读取docx文档的内容
levelList=[] titleWords=[]
words=[] firstTitle = 0
addStart = False secondTitle = 0
levelText="" sanjiTitle = 0
i = 0
for paragraph in document.paragraphs: for paragraph in document.paragraphs:
# 判断该段落的标题级别 # 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法 # 这里用isTitle()临时代表,具体见下文介绍的方法
@ -89,88 +102,360 @@ def getDocxToTitleName(docxPath):
if text.strip():#非空判断 if text.strip():#非空判断
level = isTitle(paragraph) level = isTitle(paragraph)
if level=="0": if level=="0":
words.append(text) firstTitle+=1
return words secondTitle = 0
if(text.find("附件")>=0):
def checkTitleName(filename): continue
prompt = f''' titleWords.append("一级标题:".format(firstTitle)+text)
\n 这些是文章的标题请问{text}在标题中是否可以配对的若有请指出是哪个标题若没有请回到不存在 elif level=="1":
secondTitle+=1
sanjiTitle=0
# words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
# titleWords.append("第{}章的二级标题:".format(firstTitle,firstTitle,secondTitle)+text)
elif level=="2":
sanjiTitle += 1
# words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
# titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text)
findTitleName_llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
name='Assistant',
# system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题'
)
prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案详细建设方案为最相关的优先选择
类似设计方案服务方案建设方案为次相关次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个{"name":"一级标题名称","answer":"存在"}{"name":"","answer":"不存在"}不做过多的解释,严格按回答格式作答
''' '''
xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释" # print("\n".join(titleWords)+prompt)
yield f"文档结构检查----结构分析中{count}/{len(gettext)}" messages = [({'role': 'user', 'content': "\n".join(titleWords)+prompt})]
strword = "\n".join(word) + prompt + xushang runList=[]
# print(strword) for rsp in findTitleName_bot.run(messages):
messages = [{'role': 'user', 'content': [{'text': strword}]}]
runList = []
cishu = 0
for rsp in bot.run(messages):
runList.append(rsp) runList.append(rsp)
# print(rsp)
data = runList[len(runList) - 1][0]["content"] data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', '')) parsed_data = json_repair.loads(data.replace('`', ''))
print(parsed_data) if(parsed_data["answer"]=="存在"):
# yield '文档结构检查----启动中' yield parsed_data["name"]
# with open("ce模板.txt", "r",encoding='utf-8') as f: else:
# gettext = f.readlines() yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
# count=0
# reserr = []
# try:
# word = getDocxToTitleName(filename)
# except Exception as e:
# print(e)
# yield "文档无法打开,请检查文档内容"
# return
# for text in gettext:
# count+=1
# prompt = f'''
# \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
# '''
# xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
# yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
# strword = "\n".join(word)+prompt+xushang
# # print(strword)
# messages = [{'role': 'user', 'content': [{'text':strword}]}]
# runList = []
# cishu = 0
# for rsp in bot.run(messages):
# runList.append(rsp)
# # print(rsp)
# data = runList[len(runList) - 1][0]["content"]
# parsed_data = json_repair.loads(data.replace('`', ''))
# print(parsed_data)
# if(parsed_data["answer"]=="不存在"):
# reserr.append(text)
# resInfo="文档结构存在异常:<br>"
# if(len(reserr)>0):
# for i in reserr:
# resInfo+=f"**{i}**<br>"
# yield resInfo
# else:
# yield "文档结构未发现异常"
def merge_chapters(words):
merged_text = {}
for line in words:
if "" in line:
key, value = line.split("", 1) # 根据第一个冒号分割
if key in merged_text:
merged_text[key].append(value.strip()) # 添加到列表
else:
merged_text[key] = [value.strip()] # 初始化列表
else:
logging.warning(f"Skipping line without key-value pair: {line}")
import logging # 合并结果格式化为列表输出
merged_words = []
for key, values in merged_text.items():
combined_value = "".join(values) # 将内容合并
merged_words.append(f"{key}{combined_value}")
return merged_words
#获取文档中 详细设计方案 章节的所有内容
def getDocxToText(docxPath, titleName, vector_store_path):
loopCount = 0
while True:
loopCount += 1
if loopCount >= 15:
raise Exception("文档读取超时,或文档存在问题无法读取")
break
try:
document = Document(docxPath)
break
except Exception as e:
time.sleep(1)
pass
# 逐段读取docx文档的内容
levelList = []
words = []
addStart = False
title_counter = [] # 用于存储当前标题的计数
title_texts = [] # 用于存储当前各级标题的文本
i = 0
for paragraph in document.paragraphs:
text = paragraph.text.strip()
if text: # 非空判断
level = isTitle(paragraph) # 确保这个函数在代码中定义
# 当前标题的层级
current_level = int(level) if level is not None else -1
if current_level >= 0: # 标题段落
# 确保标题计数器足够长
while len(title_counter) <= current_level:
title_counter.append(0) # 初始化新级别的标题计数
title_texts.append('') # 初始化对应的标题文本
# 更新当前级别及以下的标题计数和标题文本
title_counter[current_level] += 1 # 当前级别计数加1
title_counter = title_counter[:current_level+1]
title_texts[current_level] = text # 保存当前级别的标题文本
title_texts = title_texts[:current_level+1]
# 重置更低级别的计数和标题文本
for idx in range(current_level + 1, len(title_counter)):
title_counter[idx] = 0
title_texts[idx] = ''
# 检查是否与 titleName 匹配
if current_level == 0:
addStart = titleName in text # 检查是否与 titleName 匹配
else: # 非标题段落
if addStart:
if len(text) > 30: # 仅记录长度大于30的内容
i += 1
# 获取当前完整的标题编号和标题名称
levelText = ".".join(map(str, title_counter))
# 使用非空的标题名称
current_title = title_texts[-1] if title_texts else ''
words.append(f"{levelText}-{current_title}{text}")
if len(words) == 0:
raise Exception("checkRepeatText,获取长度为0")
# 使用封装的合并函数
merged_words = merge_chapters(words)
# 将合并后的内容写入 txt 文件
with open("checkRepeatText.txt", 'w') as txt_file:
for line in merged_words:
txt_file.write(f"{line}\n")
time.sleep(3)
# 加载文本
loader = TextLoader(file_path='checkRepeatText.txt')
docs = loader.load()
# 创建唯一标识符
uuids = []
for _ in range(len(merged_words)):
uuids.append(str(uuid.uuid4()))
logging.info(f"checkRepeatTextuuidLen{len(uuids)}")
return merged_words, uuids
# @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename):
yield "文档相似性检查---启动中...."
vector_store_path="vector_store"+str(uuid.uuid4())
for titleName in findTitleName(filename):
yield titleName
if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"):
yield "文档相似性检查----文档内容解析中"
words,uuids=getDocxToText(filename,titleName,vector_store_path)
# 记录程序开始的时间戳‘
reslist = []
count = 0
standard = {
"清晰性": """对软件功能描述的完整性主要体现在以下两个方面:
a. 功能描述是否简洁明了避免使用过于复杂或专业的术语使得用户能够轻松理解
b. 是否明确指出了功能的具体作用没有模糊不清或含糊其辞的表述
如果要将软件功能描述的清晰性划分为优秀良好一般差四个从高到低的等级每个等级的评判标准是什么
将软件功能描述的清晰性划分为优秀良好一般差四个等级时每个等级的评判标准可以如下定义
优秀90~100
简洁明了功能描述极其精炼没有多余的词汇每个字都承载着必要的信息
通俗易懂完全避免了专业术语或行业黑话即使是非专业用户也能轻松理解
具体明确功能的作用范围限制以及用户期望的结果都被清晰准确地阐述没有任何模糊或含糊的表述
良好70~90不包含90分
较为简洁功能描述相对简短但可能包含一些必要的细节或背景信息
易于理解大部分术语都是通俗易懂的对于少数专业术语提供了简短的解释或上下文
明确具体功能的主要作用范围和用户期望的结果都被明确阐述但可能在某些细节上稍显模糊
一般60~70不包含70分
稍显冗长功能描述可能包含一些不必要的细节或重复信息导致用户需要花费更多时间来理解
有一定难度使用了一些专业术语或行业黑话但没有提供足够的解释或上下文导致非专业用户可能难以理解
基本明确功能的主要作用被阐述但在范围限制或用户期望的结果上可能存在一些模糊或含糊的表述
60分以下不包含60分
冗长复杂功能描述过于详细和复杂包含大量不必要的细节和背景信息导致用户难以抓住重点
难以理解大量使用专业术语或行业黑话且没有提供任何解释或上下文使得大部分用户都难以理解
模糊不清功能的作用范围限制以及用户期望的结果都没有被明确阐述存在大量的模糊和含糊表述
评估的提示词举例
根据这些评判标准对下面的软件功能描述的清晰性进行客观的评价给出优秀良好一般差四个等级之一的评价并给出具体得分并在此基础上润色和完善使之达到优秀的等级
""",
"完整性": """对软件功能描述的完整性主要体现在以下两个方面:
a. 是否涵盖了功能的所有重要方面包括输入输出处理过程等
b. 是否提供了足够的信息以便用户能够全面了解功能的工作原理和用途
如果要将软件功能描述的完整性划分为优秀良好一般差四个从高到低的等级每个等级的评判标准是什么
将软件功能描述的完整性划分为优秀良好一般差四个等级时每个等级的评判标准可以如下定义
优秀90~100
描述全面涵盖了功能的所有重要方面包括但不限于输入输出处理过程异常处理等
提供了详尽的信息用户能够清晰地了解功能的工作原理用途以及在不同场景下的表现
包含了必要的示例图表或流程图以直观展示功能的工作流程和效果
没有遗漏任何对用户理解和使用功能至关重要的信息
良好70~90不包含90分
描述基本涵盖了功能的主要方面但可能有个别不太重要的细节未提及
提供了足够的信息用户能够较好地理解功能的工作原理和用途但在某些复杂场景下可能需要额外说明
可能包含一些示例或图表但可能不如优秀等级那么全面或详细
一般60~70不包含70分
描述涵盖了功能的一部分重要方面但存在较明显的遗漏或不足
提供的信息有限用户可能只能对功能有一个大致的了解无法深入了解其工作原理和详细用途
可能缺乏示例图表或流程图等辅助材料导致用户难以理解功能的某些复杂部分
60分以下不包含60分
描述严重缺失未涵盖功能的关键方面甚至可能误导用户
提供的信息极少用户无法全面了解功能的工作原理和用途
可能存在错误或矛盾的信息导致用户无法准确理解功能
根据这些评判标准对下面的软件功能描述的完整性进行客观的评价给出优秀良好一般差四个等级之一的评价并在此基础上润色和完善使之达到优秀的等级
""",
"可测试性": """软件功能描述的可测试性主要体现为以下方面:
a. 功能描述是否具体明确以便能够进行功能测试和验证
b. 是否提供了足够的细节以便开发人员和测试人员能够准确理解和实现功能
如果要将软件功能描述的可测试性划分为优秀良好一般差四个从高到低的等级每个等级的评判标准是什么
将软件功能描述的可测试性划分为优秀良好一般差四个等级时每个等级的评判标准可以如下定义
优秀90~100
功能描述非常具体和明确能够直接转化为测试用例
提供了详尽的细节包括输入输出边界条件异常处理等
开发人员和测试人员能够轻松理解和实现功能无需额外澄清或假设
功能描述中包含了预期的行为和非预期的行为有助于全面覆盖测试场景
良好70~90不包含90分
功能描述相对具体和明确大部分内容可以直接用于测试
提供了足够的细节但可能需要一些额外的解释或澄清才能完全理解
开发人员和测试人员能够基于描述实现和测试功能但可能需要一些额外的沟通和协调
功能描述中基本涵盖了主要的行为和边界条件但可能缺少对某些异常情况的详细描述
一般60~70不包含70分
功能描述较为笼统需要较多的解释和澄清才能用于测试和开发
细节不够充分可能导致开发人员和测试人员在实现和测试过程中产生误解或遗漏
需要较多的沟通和协调来确保功能的正确实现和测试
功能描述中可能只涵盖了主要的行为对边界条件和异常情况的描述较为模糊或缺失
60分以下不包含60分
功能描述非常模糊和笼统无法直接用于测试和开发
缺乏必要的细节导致开发人员和测试人员无法准确理解和实现功能
需要大量的沟通和协调甚至可能需要重新编写功能描述才能进行有效的测试和开发
功能描述中可能只提到了大致的目标或意图没有具体的行为描述边界条件或异常处理
根据这些评判标准对下面的软件功能描述的可测试性进行客观的评价给出优秀良好一般差四个等级之一的评价并在此基础上润色和完善使之达到优秀的等级
""",
"详细性": """软件功能详细性主要体现在:
a. 功能描述是否详细可以根据功能描述进行功能点评价计算出ILFEIFEIEOEQ的数量
如果要将软件功能描述的详细性划分为优秀良好一般差四个从高到低的等级每个等级的评判标准是什么
将软件功能描述的详细性划分为优秀良好一般差四个等级时每个等级的评判标准可以如下定义
优秀90~100
功能描述非常详尽包含了所有必要的信息使得评估者能够轻松地根据描述进行功能点评价
ILFEIFEIEOEQ的数量可以明确且无误地计算出来没有遗漏或模糊之处
描述中不仅包含了功能的正常操作还涵盖了异常处理边界条件等特殊情况
使用了具体的例子流程图或伪代码来进一步阐明功能
良好70~90不包含90分
功能描述相对详细提供了足够的信息来进行功能点评价
ILFEIFEIEOEQ的数量可以大致计算出来但可能需要一些额外的解释或澄清
描述中基本涵盖了功能的各个方面但对某些细节或特殊情况可能描述不够充分
整体而言描述是清晰和准确的但还有改进的空间
一般60~70不包含70分
功能描述较为笼统缺乏具体的细节
ILFEIFEIEOEQ的数量计算可能存在一定的困难或不确定性需要较多的假设或推测
描述中只涵盖了功能的主要方面对细节和特殊情况的处理描述不足
可能需要额外的沟通或澄清才能准确理解功能需求
60分以下不包含60分
功能描述非常模糊缺乏必要的信息和细节
无法根据描述进行准确的功能点评价ILFEIFEIEOEQ的数量无法确定
描述中可能只提到了功能的大致目标或意图没有具体的实现细节或操作步骤
需要大量的额外信息或澄清才能理解功能需求甚至可能需要重新编写功能描述
根据这些评判标准对下面的软件功能描述的详细性进行客观的评价给出优秀良好一般差四个等级之一的评价并在此基础上润色和完善使之达到优秀的等级
""",
}
weight = {
"清晰性" : 0.4,
"完整性" : 0.3,
"可测试性" : 0.2,
"详细性" : 0.1,
}
findTitleName_llm_cfg = {
'model': "qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1',
}
findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, name='Assistant')
for i in words:
count += 1
yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}"
chapter, rest = i.split('-', 1)
title, text = rest.split('', 1)
# 生成字典
example = {
"chapter": chapter.strip(),
"title": title.strip(),
"text": text.strip()
}
result = {
"title": title.strip(),
"text": text.strip()
}
# 循环提取键和值
weighted_score = 0
for key, value in standard.items():
prompt_score = f"""对软件功能{key}的定义:
{value}
模块名称{example['title']}
模块描述{example['text']}
回答格式为{{"模块名称""{example['text']}",
"等级":"优秀/良好/一般/差",
"得分":"0~100",
"理由及扣分原因":"理由及扣分原因",
}}不做过多的解释,严格按回答格式作答,只给出一个回答
"""
messages = [({'role': 'user', 'content': prompt_score})]
runList = []
for rsp in findTitleName_bot.run(messages):
runList.append(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
if isinstance(parsed_data, list): # 检查parsed_data是否为列表
parsed_data = parsed_data[0] # 取第一个元素
else:
parsed_data = parsed_data
result[f"{key}等级"] = parsed_data['等级']
result[f"{key}得分"] = parsed_data['得分']
score = int(parsed_data['得分']) # 假设 '得分' 是字符串,需要转换为整数
key_weight = weight.get(key, 0) # 根据键获取权重,如果没有匹配的权重,默认为 0
# 计算加权得分并累加
weighted_score += score * key_weight
result["加权得分"] = round(weighted_score, 2) # 保留两位小数
answer = f"{example['text']}"
for key, value in standard.items():
prompt_answer = f"""对软件功能{key}的定义:\n
{value}\n
模块名称{example['title']}\n
模块描述f{answer}\n
回答格式为{{"模块名称""{example['text']}",
"改进后的描述":"改进后的描述",
}}不做过多的解释,严格按回答格式作答
"""
messages = [({'role': 'user', 'content': prompt_answer})]
runList = []
for rsp in findTitleName_bot.run(messages):
runList.append(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
answer = parsed_data['改进后的描述']
result["改进后的描述"] = answer
textTag = i.split("")[0]
breakpoint()
# vectorstore.delete(ids=uuids)
shutil.rmtree(vector_store_path)
resInfo=f"{titleName}章节,发现相似内容:<br>"
if(len(reslist)>0):
for res in reslist:
resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find('')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find('') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find('')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find('') + 1:]+"<br>以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】<br>"
yield resInfo
else:
yield "**未发现相似内容**"
userLog.info("文档相似性检查----未发现相似内容**")
# 创建一个记录器 for i in checkRepeatText("./北仑区综合行政执法局协同监管系统项目建设方案_20240824.docx"):
logger = logging.getLogger('my_logger') print(i)
logger.setLevel(logging.DEBUG)
# 创建一个处理器
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# 创建一个格式化器并将其添加到处理器中
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# 将处理器添加到记录器中
logger.addHandler(ch)
try:
# 记录一些日志消息
logger.debug('这是一个调试消息')
logger.info('这是一个信息消息')
logger.warning('这是一个警告消息')
logger.error('这是一个错误消息')
logger.critical('这是一个致命错误消息')
except Exception as e:
logger.warning(e)

408
main.py

@ -1,206 +1,286 @@
from flask import Flask, request, jsonify, Response # from flask import Flask, request, jsonify, Response
import os import os
from checkPlaceName import checkPlaceName from checkPlaceName import checkPlaceName
from checkRepeatText import checkRepeatText from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName from checkCompanyName import checkCompanyName
from checkDocumentError import checkDocumentError from checkDocumentError import checkDocumentError
from checkTitleName import checkTitleName from checkTitleName import checkTitleName
from flask_cors import CORS # from flask_cors import CORS
import qwen_agenttext import qwen_agenttext
from myLogger import outLog from myLogger import outLog
import time import time
app = Flask(__name__) # app = Flask(__name__)
cros = CORS(app) # cros = CORS(app)
import uvicorn
from fastapi import FastAPI, Request, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from sse_starlette.sse import EventSourceResponse
import asyncio
app = FastAPI()
# 允许所有来源的跨域请求
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"]
)
UPLOAD_FOLDER = 'uploads' UPLOAD_FOLDER = 'uploads'
if not os.path.exists(UPLOAD_FOLDER): if not os.path.exists(UPLOAD_FOLDER):
os.makedirs(UPLOAD_FOLDER) os.makedirs(UPLOAD_FOLDER)
@app.route('/upload', methods=['POST']) # @app.route('/upload', methods=['POST'])
def upload_file(): # def upload_file():
if 'file' not in request.files: # if 'file' not in request.files:
return jsonify({"error": "No file part"}), 400 # return jsonify({"error": "No file part"}), 400
file = request.files['file'] # file = request.files['file']
if file.filename == '': # if file.filename == '':
return jsonify({"error": "No selected file"}), 400 # return jsonify({"error": "No selected file"}), 400
if file: # if file:
filename = file.filename # filename = file.filename
file.save(os.path.join(UPLOAD_FOLDER, filename)) # file.save(os.path.join(UPLOAD_FOLDER, filename))
return jsonify({"message": "File uploaded successfully"}), 200 # return jsonify({"message": "File uploaded successfully"}), 200
@app.post("/sse/upload")
async def upload_file(file: UploadFile = File(...)):
@app.route('/stream', methods=["GET", "POST"]) if not file.filename:
def stream_numbers(): raise HTTPException(status_code=400, detail="No selected file")
context = request.args.get('context')
# def generate_numbers(): # 保存文件
# event_id=0 try:
# for number in range(1, 10): file_location = os.path.join(UPLOAD_FOLDER, file.filename)
# json_data = json.dumps({"number": number}) with open(file_location, "wb") as f:
# print(json_data) content = await file.read()
# event_id += 1 f.write(content)
# yield f"id: {event_id}\n" return JSONResponse(content={"message": "文件上传成功"}, status_code=200)
# yield f"event: time-update\n" except Exception as e:
# yield f"data: {json_data}\n\n" # 每次生成一个数字就发送 raise HTTPException(status_code=500, detail="文件上传失败,错误信息:" + str(e))
# time.sleep(0.5) # 为了演示,加入短暂延迟
# json_data = json.dumps({"number": "done"})
# yield f"id: {1}\n" @app.get("/sse")
# yield f"event: time-update\n" async def root(request: Request):
# yield f"data: {json_data}\n\n" # 发送完成信号 async def event_generator(request: Request):
res_str = "七夕情人节即将来临,我们为您准备了精美的鲜花和美味的蛋糕"
headers = { for i in res_str:
"Content-Type": "text/event-stream", if await request.is_disconnected():
"Cache-Control": "no-cache", print("连接已中断")
"X-Accel-Buffering": "no", break
"Access-Control-Allow-Origin": "*", yield {
"Access-Control-Allow-Methods": "GET,POST", "event": "message",
"Access-Control-Allow-Headers": "x-requested-with,content-type", "id": "7",
"data": f"{i}"
} }
return Response(qwen_agenttext.getxinx(context), headers=headers)
await asyncio.sleep(0.1)
g = event_generator(request)
return EventSourceResponse(g)
@app.route('/sse/checkRepeatText', methods=['GET']) # def stream_numbers():
def checkRepeatTextWeb(): # context = request.args.get('context')
filename = request.args.get('filename') # # def generate_numbers():
userId = request.args.get("userId") # # event_id=0
# # for number in range(1, 10):
# # json_data = json.dumps({"number": number})
# # print(json_data)
# # event_id += 1
# # yield f"id: {event_id}\n"
# # yield f"event: time-update\n"
# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送
# # time.sleep(0.5) # 为了演示,加入短暂延迟
# # json_data = json.dumps({"number": "done"})
# # yield f"id: {1}\n"
# # yield f"event: time-update\n"
# # yield f"data: {json_data}\n\n" # 发送完成信号
def generate_checkRepeatText(filename,userId): # headers = {
# "Content-Type": "text/event-stream",
# "Cache-Control": "no-cache",
# "X-Accel-Buffering": "no",
# "Access-Control-Allow-Origin": "*",
# "Access-Control-Allow-Methods": "GET,POST",
# "Access-Control-Allow-Headers": "x-requested-with,content-type",
# }
# return Response(qwen_agenttext.getxinx(context), headers=headers)
@app.get("/sse/checkRepeatText")
async def checkRepeatTextWeb(filename, userId, request: Request):
async def generate_checkRepeatText(filename, userId, request: Request):
global outLog
id = 0 id = 0
for i in checkRepeatText(filename,userId): for i in checkRepeatText(filename, userId, outLog):
yield f"id: {id + 1}\n" id += 1
yield f"event: checkRepeatText\n" if await request.is_disconnected():
yield f"data: {i}\n\n" # 发送完成信号 yield {
# except Exception as e: "id": f"{id}",
"event": "checkRepeatText",
# yield f"id: {id+1}\n" "data": "checkRepeatText连接已中断"
# yield f"event: checkRepeatText\n" }
# yield f"data: **程序出现异常**\n\n" # 发送完成信号 break
yield {
headers = { "id": f"{id}",
"Content-Type": "text/event-stream", "event": "checkRepeatText",
"Cache-Control": "no-cache", "data": i
"X-Accel-Buffering": "no",
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
} }
return Response(generate_checkRepeatText(filename,userId), headers=headers)
g = generate_checkRepeatText(filename, userId, request)
return EventSourceResponse(g)
@app.route('/sse/checkPlaceName', methods=['GET']) @app.get('/sse/checkPlaceName')
def checkPlaceNameWebSse(): def checkPlaceNameWebSse(filename, userId, request: Request):
filename = request.args.get('filename') async def generate_checkPlaceName(filename, userId, request: Request):
userId = request.args.get("userId")
def generate_checkPlaceName(filename,userId):
id = 0 id = 0
for i in checkPlaceName(filename,userId): global outLog
yield f"id: {id + 1}\n" for i in checkPlaceName(filename, userId, outLog):
yield f"event: checkPlaceName\n" id += 1
yield f"data: {i}\n\n" # 发送完成信号 if await request.is_disconnected():
yield {
headers = { "id": f"{id}",
"Content-Type": "text/event-stream", "event": "checkPlaceName",
"Cache-Control": "no-cache", "data": "checkPlaceName连接已中断"
"X-Accel-Buffering": "no", }
"Access-Control-Allow-Origin": "*", break
"Access-Control-Allow-Methods": "GET,POST", yield {
"Access-Control-Allow-Headers": "x-requested-with,content-type", "id": f"{id}",
"event": "checkPlaceName",
"data": i
} }
return Response(generate_checkPlaceName(filename,userId), headers=headers)
g = generate_checkPlaceName(filename, userId, request)
return EventSourceResponse(g)
@app.route('/sse/checkCompanyName', methods=['GET'])
def checkCompanyNameWebSse(): @app.get('/sse/checkCompanyName')
filename = request.args.get('filename') def checkCompanyNameWebSse(filename, userId, request: Request):
userId = request.args.get("userId") async def generate_checkCompanyName(filename, userId, request: Request):
def generate_checkCompanyName(filename,userId):
id = 0 id = 0
for i in checkCompanyName(filename,userId): global outLog
yield f"id: {id + 1}\n" for i in checkCompanyName(filename, userId, outLog):
yield f"event: checkCompanyName\n" id += 1
yield f"data: {i}\n\n" # 发送完成信号 if await request.is_disconnected():
yield {
headers = { "id": f"{id}",
"Content-Type": "text/event-stream", "event": "checkCompanyName",
"Cache-Control": "no-cache", "data": "checkCompanyName连接已中断"
"X-Accel-Buffering": "no", }
"Access-Control-Allow-Origin": "*", break
"Access-Control-Allow-Methods": "GET,POST", yield {
"Access-Control-Allow-Headers": "x-requested-with,content-type", "id": f"{id}",
"event": "checkCompanyName",
"data": i
} }
return Response(generate_checkCompanyName(filename,userId), headers=headers)
g = generate_checkCompanyName(filename, userId, request)
return EventSourceResponse(g)
@app.route('/sse/checkDocumentErrorWeb', methods=['GET']) @app.get('/sse/checkDocumentErrorWeb')
def checkDocumentErrorWebSse(): def checkDocumentErrorWebSse(filename, userId, request: Request):
filename = request.args.get('filename') async def generate_checkDocumentError(filename, userId, request: Request):
userId = request.args.get("userId")
def generate_checkDocumentError(filename,userId):
id = 0 id = 0
for i in checkDocumentError(filename,userId): global outLog
yield f"id: {id + 1}\n" for i in checkDocumentError(filename, userId, outLog):
yield f"event: checkDocumentError\n" id += 1
yield f"data: {i}\n\n" # 发送完成信号 if await request.is_disconnected():
yield {
headers = { "id": f"{id}",
"Content-Type": "text/event-stream", "event": "checkDocumentError",
"Cache-Control": "no-cache", "data": "checkDocumentError连接已中断"
"X-Accel-Buffering": "no", }
"Access-Control-Allow-Origin": "*", break
"Access-Control-Allow-Methods": "GET,POST", yield {
"Access-Control-Allow-Headers": "x-requested-with,content-type", "id": f"{id}",
"event": "checkDocumentError",
"data": i
} }
return Response(generate_checkDocumentError(filename,userId), headers=headers)
g = generate_checkDocumentError(filename, userId, request)
return EventSourceResponse(g)
@app.route('/sse/checkTitleName', methods=['GET'])
def checkTitleNameWebSse(): @app.get('/sse/checkTitleName')
filename = request.args.get('filename') def checkTitleNameWebSse(filename, userId, request: Request):
userId = request.args.get("userId") async def generate_checkTitleName(filename, userId, request: Request):
def generate_checkTitleName(filename,userId):
id = 0 id = 0
for i in checkTitleName(filename,userId): global outLog
yield f"id: {id + 1}\n" for i in checkTitleName(filename, userId, outLog):
yield f"event: checkTitleName\n" id += 1
yield f"data: {i}\n\n" # 发送完成信号 if await request.is_disconnected():
yield {
headers = { "id": f"{id}",
"Content-Type": "text/event-stream", "event": "checkTitleName",
"Cache-Control": "no-cache", "data": "checkTitleName连接已中断"
"X-Accel-Buffering": "no", }
"Access-Control-Allow-Origin": "*", break
"Access-Control-Allow-Methods": "GET,POST", yield {
"Access-Control-Allow-Headers": "x-requested-with,content-type", "id": f"{id}",
"event": "checkTitleName",
"data": i
} }
return Response(generate_checkTitleName(filename,userId), headers=headers)
@app.route('/sse/getLog', methods=['GET']) g = generate_checkTitleName(filename, userId, request)
def getlog(): return EventSourceResponse(g)
userId = request.args.get("userId")
def generate_getLog(userId):
time.sleep(1) @app.get("/sse/getLog")
# @app.route('/sse/getLog', methods=['GET'])
async def getlog(userId, request: Request):
# userId = request.args.get("userId")
async def generate_getLog(userId):
id = 0 id = 0
global outLog
await asyncio.sleep(5)
while True: while True:
if outLog.is_done(userId): isbreak = outLog.is_done(userId)
if isbreak:
break # 完成了
text = outLog.get_queueData(userId)
if await request.is_disconnected():
yield {
"id": f"{id}",
"event": "checkTitleName",
"data": "checkTitleName连接已中断"
}
break break
q = outLog.get_queueData(userId) if text:
if q: id += 1
id+=1 yield {
text = q.pop(0) "id": id,
yield f"id: {id}\n" "event": "getlog",
yield f"event: getlog\n" "data": text
yield f"data: {text}\n\n" # 发送完成信号 }
yield f"id: {id}\n" # yield f"id: {id}\n"
yield f"event: getlog\n" # yield f"event: getlog\n"
yield f"data: 任务结束!!!!!\n\n" # 发送完成信号 # yield f"data: {text}\n\n" # 发送完成信号
outLog.del_queue(userId) # yield f"id: {id}\n"
headers = { # yield f"event: getlog\n"
"Content-Type": "text/event-stream", # yield f"data: 任务结束!!!!!\n\n" # 发送完成信号
"Cache-Control": "no-cache", yield {
"X-Accel-Buffering": "no", "id": id,
"Access-Control-Allow-Origin": "*", "event": "getlog",
"Access-Control-Allow-Methods": "GET,POST", "data": "任务结束!!!!"
"Access-Control-Allow-Headers": "x-requested-with,content-type",
} }
return Response(generate_getLog(userId), headers=headers) outLog.del_queue(userId)
# headers = {
# "Content-Type": "text/event-stream",
# "Cache-Control": "no-cache",
# "X-Accel-Buffering": "no",
# "Access-Control-Allow-Origin": "*",
# "Access-Control-Allow-Methods": "GET,POST",
# "Access-Control-Allow-Headers": "x-requested-with,content-type",
# }
g = generate_getLog(userId)
return EventSourceResponse(g)
# return Response(generate_getLog(userId), headers=headers)
if __name__ == '__main__': if __name__ == '__main__':
app.run(host="0.0.0.0", port=80) # app.run(host="0.0.0.0", port=80,threaded=True)
# uvicorn.run(app='main:app', host="0.0.0.0", port=80,workers=1)
app.run()

169
myLogger.py

@ -1,117 +1,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""
@author: bingyl123@163.com
@version: 1.0.0
@file: OutLog.py
@time: 2023/2/23 20:25
"""
# import logging
# import logging.config
# import re
# import datetime
# import queue
#
#
# class OutLog:
# _instance = None
# logger = None
#
# def __new__(cls):
# if cls._instance is None:
# cls._instance = super(OutLog, cls).__new__(cls)
# cls.logger = logging.getLogger("app") # 默认logger名称为"app"
# cls._instance.queue_dict = {}
# cls._instance.done_dict = {}
# return cls._instance
#
# def get_queue(self, user_id):
# if user_id not in self.queue_dict:
# self.queue_dict[user_id] = []
# self.done_dict[user_id] = {} # 初始化为未完成的字典
# return self.queue_dict[user_id]
#
# def mark_done(self, user_id, producer_name):
# self.done_dict[user_id][producer_name] = True
#
# def is_done(self, user_id):
# return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成
# @staticmethod
# def put(item: str, level="INFO"):
# dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# mq.put(f"{dtf}[{level}]: {item}")
#
# @staticmethod
# def debug(item, log=True):
# OutLog.put(item, level="DEBUG")
# if log:
# OutLog._instance.logger.debug(item)
#
# @staticmethod
# def info(item, log=True):
# OutLog.put(item, level="INFO")
# if log:
# OutLog._instance.logger.info(item)
#
# @staticmethod
# def warning(item, log=True):
# OutLog.put(item, level="WARNING")
# if log:
# OutLog._instance.logger.warning(item)
#
# @staticmethod
# def error(item, log=True):
# OutLog.put(item, level="ERROR")
# if log:
# OutLog._instance.logger.error(item)
#
# @staticmethod
# def critical(item, log=True):
# OutLog.put(item, level="CRITICAL")
# if log:
# OutLog._instance.logger.critical(item)
#
#
#
# # 日志配置
# log_config = {
# 'version': 1,
# 'disable_existing_loggers': False,
# 'formatters': {
# 'standard': {
# 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
# },
# },
# 'handlers': {
# 'console': {
# 'class': 'logging.StreamHandler',
# 'formatter': 'standard',
# 'level': logging.INFO,
# },
# 'file': {
# 'class': 'logging.FileHandler',
# 'filename': 'Logger.log',
# 'formatter': 'standard',
# 'level': logging.WARNING,
# },
# },
# 'loggers': {
# '': {
# 'handlers': ['console', 'file'],
# 'level': logging.WARNING,
# 'propagate': True,
# },
# }
# }
#
# logging.config.dictConfig(log_config)
#
# outLog = OutLog() # 获取单例实例
import logging import logging
import logging.config import logging.config
import datetime import datetime
import redis
class OutLog: class OutLog:
_instance = None _instance = None
@ -121,35 +12,49 @@ class OutLog:
if cls._instance is None: if cls._instance is None:
cls._instance = super(OutLog, cls).__new__(cls) cls._instance = super(OutLog, cls).__new__(cls)
cls.logger = logging.getLogger("app") # 默认logger名称为"app" cls.logger = logging.getLogger("app") # 默认logger名称为"app"
cls._instance.queue_dict = {} # cls._instance.queue_dict = {}
cls._instance.done_dict = {} # cls._instance.done_dict = {}
# 初始化 Redis 连接
cls._instance.redis_client = redis.StrictRedis(host='localhost', port=6379, password="root",db=0, decode_responses=True)
return cls._instance return cls._instance
def get_queue(self, user_id,producer_name): def get_queue(self,user_id,producer_name):
if user_id not in self.queue_dict: # if user_id not in self.queue_dict:
self.queue_dict[user_id] = [] # self.queue_dict[user_id] = []
self.done_dict[user_id] = {} # 初始化为未完成的字典 # self.done_dict[user_id]={}
if user_id not in self.done_dict: # self.done_dict[user_id][producer_name] = False # 初始化为未完成的字典
self.done_dict[user_id][producer_name] = False # 使用 Redis 进行存储和查询
if not self.redis_client.exists(f"queue:{user_id}"):
# self.redis_client.rpush(f"queue:{user_id}")
self.logger.info(f"queue:{user_id}")
self.redis_client.hset(f"done:{user_id}", producer_name, "0") # 初始化为未完成
return self.UserLogger(user_id) return self.UserLogger(user_id)
def get_queueData(self, user_id): def get_queueData(self, user_id):
if user_id in self.queue_dict: # if user_id in self.queue_dict:
return OutLog._instance.queue_dict[self.user_id] # return self.queue_dict[user_id]
if self.redis_client.exists(f"queue:{user_id}"):
return self.redis_client.lpop(f"queue:{user_id}") # 获取队列首个并删除数据
def del_queue(self,user_id): def del_queue(self,user_id):
# if self.is_done(user_id):
# del self.queue_dict[user_id]
# del self.done_dict[user_id]
if self.is_done(user_id): if self.is_done(user_id):
del self.queue_dict[user_id] self.redis_client.delete(f"queue:{user_id}")
del self.done_dict[user_id] self.redis_client.delete(f"done:{user_id}")
class UserLogger: class UserLogger:
def __init__(self, user_id): def __init__(self, user_id):
self.user_id = user_id self.user_id = user_id
self.logger = OutLog._instance.logger self.logger = OutLog._instance.logger
def log(self, item: str, level: str): def log(self, item: str, level: str):
self._log_to_logger(item, level)
if(level != "INFO"):
return
dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
log_entry = f"{dtf}[{level}]: {item}" log_entry = f"{dtf}[{level}]: {item}"
OutLog._instance.queue_dict[self.user_id].append(log_entry) # 保存到对应用户的队列 # print(log_entry)
self._log_to_logger(item, level) # OutLog._instance.queue_dict[self.user_id].append(log_entry) # 保存到对应用户的队列
OutLog._instance.redis_client.rpush(f"queue:{self.user_id}", log_entry) # 保存到对应用户的队列
def _log_to_logger(self, item: str, level: str): def _log_to_logger(self, item: str, level: str):
if level == "DEBUG": if level == "DEBUG":
self.logger.debug(item) self.logger.debug(item)
@ -177,11 +82,17 @@ class OutLog:
def critical(self, item: str): def critical(self, item: str):
self.log(item, "CRITICAL") self.log(item, "CRITICAL")
# def mark_done(self, user_id, producer_name):
# self.done_dict[user_id][producer_name] = True
# def is_done(self, user_id):
# # print(self.done_dict.get(user_id, {}),self.done_dict.get(user_id, {}).values())
# return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成
def mark_done(self, user_id, producer_name): def mark_done(self, user_id, producer_name):
self.done_dict[user_id][producer_name] = True self.redis_client.hset(f"done:{user_id}", producer_name, "1")
def is_done(self, user_id): def is_done(self, user_id):
return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成 done_dict = self.redis_client.hgetall(f"done:{user_id}")
return all(value == "1" for value in done_dict.values()) if done_dict else False # 检查所有生产者是否完成
# 日志配置 # 日志配置
@ -203,13 +114,13 @@ log_config = {
'class': 'logging.FileHandler', 'class': 'logging.FileHandler',
'filename': 'Logger.log', 'filename': 'Logger.log',
'formatter': 'standard', 'formatter': 'standard',
'level': logging.WARNING, 'level': logging.INFO,
}, },
}, },
'loggers': { 'loggers': {
'': { '': {
'handlers': ['console', 'file'], 'handlers': ['console', 'file'],
'level': logging.WARNING, 'level': logging.INFO,
'propagate': True, 'propagate': True,
}, },
} }

Loading…
Cancel
Save