diff --git a/UserQueue.py b/UserQueue.py
new file mode 100644
index 0000000..e69de29
diff --git a/checkCompanyName.py b/checkCompanyName.py
index ea80b47..4d2f1fd 100644
--- a/checkCompanyName.py
+++ b/checkCompanyName.py
@@ -1,14 +1,15 @@
# -*- coding:utf-8 -*-
-import time
-from docx import Document
-from paddlenlp import Taskflow
+from docx import Document
from qwen_agent.agents import Assistant
import re
import json_repair
+import json
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
-
+import requests
+from myLogger import outLog
+import time
def load_from_xml_v2(baseURI, rels_item_xml):
"""
@@ -28,51 +29,18 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
-
import logging
-import logging.config
-log_config = {
- 'version': 1,
- 'disable_existing_loggers': False,
- 'formatters': {
- 'standard': {
- 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- },
- },
- 'handlers': {
- 'console': {
- 'class': 'logging.StreamHandler',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- 'file': {
- 'class': 'logging.FileHandler',
- 'filename': 'Logger.log',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- },
- 'loggers': {
- '': {
- 'handlers': ['console', 'file'],
- 'level': logging.INFO,
- 'propagate': True,
- },
- }
-}
-
-logging.config.dictConfig(log_config)
-
-logger = logging.getLogger("checkCompanyName")
-prompt = '''
+outLog.logger = logging.getLogger("checkCompanyName")
+userLog=None
+prompt ='''
.根据上述文本判断,是否为具体的公司或组织名称,你可以使用工具利用互联网查询,
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校,行业类型,其他]选项中选择答案,
回答格式[{“companyName”:“名称”,"回答":"答案"},{“companyName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
'''
llm_cfg = {
- #'model': 'qwen1.5-72b-chat',
- 'model':"qwen2-72b",
+ # 'model': 'qwen1.5-72b-chat',
+ 'model': "qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
@@ -81,32 +49,43 @@ bot = Assistant(llm=llm_cfg,
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"
)
+
def getDocxToTextAll(name):
- docxPath=name
- document = Document(docxPath)
+ docxPath = name
+ loopCount = 0
+ lastError = None
+ # Try to open the document at most 14 times, sleeping 1 s after each failure.
+ while True:
+ loopCount += 1
+ if loopCount >= 15:
+ # Chain the last failure so the real cause is kept on the raised exception.
+ raise Exception("文档读取超时,或文档存在问题无法读取") from lastError
+ try:
+ document = Document(docxPath)
+ break
+ except Exception as e:
+ lastError = e
+ time.sleep(1)
# 逐段读取docx文档的内容
- levelList=[]
- words=[]
- addStart = False
- levelText=""
+ words = []
i = 0
for paragraph in document.paragraphs:
# 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法
text = paragraph.text
- if text.strip():#非空判断
+ if text.strip(): # 非空判断
# print("非空")
words.append(text)
# 将所有段落文本拼接成一个字符串,并用换行符分隔
text = '\n'.join(words)
-
+ # userLog.info("checkCompanyName----保存文件")
# 将文本写入txt文件
with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
txt_file.write(text)
+
+
def companyNameTask(text):
yield "文档公司或组织名称检查---启动中...."
- wordtag = Taskflow("knowledge_mining",device_id=0)
- batchNum=20
+ userLog.info("checkCompanyName----启动中....")
+ batchNum = 20
sentences = re.split(r'[。\n]', text)
# 去掉空字符
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
@@ -122,53 +101,71 @@ def companyNameTask(text):
# 打印每一份的内容
for i, chunk in enumerate(chunks):
yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"
-
- wenBen=".".join(chunk)
+ userLog.info(f"checkCompanyName----文档解析进度:{i + 1}/{num_chunks}")
try:
- res = wordtag(wenBen)
+ wenBen = ".".join(chunk)
+ url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
+ headers = {"Content-Type": "application/json"}
+ data = {
+ "data": {
+ "text": wenBen,
+ }
+ }
+ r = requests.post(url=url, headers=headers, data=json.dumps(data))
+ res = json.loads(r.text)
+ # userLog.info(res)
+ # print(res)
except Exception as e:
- logging.warning(chunk)
- logging.warning("文档公司或组织名称检查---词类分析出错",e)
- continue
+ # Skip only this batch. Ending the generator here would mean the final
+ # placeList is never yielded, and the caller joins that list after the loop.
+ userLog.warning(chunk)
+ userLog.warning("文档公司或组织名称检查---词类分析出错")
+ userLog.warning(e)
+ continue
isplace = False
- for zuhe in res[0]['items']:
+ for zuhe in res["result"]:
# 上一个的地名,这一个还是地名,就和上一个相加代替这个
- zhi = zuhe.get("wordtag_label")
if isplace:
name = placeList[len(placeList) - 1]
- if zhi.find("组织机构类") >= 0: # or zuhe[1] == "ns"
+ if zuhe[1].find("组织机构类") >= 0: # or zuhe[1] == "ns"
isplace = True
- new_text = zuhe['item'].replace("\n", "")
+ new_text = zuhe[0].replace("\n", "")
placeList[len(placeList) - 1] = name + new_text
continue
- if zhi.find("组织机构类") >= 0:
+ if zuhe[1].find("组织机构类") >= 0:
isplace = True
- new_text = zuhe['item'].replace("\n", "")
+ new_text = zuhe[0].replace("\n", "")
placeList.append(new_text)
else:
isplace = False
# 打印总份数
yield "文档公司或组织名称检查---文档解析完成"
- placeList=list(dict.fromkeys(placeList))
+ userLog.info("checkCompanyName----文档解析完成")
+ placeList = list(dict.fromkeys(placeList))
yield placeList
-def checkCompanyName(filename):
+ userLog.info(placeList)
+
+def checkCompanyName(filename,user_id):
yield f"文档公司或组织名称检查---开始处理文档..."
+ global userLog
+ userLog=outLog.get_queue(user_id, "checkCompanyName")
try:
getDocxToTextAll(filename)
except Exception as e:
- logging.warning(e)
+ userLog.warning(e)
+ userLog.warning("文档公司或组织名称检查---文档无法打开,请检查文档内容")
yield "文档公司或组织名称检查---文档无法打开,请检查文档内容"
+ outLog.mark_done(user_id, "checkCompanyName")
return
with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
gettext = f.read()
yield f"文档公司或组织名称检查---开始解析文档..." # 每次生成一个数字就发送
+ userLog.info("checkCompanyName----开始解析文档...")
for item in companyNameTask(gettext):
if isinstance(item, str):
yield item
else:
final_list = item # 获取最终结果
propnStr = ",".join(final_list)
- messages = [{'role': 'user', 'content': [{'text': propnStr+prompt}]}]
+ messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
runList = []
yield f"文档公司或组织名称检查---结果生成中..." # 每次生成一个数字就发送
cishu = 0
@@ -177,29 +174,34 @@ def checkCompanyName(filename):
if cishu > 3:
cishu = 0
yield "文档公司或组织名称检查---结果生成中" + '.' * cishu
+ userLog.info(f"checkCompanyName----结果生成中" + '.' * cishu)
cishu += 1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
- error_places=[]
+ error_places = []
+
for place in parsed_data:
try:
if place['回答'] == '非泛化的公司或组织名称':
error_places.append(place)
except Exception as e:
- logging.warning(place)
- logging.warning("文档公司或组织名称检查---组织提出出错",e)
+ userLog.warning(place)
+ userLog.warning(e)
+ userLog.warning("文档公司或组织名称检查---组织提出出错")
continue
- logging.info(error_places)
+ userLog.info(error_places)
returnInfo = "发现异常公司或组织名称
"
- if len(error_places)>0:
+ if len(error_places) > 0:
for t in error_places:
- keyword= t['companyName'].replace("\n","")
- # 查找包含关键字的段落
+ keyword = t['companyName'].replace("\n", "")
+ # 查找包含关键字的段落
paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
- t["yuanwen"]=paragraphs[0]
- yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n","")
+ t["yuanwen"] = paragraphs[0]
+ yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
returnInfo += "原文:" + yuanwen + "
异常公司或组织名称:**" + keyword + "**!请注意" + "
"
- logging.info(returnInfo)
+ userLog.info(returnInfo)
yield returnInfo
else:
- yield "**未发现异常公司或组织名称**
"
\ No newline at end of file
+ yield "**未发现异常公司或组织名称**
"
+ userLog.info("**未发现异常公司或组织名称**
")
+ outLog.mark_done(user_id, "checkCompanyName")
\ No newline at end of file
diff --git a/checkDocumentError.py b/checkDocumentError.py
index 2f4614b..33d7ed4 100644
--- a/checkDocumentError.py
+++ b/checkDocumentError.py
@@ -1,19 +1,15 @@
# -*- coding:utf-8 -*-
-# from pycorrector import MacBertCorrector
-# m = MacBertCorrector("shibing624/macbert4csc-base-chinese")
from qwen_agent.agents import Assistant
from docx import Document
-from pprint import pprint
import re
-from paddlenlp import Taskflow
import json
-import time
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
-
-import asyncio
+import requests
+from myLogger import outLog
+import time
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
@@ -32,41 +28,9 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
-import logging.config
-
-log_config = {
- 'version': 1,
- 'disable_existing_loggers': False,
- 'formatters': {
- 'standard': {
- 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- },
- },
- 'handlers': {
- 'console': {
- 'class': 'logging.StreamHandler',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- 'file': {
- 'class': 'logging.FileHandler',
- 'filename': 'Logger.log',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- },
- 'loggers': {
- '': {
- 'handlers': ['console', 'file'],
- 'level': logging.INFO,
- 'propagate': True,
- },
- }
-}
-logging.config.dictConfig(log_config)
-
-logger = logging.getLogger("checkDocumentError")
+outLog.logger = logging.getLogger("checkDocumentError")
+userLog=None
llm_cfg = {
# 'model': 'qwen1.5-72b-chat',
'model': "qwen2-72b",
@@ -83,20 +47,28 @@ bot = Assistant(llm=llm_cfg,
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# '''
prompt = '''
-请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出解析,没有错则不用给解析
+请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出详细的解析,没有错则不用给解析
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}],不做过多的解释,严格按回答格式作答;
'''
def getDocxToTextAll(name):
+ userLog.info("checkDocumentError----打开文档")
docxPath = name
- document = Document(docxPath)
+ loopCount = 0
+ lastError = None
+ # Try to open the document at most 14 times, sleeping 1 s after each failure.
+ while True:
+ loopCount += 1
+ if loopCount >= 15:
+ # Chain the last failure so the real cause is kept on the raised exception.
+ raise Exception("文档读取超时,或文档存在问题无法读取") from lastError
+ try:
+ document = Document(docxPath)
+ break
+ except Exception as e:
+ lastError = e
+ time.sleep(1)
# 逐段读取docx文档的内容
- levelList = []
words = []
- addStart = False
- levelText = ""
- i = 0
for paragraph in document.paragraphs:
# 判断该段落的标题级别
# 这里用isTitle()临时代表,具体见下文介绍的方法
@@ -112,17 +84,23 @@ def getDocxToTextAll(name):
txt_file.write(text)
-def getDocumentError(filename):
+def checkDocumentError(filename,user_id):
+ global userLog
+ userLog=outLog.get_queue(user_id,"checkDocumentError")
yield f"文档纠错---开始处理文档..."
+ userLog.info("checkDocumentError----开始处理文档...")
try:
getDocxToTextAll(filename)
except Exception as e:
- logger.warning(e)
- yield "文档无法打开,请检查文档内容"
+ userLog.warning(e)
+ userLog.warning("文档纠错----文档无法打开,请检查文档内容")
+ yield "文档纠错----文档无法打开,请检查文档内容"
+ outLog.mark_done(user_id, "checkDocumentError")
return
with open("checkDocumentError.txt", "r", encoding='utf-8') as f:
gettext = f.read()
yield f"文档纠错---开始解析文档..." # 每次生成一个数字就发送
+ userLog.info("checkDocumentError----开始解析文档...")
final_list = []
for item in documentErrorTask(gettext):
if isinstance(item, str):
@@ -135,10 +113,13 @@ def getDocumentError(filename):
yuanwen = i["placeName"].replace("\n", "")
jianyi = i["jianyi"].replace("\n", "")
resInfo += "原文:" + yuanwen + "
建议:**" + jianyi + "**
"
+ userLog.info(resInfo)
yield resInfo
- logger.info(resInfo)
+
else:
yield "**未发现错别字**"
+ userLog.info("未发现错别字")
+ outLog.mark_done(user_id,"checkDocumentError")
def documentErrorTask(text):
@@ -149,7 +130,7 @@ def documentErrorTask(text):
:return: 生成器,每次返回一批文本
"""
yield "文档纠错---启动中...."
- corrector = Taskflow("text_correction", device_id=1)
+ userLog.info("checkDocumentError----启动中....")
batchNum = 20
sentences = re.split(r'[。\n]', text)
# 去掉空字符
@@ -162,18 +143,27 @@ def documentErrorTask(text):
# 按batchNum字为一份进行处理
chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
- placeList = []
# 打印每一份的内容
err = []
for i, chunk in enumerate(chunks):
yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
+ userLog.info(f"checkDocumentError----文档解析进度:{i + 1}/{num_chunks}")
try:
- res = corrector(chunk)
+ url = "http://0.0.0.0:8190/taskflow/checkDocumentError"
+ headers = {"Content-Type": "application/json"}
+ data = {
+ "data": {
+ "text": chunk,
+ }
+ }
+ r = requests.post(url=url, headers=headers, data=json.dumps(data))
+ res = json.loads(r.text)
+ # print(res)
except Exception as e:
- logger.warning(chunk)
- logger.warning("文档纠错--错别字识别出错\n", e)
+ userLog.warning(chunk)
+ # Log the exception in its own call: the message has no placeholder for it.
+ userLog.warning("文档纠错--错别字识别出错\n")
+ userLog.warning(e)
continue
- lines_with_greeting = [place for place in res if len(place['errors']) > 0]
+ lines_with_greeting = [place for place in res["result"] if len(place['errors']) > 0]
if len(lines_with_greeting) > 0:
num = 0
wenti = [] # 记录问题的数组
@@ -186,18 +176,20 @@ def documentErrorTask(text):
for key, value in item['correction'].items():
temp_errorWords.append(key)
wenti.append(
- "{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
+ "序号:{},原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
num += 1
words = "\n".join(wenti)
messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
runList = []
yield f"文档纠错---内容解析中..." # 每次生成一个数字就发送
+ userLog.info(f"checkDocumentError----内容解析中...")
cishu = 0
for rsp in bot.run(messages):
runList.append(rsp)
if cishu > 3:
cishu = 0
yield "文档纠错---内容解析中" + '.' * cishu
+ userLog.info(f"checkDocumentError----内容解析中内容解析中" + '.' * cishu)
cishu += 1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
@@ -209,12 +201,13 @@ def documentErrorTask(text):
place["jianyi"] = place["解析"]
resListerr.append(place)
except Exception as e:
- logger.warning(parsed_data)
- logger.warning(place)
- logger.warning("文档纠错--错别字提取出错\n", e)
+ userLog.warning(parsed_data)
+ userLog.warning(place)
+ # Log the exception in its own call: the message has no placeholder for it.
+ userLog.warning("文档纠错--错别字提取出错\n")
+ userLog.warning(e)
continue
if (len(resListerr) > 0):
err.extend(resListerr)
# 打印总份数
yield "文档地名检查---文档解析完成"
- yield err
\ No newline at end of file
+ userLog.info(err)
+ yield err
diff --git a/checkPlaceName.py b/checkPlaceName.py
index 5b31aa8..851827d 100644
--- a/checkPlaceName.py
+++ b/checkPlaceName.py
@@ -1,15 +1,15 @@
from docx import Document
-from paddlenlp import Taskflow
-from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
-import time
+import json
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
-
-
+import requests
+import logging
+from myLogger import outLog
+import time
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
@@ -29,45 +29,10 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
-import logging
-import logging.config
-
-log_config = {
- 'version': 1,
- 'disable_existing_loggers': False,
- 'formatters': {
- 'standard': {
- 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- },
- },
- 'handlers': {
- 'console': {
- 'class': 'logging.StreamHandler',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- 'file': {
- 'class': 'logging.FileHandler',
- 'filename': 'Logger.log',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- },
- 'loggers': {
- '': {
- 'handlers': ['console', 'file'],
- 'level': logging.INFO,
- 'propagate': True,
- },
- }
-}
-
-logging.config.dictConfig(log_config)
-
-logger = logging.getLogger("checkPlaceName")
-
+outLog.logger = logging.getLogger("checkPlaceName")
+userLog=None
prompt='''
-.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
+.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# prompt='''
@@ -87,7 +52,18 @@ bot = Assistant(llm=llm_cfg,
)
#获取全文内容
def getDocxToTextAll(docxPath):
- document = Document(docxPath)
+ loopCount = 0
+ lastError = None
+ # Try to open the document at most 14 times, sleeping 1 s after each failure.
+ while True:
+ loopCount += 1
+ if loopCount >= 15:
+ # Chain the last failure so the real cause is kept on the raised exception.
+ raise Exception("文档读取超时,或文档存在问题无法读取") from lastError
+ try:
+ document = Document(docxPath)
+ break
+ except Exception as e:
+ lastError = e
+ time.sleep(1)
# 逐段读取docx文档的内容
levelList=[]
words=[]
@@ -111,7 +87,7 @@ def getDocxToTextAll(docxPath):
#得到全文和地名有关的内容
def placeNameTask(text):
yield "文档地名检查---启动中...."
- tagTask = Taskflow("ner",device_id=2)
+ userLog.info("checkPlaceName----启动中....")
batchNum=20
sentences = re.split(r'[。\n]', text)
# 去掉空字符
@@ -128,16 +104,25 @@ def placeNameTask(text):
# 打印每一份的内容
for i, chunk in enumerate(chunks):
yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"
-
+ userLog.info(f"checkPlaceName----文档解析进度:{i + 1}/{num_chunks}")
wenBen=".".join(chunk)
try:
- res = tagTask(wenBen)
+ url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
+ headers = {"Content-Type": "application/json"}
+ data = {
+ "data": {
+ "text": wenBen,
+ }
+ }
+ r = requests.post(url=url, headers=headers, data=json.dumps(data))
+ res = json.loads(r.text)
except Exception as e:
- logger.warning(chunk)
- logger.warning("文档地名检查---解析地名出错",e)
+ userLog.warning(chunk)
+ userLog.warning("文档地名检查---解析地名出错")
+ userLog.warning(e)
continue
isplace = False
- for zuhe in res:
+ for zuhe in res["result"]:
# 上一个的地名,这一个还是地名,就和上一个相加代替这个
if isplace:
name = placeList[len(placeList) - 1]
@@ -154,16 +139,22 @@ def placeNameTask(text):
isplace = False
# 打印总份数
yield "文档地名检查---文档解析完成"
+ userLog.info("checkPlaceName---文档解析完成")
placeList=list(dict.fromkeys(placeList))
yield placeList
+
#主方法
-def checkPlaceName(filename):
+def checkPlaceName(filename,user_id):
+ global userLog
+ userLog=outLog.get_queue(user_id,"checkPlaceName")
yield f"文档地名检查---开始处理文档..." # 每次生成一个数字就发送
try:
getDocxToTextAll(filename)
except Exception as e:
- logger.warning(e)
+ userLog.warning(e)
yield "文档地名检查---文档无法打开,请检查文档内容"
+ userLog.warning("文档地名检查---文档无法打开,请检查文档内容")
+ outLog.mark_done(user_id,"checkPlaceName")
return
with open("checkPlaceName.txt", "r",encoding='utf-8') as f:
gettext = f.read()
@@ -184,6 +175,7 @@ def checkPlaceName(filename):
if cishu>3:
cishu=0
yield "文档地名检查---结果生成中"+'.'*cishu
+ userLog.info("checkPlaceName---结果生成中"+'.'*cishu)
cishu+=1
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
@@ -194,10 +186,12 @@ def checkPlaceName(filename):
if place['回答'] == '错误':
error_places.append(place)
except Exception as e:
- logger.warning(place)
- logger.warning("文档地名检查---组织提出出错",e)
+ userLog.warning(parsed_data)
+ userLog.warning(place)
+ userLog.warning("文档地名检查---组织提出出错")
+ userLog.warning(e)
continue
- logger.info(error_places)
+ userLog.info(error_places)
returnInfo = "发现异常地名
"
if len(error_places)>0:
for t in error_places:
@@ -206,7 +200,9 @@ def checkPlaceName(filename):
paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","")
returnInfo+="原文:" + yuanwen + "
出现异常地名:**" + keyword + "**!请注意" + "
"
+ userLog.info(returnInfo)
yield returnInfo
- logger.info(returnInfo)
else:
- yield "**未发现发现异常地名**"
\ No newline at end of file
+ yield "**未发现发现异常地名**"
+ userLog.info("未发现发现异常地名")
+ outLog.mark_done(user_id, "checkPlaceName")
\ No newline at end of file
diff --git a/checkRepeatText.py b/checkRepeatText.py
index 9b462d9..c8688e7 100644
--- a/checkRepeatText.py
+++ b/checkRepeatText.py
@@ -5,7 +5,7 @@ from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qwen_agent.agents import Assistant
import json_repair
-from paddlenlp import Taskflow
+import json
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
device_id=0
import re
@@ -16,41 +16,11 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import logging
import logging.config
+import requests
+from myLogger import outLog
-log_config = {
- 'version': 1,
- 'disable_existing_loggers': False,
- 'formatters': {
- 'standard': {
- 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- },
- },
- 'handlers': {
- 'console': {
- 'class': 'logging.StreamHandler',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- 'file': {
- 'class': 'logging.FileHandler',
- 'filename': 'Logger.log',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- },
- 'loggers': {
- '': {
- 'handlers': ['console', 'file'],
- 'level': logging.INFO,
- 'propagate': True,
- },
- }
-}
-
-logging.config.dictConfig(log_config)
-
-logger = logging.getLogger("checkRepeatText")
-
+outLog.logger = logging.getLogger("checkRepeatText")
+userLog=None
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
@@ -110,7 +80,18 @@ def isTitle(paragraph):
#寻找标题名称
def findTitleName(docxPath):
yield '文档相似性检查----检查是否存在详细设计方案'
- document = Document(docxPath)
+ loopCount = 0
+ lastError = None
+ # Try to open the document at most 14 times, sleeping 1 s after each failure.
+ while True:
+ loopCount += 1
+ if loopCount >= 15:
+ # Chain the last failure so the real cause is kept on the raised exception.
+ raise Exception("文档读取超时,或文档存在问题无法读取") from lastError
+ try:
+ document = Document(docxPath)
+ break
+ except Exception as e:
+ lastError = e
+ time.sleep(1)
# 逐段读取docx文档的内容
titleWords=[]
firstTitle = 0
@@ -161,14 +142,24 @@ def findTitleName(docxPath):
runList.append(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
- logger.info(parsed_data)
if(parsed_data["answer"]=="存在"):
yield parsed_data["name"]
else:
yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
#获取文档中 详细设计方案 章节的所有内容
def getDocxToText(docxPath,titleName,vector_store_path):
- document = Document(docxPath)
+ loopCount = 0
+ lastError = None
+ # Try to open the document at most 14 times, sleeping 1 s after each failure.
+ while True:
+ loopCount += 1
+ if loopCount >= 15:
+ # Chain the last failure so the real cause is kept on the raised exception.
+ raise Exception("文档读取超时,或文档存在问题无法读取") from lastError
+ try:
+ document = Document(docxPath)
+ break
+ except Exception as e:
+ lastError = e
+ time.sleep(1)
# 逐段读取docx文档的内容
levelList=[]
words=[]
@@ -228,7 +219,9 @@ def getDocxToText(docxPath,titleName,vector_store_path):
# @app.route('/checkRepeatText/', methods=['GET'])
-def checkRepeatText(filename):
+def checkRepeatText(filename,user_id):
+ global userLog
+ userLog=outLog.get_queue(user_id,"checkRepeatText")
yield "文档相似性检查---启动中...."
vector_store_path="vector_store"+str(uuid.uuid4())
for titleName in findTitleName(filename):
@@ -239,13 +232,11 @@ def checkRepeatText(filename):
words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path)
except Exception as e:
yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败"
+ userLog.warning(e)
+ userLog.warning(f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败")
+ outLog.mark_done(user_id, "checkRepeatText")
return
# 记录程序开始的时间戳‘
- global device_id
- similarity = Taskflow("text_similarity",device_id=3)
- # device_id+=1
- # if(device_id>1):
- # device_id=0
reslist = []
count = 0
for i in words:
@@ -259,12 +250,23 @@ def checkRepeatText(filename):
if (textTag.find(tag) >= 0):
continue
try:
- res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]])
+ url = "http://0.0.0.0:8192/taskflow/checkRepeatText"
+ headers = {"Content-Type": "application/json"}
+ data = {
+ "data": {
+ "text": [[i[i.find(':') + 1:], text[text.find(':') + 1:]]],
+ }
+ }
+ r = requests.post(url=url, headers=headers, data=json.dumps(data))
+ res = json.loads(r.text)
+ # res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]])
except Exception as e:
- logger.warning("文档相似性检查--发生异常:",e)
- logger.warning(i)
- logger.warning(text)
- if (res[0]["similarity"] > 0.90):
+ userLog.warning("文档相似性检查--发生异常:")
+ userLog.warning(e)
+ userLog.warning(i)
+ userLog.warning(text)
+ continue
+ if (res["result"][0]["similarity"] > 0.90):
# 判断重复内容是否被放入
if (len(reslist) > 0):
isExist = False
@@ -274,19 +276,20 @@ def checkRepeatText(filename):
break
if not isExist:
# reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:],"similarity":res[0]["similarity"]})
- reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
+ userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"
在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"
以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"】")
+ reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]})
else:
- reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
+ reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]})
# print(i.split(":")[1] + "\n" + text.split(":")[1])
+ userLog.info("【在"+i[:i.find(':')].replace("\n","")+"下包含:"+i[i.find(':') + 1:].replace("\n","")+"
在"+text[:text.find(':')].replace("\n","")+"**下包含:"+text[text.find(':') + 1:].replace("\n","")+"
以上两段内容相似度:"+'{:.2f}'.format(res["result"][0]["similarity"])+"】")
# vectorstore.delete(ids=uuids)
shutil.rmtree(vector_store_path)
- logger.info("已删除")
- logger.info(reslist)
resInfo=f"对{titleName}章节,发现相似内容:
"
if(len(reslist)>0):
for res in reslist:
resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find(':')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find(':') + 1:]+"
在**"+res["yuanwen2"][:res["yuanwen2"].find(':')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find(':') + 1:]+"
以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】
"
yield resInfo
- logger.info(resInfo)
else:
- yield "未发现相似内容"
+ yield "**未发现相似内容**"
+ userLog.info("文档相似性检查----未发现相似内容**")
+ outLog.mark_done(user_id, "checkRepeatText")
\ No newline at end of file
diff --git a/checkTitleName.py b/checkTitleName.py
index cfba113..7a0c25b 100644
--- a/checkTitleName.py
+++ b/checkTitleName.py
@@ -1,3 +1,5 @@
+import time
+
from docx import Document
from pprint import pprint
from qwen_agent.agents import Assistant
@@ -6,7 +8,7 @@ import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
-
+from myLogger import outLog
def load_from_xml_v2(baseURI, rels_item_xml):
"""
@@ -26,41 +28,9 @@ def load_from_xml_v2(baseURI, rels_item_xml):
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
-import logging.config
-
-log_config = {
- 'version': 1,
- 'disable_existing_loggers': False,
- 'formatters': {
- 'standard': {
- 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
- },
- },
- 'handlers': {
- 'console': {
- 'class': 'logging.StreamHandler',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- 'file': {
- 'class': 'logging.FileHandler',
- 'filename': 'Logger.log',
- 'formatter': 'standard',
- 'level': logging.INFO,
- },
- },
- 'loggers': {
- '': {
- 'handlers': ['console', 'file'],
- 'level': logging.INFO,
- 'propagate': True,
- },
- }
-}
-logging.config.dictConfig(log_config)
-
-logger = logging.getLogger("checkCompanyName")
+outLog.logger = logging.getLogger("checkTitleName")
+userLog=None
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b-instruct",
@@ -113,7 +83,18 @@ def isTitle(paragraph):
#获取文档中 详细设计方案 章节的所有内容
def getDocxToTitleName(docxPath):
- document = Document(docxPath)
+ loopCount = 0
+ lastError = None
+ # Try to open the document at most 14 times, sleeping 1 s after each failure.
+ while True:
+ loopCount += 1
+ if loopCount >= 15:
+ # Chain the last failure so the real cause is kept on the raised exception.
+ raise Exception("文档读取超时,或文档存在问题无法读取") from lastError
+ try:
+ document = Document(docxPath)
+ break
+ except Exception as e:
+ lastError = e
+ time.sleep(1)
# 逐段读取docx文档的内容
levelList=[]
words=[]
@@ -130,9 +111,11 @@ def getDocxToTitleName(docxPath):
words.append(text)
return words
-def checkTitleName(filename):
-
+def checkTitleName(filename,user_id):
+ global userLog
+ userLog=outLog.get_queue(user_id,"checkTitleName")
yield '文档结构检查----启动中'
+ userLog.info("checkTitleName----启动中")
with open("ce模板.txt", "r",encoding='utf-8') as f:
gettext = f.readlines()
count=0
@@ -140,8 +123,10 @@ def checkTitleName(filename):
try:
word = getDocxToTitleName(filename)
except Exception as e:
- print(e)
- yield "文档无法打开,请检查文档内容"
+ # Log both warnings before mark_done, in the same order the other check
+ # modules use. NOTE(review): presumably nothing should be logged once the
+ # task is marked done; confirm against outLog.mark_done.
+ userLog.warning(e)
+ userLog.warning("checkTitleName----文档无法打开,请检查文档内容")
+ yield "文档结构检查----文档无法打开,请检查文档内容"
+ outLog.mark_done(user_id, "checkTitleName")
return
for text in gettext:
count+=1
@@ -150,24 +135,25 @@ def checkTitleName(filename):
'''
xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
+ userLog.info(f"checkTitleName----结构分析中{count}/{len(gettext)}")
strword = "\n".join(word)+prompt+xushang
- # print(strword)
messages = [{'role': 'user', 'content': [{'text':strword}]}]
runList = []
- cishu = 0
for rsp in bot.run(messages):
runList.append(rsp)
# print(rsp)
data = runList[len(runList) - 1][0]["content"]
parsed_data = json_repair.loads(data.replace('`', ''))
- print(parsed_data)
if(parsed_data["answer"]=="不存在"):
reserr.append(text)
+
resInfo="文档结构存在异常:
"
if(len(reserr)>0):
for i in reserr:
resInfo+="**"+i.replace('\n','')+"**
"
- logger.info(resInfo)
+ userLog.info(resInfo)
yield resInfo
else:
yield "文档结构未发现异常"
+ userLog.info("文档结构未发现异常")
+ outLog.mark_done(user_id, "checkTitleName")
diff --git a/main.py b/main.py
index 33d1f8d..8e89845 100644
--- a/main.py
+++ b/main.py
@@ -1,18 +1,21 @@
-from flask import Flask, request, jsonify,Response
+from flask import Flask, request, jsonify, Response
import os
from checkPlaceName import checkPlaceName
from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName
-from checkDocumentError import getDocumentError
+from checkDocumentError import checkDocumentError
from checkTitleName import checkTitleName
from flask_cors import CORS
import qwen_agenttext
+from myLogger import outLog
+import time
app = Flask(__name__)
cros = CORS(app)
UPLOAD_FOLDER = 'uploads'
-usableTag=[0,0,0,0,0,0,0,0]
if not os.path.exists(UPLOAD_FOLDER):
os.makedirs(UPLOAD_FOLDER)
+
+
@app.route('/upload', methods=['POST'])
def upload_file():
if 'file' not in request.files:
@@ -22,11 +25,13 @@ def upload_file():
return jsonify({"error": "No selected file"}), 400
if file:
filename = file.filename
- file.save(os.path.join(UPLOAD_FOLDER,filename))
+ file.save(os.path.join(UPLOAD_FOLDER, filename))
return jsonify({"message": "File uploaded successfully"}), 200
-@app.route('/stream' ,methods=["GET", "POST"])
+
+
+@app.route('/stream', methods=["GET", "POST"])
def stream_numbers():
- context= request.args.get('context')
+ context = request.args.get('context')
# def generate_numbers():
# event_id=0
# for number in range(1, 10):
@@ -50,22 +55,26 @@ def stream_numbers():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
- return Response(qwen_agenttext.getxinx(context),headers=headers)
+ return Response(qwen_agenttext.getxinx(context), headers=headers)
+
+
@app.route('/sse/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
filename = request.args.get('filename')
+ userId = request.args.get("userId")
- def generate_checkRepeatText(filename):
- id=0
- try:
- for i in checkRepeatText(filename):
- yield f"id: {id+1}\n"
- yield f"event: checkRepeatText\n"
- yield f"data: {i}\n\n" # 发送完成信号
- except Exception as e:
- yield f"id: {id+1}\n"
+ def generate_checkRepeatText(filename,userId):
+ id = 0
+ for i in checkRepeatText(filename,userId):
+ yield f"id: {id + 1}\n"
yield f"event: checkRepeatText\n"
- yield f"data: **程序出现异常**\n\n" # 发送完成信号
+ yield f"data: {i}\n\n" # 发送完成信号
+ # except Exception as e:
+
+ # yield f"id: {id+1}\n"
+ # yield f"event: checkRepeatText\n"
+ # yield f"data: **程序出现异常**\n\n" # 发送完成信号
+
headers = {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache",
@@ -74,19 +83,20 @@ def checkRepeatTextWeb():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
- return Response(generate_checkRepeatText(filename), headers=headers)
+ return Response(generate_checkRepeatText(filename,userId), headers=headers)
@app.route('/sse/checkPlaceName', methods=['GET'])
def checkPlaceNameWebSse():
filename = request.args.get('filename')
-
- def generate_checkPlaceName(filename):
- id=0
- for i in checkPlaceName(filename):
- yield f"id: {id+1}\n"
+ userId = request.args.get("userId") # "userId" query parameter, passed through to checkPlaceName
+ def generate_checkPlaceName(filename,userId): # generator: turns each item from checkPlaceName into one SSE event
+ id = 0 # NOTE(review): never incremented in this hunk, so every event is sent as "id: 1" - confirm
+ for i in checkPlaceName(filename,userId):
+ yield f"id: {id + 1}\n"
yield f"event: checkPlaceName\n"
yield f"data: {i}\n\n" # 发送完成信号
+
headers = {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache",
@@ -95,14 +105,16 @@ def checkPlaceNameWebSse():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
- return Response(generate_checkPlaceName(filename), headers=headers)
+ return Response(generate_checkPlaceName(filename,userId), headers=headers) # stream the generator with the SSE headers above
+
+
@app.route('/sse/checkCompanyName', methods=['GET'])
def checkCompanyNameWebSse():
filename = request.args.get('filename')
-
- def generate_checkCompanyName(filename):
+ userId = request.args.get("userId") # "userId" query parameter, passed through to checkCompanyName
+ def generate_checkCompanyName(filename,userId): # generator: turns each item from checkCompanyName into one SSE event
id = 0
- for i in checkCompanyName(filename):
+ for i in checkCompanyName(filename,userId): # NOTE(review): id is not incremented in the visible loop body, so each event is sent as "id: 1" - confirm
yield f"id: {id + 1}\n"
yield f"event: checkCompanyName\n"
yield f"data: {i}\n\n" # 发送完成信号
@@ -115,17 +127,18 @@ def checkCompanyNameWebSse():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
- return Response(generate_checkCompanyName(filename), headers=headers)
+ return Response(generate_checkCompanyName(filename,userId), headers=headers) # stream the generator with the SSE headers above
+
@app.route('/sse/checkDocumentErrorWeb', methods=['GET'])
def checkDocumentErrorWebSse():
filename = request.args.get('filename')
-
- def generate_checkDocumentError(filename):
+ userId = request.args.get("userId") # "userId" query parameter, passed through to checkDocumentError
+ def generate_checkDocumentError(filename,userId): # generator: turns each item from checkDocumentError into one SSE event
id = 0
- for i in getDocumentError(filename):
+ for i in checkDocumentError(filename,userId): # NOTE(review): id is not incremented in the visible loop body, so each event is sent as "id: 1" - confirm
yield f"id: {id + 1}\n"
- yield f"event: getDocumentError\n"
+ yield f"event: checkDocumentError\n" # NOTE(review): event name changed from "getDocumentError"; clients listening for the old name need updating
yield f"data: {i}\n\n" # 发送完成信号
headers = {
@@ -136,14 +149,16 @@ def checkDocumentErrorWebSse():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
- return Response(generate_checkDocumentError(filename), headers=headers)
+ return Response(generate_checkDocumentError(filename,userId), headers=headers) # stream the generator with the SSE headers above
+
+
@app.route('/sse/checkTitleName', methods=['GET'])
def checkTitleNameWebSse():
filename = request.args.get('filename')
-
- def generate_checkTitleName(filename):
+ userId = request.args.get("userId") # "userId" query parameter, passed through to checkTitleName
+ def generate_checkTitleName(filename,userId): # generator: turns each item from checkTitleName into one SSE event
id = 0
- for i in checkTitleName(filename):
+ for i in checkTitleName(filename,userId): # NOTE(review): id is not incremented in the visible loop body, so each event is sent as "id: 1" - confirm
yield f"id: {id + 1}\n"
yield f"event: checkTitleName\n"
yield f"data: {i}\n\n" # 发送完成信号
@@ -156,6 +171,36 @@ def checkTitleNameWebSse():
"Access-Control-Allow-Methods": "GET,POST",
"Access-Control-Allow-Headers": "x-requested-with,content-type",
}
- return Response(generate_checkTitleName(filename), headers=headers)
+ return Response(generate_checkTitleName(filename,userId), headers=headers) # stream the generator with the SSE headers above
+
+@app.route('/sse/getLog', methods=['GET'])
+def getlog():
+    """SSE endpoint: stream the log lines queued for ``userId`` until all of its producers are done."""
+    userId = request.args.get("userId")
+    def generate_getLog(userId):
+        time.sleep(1)  # initial delay kept from the original; presumably lets producers register first
+        id, done = 0, False
+        while not done:
+            time.sleep(0.1)  # poll interval: the old loop spun at full speed while the queue was empty
+            done = outLog.is_done(userId)  # sampled BEFORE draining so lines queued just before completion are kept
+            q = outLog.get_queueData(userId)
+            while q:  # q is None for an unknown user and [] when nothing is pending
+                id += 1
+                yield f"id: {id}\n"
+                yield f"event: getlog\n"
+                yield f"data: {q.pop(0)}\n\n"
+        yield f"id: {id}\n"
+        yield f"event: getlog\n"
+        yield f"data: 任务结束!!!!!\n\n"  # end-of-task marker for the client
+        outLog.del_queue(userId)
+    headers = {
+        "Content-Type": "text/event-stream",
+        "Cache-Control": "no-cache",
+        "X-Accel-Buffering": "no",
+        "Access-Control-Allow-Origin": "*",
+        "Access-Control-Allow-Methods": "GET,POST",
+        "Access-Control-Allow-Headers": "x-requested-with,content-type",
+    }
+    return Response(generate_getLog(userId), headers=headers)
if __name__ == '__main__':
- app.run(host="0.0.0.0",port=80)
\ No newline at end of file
+ app.run(host="0.0.0.0", port=80) # serve on all network interfaces, port 80
diff --git a/myLogger.py b/myLogger.py
new file mode 100644
index 0000000..6ea3059
--- /dev/null
+++ b/myLogger.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+"""
+@author: bingyl123@163.com
+@version: 1.0.0
+@file: myLogger.py
+@time: 2023/2/23 20:25
+"""
+# import logging
+# import logging.config
+# import re
+# import datetime
+# import queue
+#
+#
+# class OutLog:
+# _instance = None
+# logger = None
+#
+# def __new__(cls):
+# if cls._instance is None:
+# cls._instance = super(OutLog, cls).__new__(cls)
+# cls.logger = logging.getLogger("app") # 默认logger名称为"app"
+# cls._instance.queue_dict = {}
+# cls._instance.done_dict = {}
+# return cls._instance
+#
+# def get_queue(self, user_id):
+# if user_id not in self.queue_dict:
+# self.queue_dict[user_id] = []
+# self.done_dict[user_id] = {} # 初始化为未完成的字典
+# return self.queue_dict[user_id]
+#
+# def mark_done(self, user_id, producer_name):
+# self.done_dict[user_id][producer_name] = True
+#
+# def is_done(self, user_id):
+# return all(self.done_dict.get(user_id, {}).values()) # 检查所有生产者是否完成
+# @staticmethod
+# def put(item: str, level="INFO"):
+# dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+# mq.put(f"{dtf}[{level}]: {item}")
+#
+# @staticmethod
+# def debug(item, log=True):
+# OutLog.put(item, level="DEBUG")
+# if log:
+# OutLog._instance.logger.debug(item)
+#
+# @staticmethod
+# def info(item, log=True):
+# OutLog.put(item, level="INFO")
+# if log:
+# OutLog._instance.logger.info(item)
+#
+# @staticmethod
+# def warning(item, log=True):
+# OutLog.put(item, level="WARNING")
+# if log:
+# OutLog._instance.logger.warning(item)
+#
+# @staticmethod
+# def error(item, log=True):
+# OutLog.put(item, level="ERROR")
+# if log:
+# OutLog._instance.logger.error(item)
+#
+# @staticmethod
+# def critical(item, log=True):
+# OutLog.put(item, level="CRITICAL")
+# if log:
+# OutLog._instance.logger.critical(item)
+#
+#
+#
+# # 日志配置
+# log_config = {
+# 'version': 1,
+# 'disable_existing_loggers': False,
+# 'formatters': {
+# 'standard': {
+# 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+# },
+# },
+# 'handlers': {
+# 'console': {
+# 'class': 'logging.StreamHandler',
+# 'formatter': 'standard',
+# 'level': logging.INFO,
+# },
+# 'file': {
+# 'class': 'logging.FileHandler',
+# 'filename': 'Logger.log',
+# 'formatter': 'standard',
+# 'level': logging.WARNING,
+# },
+# },
+# 'loggers': {
+# '': {
+# 'handlers': ['console', 'file'],
+# 'level': logging.WARNING,
+# 'propagate': True,
+# },
+# }
+# }
+#
+# logging.config.dictConfig(log_config)
+#
+# outLog = OutLog() # 获取单例实例
+
+
+
+import logging
+import logging.config
+import datetime
+
+class OutLog:
+    """Singleton that buffers log lines per user (queue_dict) and tracks producer completion (done_dict)."""
+    _instance = None
+    logger = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(OutLog, cls).__new__(cls)
+            cls.logger = logging.getLogger("app")  # default logger name is "app"
+            cls._instance.queue_dict = {}  # user_id -> list of formatted log lines
+            cls._instance.done_dict = {}  # user_id -> {producer_name: finished?}
+        return cls._instance
+
+    def get_queue(self, user_id,producer_name):
+        """Register ``producer_name`` as a pending producer of ``user_id`` and return a UserLogger."""
+        self.queue_dict.setdefault(user_id, [])
+        # Fix: the old ``user_id not in self.done_dict`` test never fired, so producers were never pending.
+        self.done_dict.setdefault(user_id, {}).setdefault(producer_name, False)
+        return self.UserLogger(user_id)
+
+    def get_queueData(self, user_id):
+        """Return the live list of queued lines for ``user_id``, or None when the user is unknown."""
+        return self.queue_dict.get(user_id)  # fix: used to read the non-existent ``self.user_id``
+
+    def del_queue(self,user_id):
+        """Forget ``user_id`` once all its producers are done; a missing user is silently ignored."""
+        if self.is_done(user_id):
+            self.queue_dict.pop(user_id, None)  # pop() so an unknown user does not raise KeyError
+            self.done_dict.pop(user_id, None)
+
+    class UserLogger:
+        """Logger bound to one user: every message is queued for that user and sent to ``logging``."""
+        _LEVELS = ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
+
+        def __init__(self, user_id):
+            self.user_id = user_id
+            self.logger = OutLog._instance.logger
+
+        def log(self, item: str, level: str):
+            """Queue ``item`` with a timestamp and ``level`` tag, then forward it to the logger."""
+            dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            queue = OutLog._instance.queue_dict.get(self.user_id)
+            if queue is not None:  # the queue is gone once del_queue() ran; still log below
+                queue.append(f"{dtf}[{level}]: {item}")
+            self._log_to_logger(item, level)
+
+        def _log_to_logger(self, item: str, level: str):
+            """Call the logger method named after ``level``; unknown level names are ignored."""
+            if level in self._LEVELS:
+                getattr(self.logger, level.lower())(item)
+
+        def info(self, item: str):
+            self.log(item, "INFO")
+        def warning(self, item: str):
+            self.log(item, "WARNING")
+        def debug(self, item: str):
+            self.log(item, "DEBUG")
+        def error(self, item: str):
+            self.log(item, "ERROR")
+        def critical(self, item: str):
+            self.log(item, "CRITICAL")
+
+    def mark_done(self, user_id, producer_name):
+        """Flag ``producer_name`` as finished for ``user_id``; raises KeyError if the user is unregistered."""
+        self.done_dict[user_id][producer_name] = True
+
+    def is_done(self, user_id):
+        """Return True when every registered producer of ``user_id`` finished (also True when none exist)."""
+        return all(self.done_dict.get(user_id, {}).values())
+
+
+# Logging configuration (dictConfig schema, version 1)
+log_config = {
+ 'version': 1,
+ 'disable_existing_loggers': False,
+ 'formatters': {
+ 'standard': {
+ 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+ },
+ },
+ 'handlers': {
+ 'console': {
+ 'class': 'logging.StreamHandler',
+ 'formatter': 'standard',
+ 'level': logging.INFO,
+ },
+ 'file': {
+ 'class': 'logging.FileHandler',
+ 'filename': 'Logger.log',
+ 'formatter': 'standard',
+ 'level': logging.WARNING,
+ },
+ },
+ 'loggers': {
+ '': {
+ 'handlers': ['console', 'file'],
+ 'level': logging.WARNING, # NOTE(review): root level WARNING drops INFO/DEBUG records before any handler runs, so the console handler's INFO threshold has no effect for loggers that inherit this level - confirm intended
+ 'propagate': True,
+ },
+ }
+}
+
+logging.config.dictConfig(log_config)
+
+outLog = OutLog() # obtain the singleton instance
\ No newline at end of file
diff --git a/test.py b/test.py
index 3bda934..06be8df 100644
--- a/test.py
+++ b/test.py
@@ -1,109 +1,79 @@
-import time
-import json
-import math
-from flask import Flask,Response,request
-from flask_sse import sse
-from flask_cors import CORS
-import re
-import qwen_agenttext
-app = Flask(__name__)
-cros = CORS(app)
-# SSE 推送函数
-import paddle;
-paddle.device.get_available_device()
+# -*- coding:utf-8 -*-
+# from spire.doc import *
+# from spire.doc.common import *
+#
+# # 创建一个 Document 对象
+# document = Document()
+# # 加载一个 Word DOCX 文档
+# # document.LoadFromFile("C:\\Users\\gy051\\Desktop\\1223.doc")
+# document.LoadFromFile("D:\\数据集\\数据集\\3.doc")
+# print(document.Sections.Count)
+# for i in range(document.Sections.Count):
+# section=document.Sections[i]
+# for x in range(section.Paragraphs.Count):
+# paragraph=section.Paragraphs[x]
+# print(paragraph.Text)
+# print("---------------------------------")
+# # 或加载一个 Word DOC 文档
+# # document.LoadFromFile("1223.xml")
+#
+# # # # 设置是否在 HTML 中嵌入图片
+# # document.HtmlExportOptions.ImageEmbedded = True
+# # # document.XHTMLValidateOption.ImageEmbedded = True
+# # #
+# # # # 设置是否将表单字段导出为纯文本在 HTML 中显示
+# # document.HtmlExportOptions.IsTextInputFormFieldAsText = True
+# # # document.XHTMLValidateOption.IsTextInputFormFieldAsText = True
+# # #
+# # # # 设置是否在 HTML 中导出页眉和页脚
+# # document.HtmlExportOptions.HasHeadersFooters = False
+# # # document.XHTMLValidateOption.HasHeadersFooters = True
+# #
+# # # 将 Word 文档保存为 HTML 文件
+# # document.SaveToFile("1223.html", FileFormat.Html)
+# # #
+# document.Close()
+from bs4 import BeautifulSoup
+# Read the HTML file
+with open('D:\\models\\1223.html', 'r',encoding="utf-8") as file:
+ html_content = file.read()
+# Parse the HTML document
+soup = BeautifulSoup(html_content, 'html.parser')
-# SSE 推送路由
+# Dictionary that collects the results, keyed by heading text
+headings = {}
+current_heading = None
+# Walk over every heading element (h1 through h6)
+for element in soup.find_all(['h1', 'h2', 'h3',"h4","h5","h6"]):
+ level = int(element.name[1]) # heading level, taken from the digit in the tag name
+ title = element.get_text(strip=True) # heading text with surrounding whitespace stripped
-# @app.route('/register', methods=["GET"])
-# def register():
- # 获取客户端标识符
- # client_id = str(uuid.uuid4())
- #
- # # 返回 SSE 响应
- # return jsonify({"client_id": client_id})
+ # Build the record for the current heading
+ current_heading = {
+ 'title': title,
+ 'level': level,
+ 'content': []
+ }
+ # Store the record; NOTE(review): keyed by title text, so a repeated heading overwrites the earlier entry
+ headings[title] = current_heading
-# SSE 推送路由
+ # Collect the content that follows this heading, up to the next sibling heading
+ next_element = element.find_next_sibling()
+ while next_element and next_element.name not in ['h1', 'h2', 'h3',"h4","h5","h6"]:
+ # Only <p> and <div> siblings are kept as content
+ if next_element.name in ['p', 'div']:
+ current_heading['content'].append(next_element.get_text(strip=False))
+ next_element = next_element.find_next_sibling()
+
+# Print the results
+for heading in headings.values():
+ print(f"标题: {heading['title']} (级别: {heading['level']})")
+ print("内容:")
+ for content in heading['content']:
+ print(f" - {content}")
+ print()
-# @app.route('/sse', methods=['POST'])
-# def stream():
-# # 获取客户端标识符
-# client_id = 1
-# print("client_id", client_id)
-#
-# def aa():
-# # 循环发送 SSE 数据
-# for i in range(10):
-# data = 'Hello, %s!' % client_id + str(i)
-# print(data)
-# sse.publish(data, channel=client_id, type='message')
-# time.sleep(1)
-# sse.publish("end", channel=client_id, type='message')
-#
-# # 返回 SSE 响应
-# response = Response(aa(), mimetype='text/event-stream')
-# response.headers.add('Cache-Control', 'no-cache')
-# response.headers.add('Connection', 'keep-alive')
-# response.headers.add('X-Accel-Buffering', 'no')
-# return response
-#
-#
-#
-# @app.route('/stream' ,methods=["GET", "POST"])
-# def stream_numbers():
-# context= request.args.get('context')
-#
-#
-# headers = {
-# "Content-Type": "text/event-stream",
-# "Cache-Control": "no-cache",
-# "X-Accel-Buffering": "no",
-# "Access-Control-Allow-Origin": "*",
-# "Access-Control-Allow-Methods": "GET,POST",
-# "Access-Control-Allow-Headers": "x-requested-with,content-type",
-# }
-# return Response(generate_numbers(),headers=headers)
-# def generate_numbers():
-# event_id=0
-# # for number in range(1, 10):
-# # json_data = json.dumps({"number": number})
-# # print(json_data)
-# # event_id += 1
-# # yield f"id: {event_id}\n"
-# # yield f"event: time-update\n"
-# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送
-# json_data = json.dumps({"number": "done"})
-# yield f"id: {1}\n"
-# yield f"event: time-update\n"
-# yield f"data: 34568\n\n" # 发送完成信号
-# if __name__ == '__main__':
-#
-#
-# # 读取文件内容
-# with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
-# gettext = f.read()
-# batchNum=20
-# sentences = re.split(r'[。\n]', gettext)
-# # 去掉空字符
-# sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
-# # 计算总字符数
-# total_chars = len(sentences)
-#
-# # 计算有多少份
-# num_chunks = math.ceil(total_chars / batchNum)
-#
-# # 按batchNum字为一份进行处理
-# chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
-#
-# # 打印每一份的内容
-# for i, chunk in enumerate(chunks):
-# print(f"Chunk {i + 1}:")
-# print(chunk)
-# print("-" * 40)
-#
-# # 打印总份数
-# print(f"Total chunks: {num_chunks}")
-# app.run(debug=True,port=80)
\ No newline at end of file