文件优化

10 months ago · a1ea54d7f9
9 changed files with 651 additions and 436 deletions
--- a/UserQueue.py
+++ b/UserQueue.py
--- a/checkCompanyName.py
+++ b/checkCompanyName.py
@ -1,14 +1,15 @@
 # -*- coding:utf-8 -*-
 import time
 from docx import Document
 from paddlenlp import Taskflow
 from qwen_agent.agents import Assistant
 import re
 import json_repair
 import json
 import math
 from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
-
+import requests
 from myLogger import outLog
 import time
 def load_from_xml_v2(baseURI, rels_item_xml):
    """
@ -28,43 +29,10 @@ def load_from_xml_v2(baseURI, rels_item_xml):
 _SerializedRelationships.load_from_xml = load_from_xml_v2
 import logging
 import logging.config
 log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
 }
 logging.config.dictConfig(log_config)
-logger = logging.getLogger("checkCompanyName")
+outLog.logger = logging.getLogger("checkCompanyName")
 userLog=None
 prompt ='''
 .根据上述文本判断，是否为具体的公司或组织名称，你可以使用工具利用互联网查询，
 你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校，行业类型，其他]选项中选择答案,
@ -81,14 +49,23 @@ bot = Assistant(llm=llm_cfg,
                # system_message="你是一个地理专家，可以准确的判断地理位置，如果你不确定，可以使用工具"
                )
 def getDocxToTextAll(name):
    docxPath = name
    loopCount = 0
    while True:
        loopCount+=1
        if(loopCount>=15):
            raise Exception("文档读取超时，或文档存在问题无法读取")
            break
        try:
            document = Document(docxPath)
            break
        except Exception as e:
            time.sleep(1)
            pass
    # 逐段读取docx文档的内容
    levelList=[]
    words = []
    addStart = False
    levelText=""
    i = 0
    for paragraph in document.paragraphs:
        # 判断该段落的标题级别
@ -99,13 +76,15 @@ def getDocxToTextAll(name):
            words.append(text)
    # 将所有段落文本拼接成一个字符串，并用换行符分隔
    text = '\n'.join(words)
-
+    # userLog.info("checkCompanyName----保存文件")
    # 将文本写入txt文件
    with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
 def companyNameTask(text):
    yield "文档公司或组织名称检查---启动中...."
-    wordtag  = Taskflow("knowledge_mining",device_id=0)
+    userLog.info("checkCompanyName----启动中....")
    batchNum = 20
    sentences = re.split(r'[。\n]', text)
    # 去掉空字符
@ -122,46 +101,64 @@ def companyNameTask(text):
    # 打印每一份的内容
    for i, chunk in enumerate(chunks):
        yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"
-
+        userLog.info(f"checkCompanyName----文档解析进度:{i + 1}/{num_chunks}")
        wenBen=".".join(chunk)
        try:
-            res = wordtag(wenBen)
+            wenBen = ".".join(chunk)
            url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
            headers = {"Content-Type": "application/json"}
            data = {
                "data": {
                    "text": wenBen,
                }
            }
            r = requests.post(url=url, headers=headers, data=json.dumps(data))
            res = json.loads(r.text)
            # userLog.info(res)
            # print(res)
        except Exception as e:
-            logging.warning(chunk)
+            userLog.warning(chunk)
-            logging.warning("文档公司或组织名称检查---词类分析出错",e)
+            userLog.warning("文档公司或组织名称检查--错别字识别出错\n")
-            continue
+            userLog.warning(e)
            return
        isplace = False
-        for zuhe in res[0]['items']:
+        for zuhe in res["result"]:
            # 上一个的地名,这一个还是地名，就和上一个相加代替这个
            zhi = zuhe.get("wordtag_label")
            if isplace:
                name = placeList[len(placeList) - 1]
-                if zhi.find("组织机构类") >= 0:  # or zuhe[1] == "ns"
+                if zuhe[1].find("组织机构类") >= 0:  # or zuhe[1] == "ns"
                    isplace = True
-                    new_text = zuhe['item'].replace("\n", "")
+                    new_text = zuhe[0].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
-            if zhi.find("组织机构类") >= 0:
+            if zuhe[1].find("组织机构类") >= 0:
                isplace = True
-                new_text = zuhe['item'].replace("\n", "")
+                new_text = zuhe[0].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    # 打印总份数
    yield "文档公司或组织名称检查---文档解析完成"
    userLog.info("checkCompanyName----文档解析完成")
    placeList = list(dict.fromkeys(placeList))
    yield placeList
-def checkCompanyName(filename):
+    userLog.info(placeList)
 def checkCompanyName(filename,user_id):
    yield f"文档公司或组织名称检查---开始处理文档..."
    global userLog
    userLog=outLog.get_queue(user_id, "checkCompanyName")
    try:
        getDocxToTextAll(filename)
    except Exception as e:
-        logging.warning(e)
+        userLog.warning(e)
        userLog.warning("文档公司或组织名称检查---文档无法打开，请检查文档内容")
        yield "文档公司或组织名称检查---文档无法打开，请检查文档内容"
        outLog.mark_done(user_id, "checkCompanyName")
        return
    with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档公司或组织名称检查---开始解析文档..."  # 每次生成一个数字就发送
    userLog.info("checkCompanyName----开始解析文档...")
    for item in companyNameTask(gettext):
        if isinstance(item, str):
            yield item
@ -177,19 +174,22 @@ def checkCompanyName(filename):
        if cishu > 3:
            cishu = 0
        yield "文档公司或组织名称检查---结果生成中" + '.' * cishu
        userLog.info(f"checkCompanyName----结果生成中" + '.' * cishu)
        cishu += 1
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = []
    for place in parsed_data:
        try:
            if place['回答'] == '非泛化的公司或组织名称':
                error_places.append(place)
        except Exception as e:
-            logging.warning(place)
+            userLog.warning(place)
-            logging.warning("文档公司或组织名称检查---组织提出出错",e)
+            userLog.warning(e)
            userLog.warning("文档公司或组织名称检查---组织提出出错")
            continue
-    logging.info(error_places)
+    userLog.info(error_places)
    returnInfo = "发现异常公司或组织名称<br>"
    if len(error_places) > 0:
        for t in error_places:
@ -199,7 +199,9 @@ def checkCompanyName(filename):
            t["yuanwen"] = paragraphs[0]
            yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
            returnInfo += "原文：" + yuanwen + "<br>异常公司或组织名称：**" + keyword + "**！请注意" + "<br>"
-        logging.info(returnInfo)
+            userLog.info(returnInfo)
        yield returnInfo
    else:
        yield "**未发现异常公司或组织名称**<br>"
        userLog.info("**未发现异常公司或组织名称**<br>")
    outLog.mark_done(user_id, "checkCompanyName")
--- a/checkDocumentError.py
+++ b/checkDocumentError.py
@ -1,19 +1,15 @@
 # -*- coding:utf-8 -*-
 # from pycorrector import MacBertCorrector
 # m = MacBertCorrector("shibing624/macbert4csc-base-chinese")
 from qwen_agent.agents import Assistant
 from docx import Document
 from pprint import pprint
 import re
 from paddlenlp import Taskflow
 import json
 import time
 import json_repair
 import math
 from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
-
+import requests
-import asyncio
+from myLogger import outLog
 import time
 def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
@ -32,41 +28,9 @@ def load_from_xml_v2(baseURI, rels_item_xml):
 _SerializedRelationships.load_from_xml = load_from_xml_v2
 import logging
 import logging.config
 log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
 }
-logging.config.dictConfig(log_config)
+outLog.logger = logging.getLogger("checkDocumentError")
-
+userLog=None
 logger = logging.getLogger("checkDocumentError")
 llm_cfg = {
    # 'model': 'qwen1.5-72b-chat',
    'model': "qwen2-72b",
@ -83,20 +47,28 @@ bot = Assistant(llm=llm_cfg,
 # 回答格式[{“placeName”：“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”：“原文”,"改正后":"改正的内容","回答":"答案"}]，不做过多的解释,严格按回答格式作答;
 # '''
 prompt = '''
-请回答以上问题，[是，否]选项中选择答案,原文内容，标点符号保持不变，如果有错请给出解析，没有错则不用给解析
+请回答以上问题，[是，否]选项中选择答案,原文内容，标点符号保持不变，如果有错请给出详细的解析，没有错则不用给解析
 回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}]，不做过多的解释,严格按回答格式作答;
 '''
 def getDocxToTextAll(name):
    userLog.info("checkDocumentError----打开文档")
    docxPath = name
    loopCount = 0
    while True:
        loopCount+=1
        if(loopCount>=15):
            raise Exception("文档读取超时，或文档存在问题无法读取")
            break
        try:
            document = Document(docxPath)
            break
        except Exception as e:
            time.sleep(1)
            pass
    # 逐段读取docx文档的内容
    levelList = []
    words = []
    addStart = False
    levelText = ""
    i = 0
    for paragraph in document.paragraphs:
        # 判断该段落的标题级别
        # 这里用isTitle()临时代表，具体见下文介绍的方法
@ -112,17 +84,23 @@ def getDocxToTextAll(name):
        txt_file.write(text)
-def getDocumentError(filename):
+def checkDocumentError(filename,user_id):
    global userLog
    userLog=outLog.get_queue(user_id,"checkDocumentError")
    yield f"文档纠错---开始处理文档..."
    userLog.info("checkDocumentError----开始处理文档...")
    try:
        getDocxToTextAll(filename)
    except Exception as e:
-        logger.warning(e)
+        userLog.warning(e)
-        yield "文档无法打开，请检查文档内容"
+        userLog.warning("文档纠错----文档无法打开，请检查文档内容")
        yield "文档纠错----文档无法打开，请检查文档内容"
        outLog.mark_done(user_id, "checkDocumentError")
        return
    with open("checkDocumentError.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档纠错---开始解析文档..."  # 每次生成一个数字就发送
    userLog.info("checkDocumentError----开始解析文档...")
    final_list = []
    for item in documentErrorTask(gettext):
        if isinstance(item, str):
@ -135,10 +113,13 @@ def getDocumentError(filename):
            yuanwen = i["placeName"].replace("\n", "")
            jianyi = i["jianyi"].replace("\n", "")
            resInfo += "原文：" + yuanwen + "<br>建议：**" + jianyi + "**<br>"
            userLog.info(resInfo)
        yield resInfo
-        logger.info(resInfo)
+
    else:
        yield "**未发现错别字**"
        userLog.info("未发现错别字")
    outLog.mark_done(user_id,"checkDocumentError")
 def documentErrorTask(text):
@ -149,7 +130,7 @@ def documentErrorTask(text):
    :return: 生成器，每次返回一批文本
    """
    yield "文档纠错---启动中...."
-    corrector = Taskflow("text_correction", device_id=1)
+    userLog.info("checkDocumentError----启动中....")
    batchNum = 20
    sentences = re.split(r'[。\n]', text)
    # 去掉空字符
@ -162,18 +143,27 @@ def documentErrorTask(text):
    # 按batchNum字为一份进行处理
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    # 打印每一份的内容
    err = []
    for i, chunk in enumerate(chunks):
        yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
        userLog.info(f"checkDocumentError----文档解析进度:{i + 1}/{num_chunks}")
        try:
-            res = corrector(chunk)
+            url = "http://0.0.0.0:8190/taskflow/checkDocumentError"
            headers = {"Content-Type": "application/json"}
            data = {
                "data": {
                    "text": chunk,
                }
            }
            r = requests.post(url=url, headers=headers, data=json.dumps(data))
            res = json.loads(r.text)
            # print(res)
        except Exception as e:
-            logger.warning(chunk)
+            userLog.warning(chunk)
-            logger.warning("文档纠错--错别字识别出错\n", e)
+            userLog.warning("文档纠错--错别字识别出错\n", e)
            continue
-        lines_with_greeting = [place for place in res if len(place['errors']) > 0]
+        lines_with_greeting = [place for place in res["result"] if len(place['errors']) > 0]
        if len(lines_with_greeting) > 0:
            num = 0
            wenti = []  # 记录问题的数组
@ -186,18 +176,20 @@ def documentErrorTask(text):
                    for key, value in item['correction'].items():
                        temp_errorWords.append(key)
                wenti.append(
-                    "{}、原文：{}。问题：【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
+                    "序号：{}，原文：{}。问题：【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
                num += 1
            words = "\n".join(wenti)
            messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
            runList = []
            yield f"文档纠错---内容解析中..."  # 每次生成一个数字就发送
            userLog.info(f"checkDocumentError----内容解析中...")
            cishu = 0
            for rsp in bot.run(messages):
                runList.append(rsp)
                if cishu > 3:
                    cishu = 0
                yield "文档纠错---内容解析中" + '.' * cishu
                userLog.info(f"checkDocumentError----内容解析中内容解析中" + '.' * cishu)
                cishu += 1
            data = runList[len(runList) - 1][0]["content"]
            parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
@ -209,12 +201,13 @@ def documentErrorTask(text):
                        place["jianyi"] = place["解析"]
                        resListerr.append(place)
                except Exception as e:
-                    logger.warning(parsed_data)
+                    userLog.warning(parsed_data)
-                    logger.warning(place)
+                    userLog.warning(place)
-                    logger.warning("文档纠错--错别字提取出错\n", e)
+                    userLog.warning("文档纠错--错别字提取出错\n", e)
                    continue
            if (len(resListerr) > 0):
                err.extend(resListerr)
    # 打印总份数
    yield "文档地名检查---文档解析完成"
    userLog.info(err)
    yield err
--- a/checkPlaceName.py
+++ b/checkPlaceName.py
@ -1,15 +1,15 @@
 from docx import Document
 from paddlenlp import Taskflow
 from pprint import pprint
 from qwen_agent.agents import Assistant
 import re
 import json_repair
-import time
+import json
 import math
 from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
-
+import requests
-
+import logging
 from myLogger import outLog
 import time
 def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
@ -29,45 +29,10 @@ def load_from_xml_v2(baseURI, rels_item_xml):
 _SerializedRelationships.load_from_xml = load_from_xml_v2
-import logging
+outLog.logger = logging.getLogger("checkPlaceName")
-import logging.config
+userLog=None
 log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
 }
 logging.config.dictConfig(log_config)
 logger = logging.getLogger("checkPlaceName")
 prompt='''
-.上述文本判断地名是否正确，你可以使用工具利用互联网查询，你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}]，不做过多的解释,严格按回答格式作答;
+.上述文本判断地名是否正确，你可以使用工具利用互联网查询，你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}]，不做过多的解释,严格按回答格式作答;
 不做过多的解释,严格按回答格式作答;
 '''
 # prompt='''
@ -87,7 +52,18 @@ bot = Assistant(llm=llm_cfg,
                )
 #获取全文内容
 def getDocxToTextAll(docxPath):
    loopCount = 0
    while True:
        loopCount+=1
        if(loopCount>=15):
            raise Exception("文档读取超时，或文档存在问题无法读取")
            break
        try:
            document = Document(docxPath)
            break
        except Exception as e:
            time.sleep(1)
            pass
    # 逐段读取docx文档的内容
    levelList=[]
    words=[]
@ -111,7 +87,7 @@ def getDocxToTextAll(docxPath):
 #得到全文和地名有关的内容
 def placeNameTask(text):
    yield "文档地名检查---启动中...."
-    tagTask = Taskflow("ner",device_id=2)
+    userLog.info("checkPlaceName----启动中....")
    batchNum=20
    sentences = re.split(r'[。\n]', text)
    # 去掉空字符
@ -128,16 +104,25 @@ def placeNameTask(text):
    # 打印每一份的内容
    for i, chunk in enumerate(chunks):
        yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"
-
+        userLog.info(f"checkPlaceName----文档解析进度:{i + 1}/{num_chunks}")
        wenBen=".".join(chunk)
        try:
-            res = tagTask(wenBen)
+            url = "http://0.0.0.0:8191/taskflow/checkPlaceName"
            headers = {"Content-Type": "application/json"}
            data = {
                "data": {
                    "text": wenBen,
                }
            }
            r = requests.post(url=url, headers=headers, data=json.dumps(data))
            res = json.loads(r.text)
        except Exception as e:
-            logger.warning(chunk)
+            userLog.warning(chunk)
-            logger.warning("文档地名检查---解析地名出错",e)
+            userLog.warning("文档地名检查---解析地名出错")
            userLog.warning(e)
            continue
        isplace = False
-        for zuhe in res:
+        for zuhe in res["result"]:
            # 上一个的地名,这一个还是地名，就和上一个相加代替这个
            if isplace:
                name = placeList[len(placeList) - 1]
@ -154,16 +139,22 @@ def placeNameTask(text):
                isplace = False
    # 打印总份数
    yield "文档地名检查---文档解析完成"
    userLog.info("checkPlaceName---文档解析完成")
    placeList=list(dict.fromkeys(placeList))
    yield placeList
 #主方法
-def checkPlaceName(filename):
+def checkPlaceName(filename,user_id):
    global userLog
    userLog=outLog.get_queue(user_id,"checkPlaceName")
    yield f"文档地名检查---开始处理文档..."  # 每次生成一个数字就发送
    try:
        getDocxToTextAll(filename)
    except Exception as e:
-        logger.warning(e)
+        userLog.warning(e)
        yield "文档地名检查---文档无法打开，请检查文档内容"
        userLog.warning("文档地名检查---文档无法打开，请检查文档内容")
        outLog.mark_done(user_id,"checkPlaceName")
        return
    with open("checkPlaceName.txt", "r",encoding='utf-8') as f:
        gettext = f.read()
@ -184,6 +175,7 @@ def checkPlaceName(filename):
        if cishu>3:
            cishu=0
        yield "文档地名检查---结果生成中"+'.'*cishu
        userLog.info("checkPlaceName---结果生成中"+'.'*cishu)
        cishu+=1
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
@ -194,10 +186,12 @@ def checkPlaceName(filename):
            if place['回答'] == '错误':
                error_places.append(place)
        except Exception as e:
-            logger.warning(place)
+            userLog.warning(parsed_data)
-            logger.warning("文档地名检查---组织提出出错",e)
+            userLog.warning(place)
            userLog.warning("文档地名检查---组织提出出错")
            userLog.warning(e)
            continue
-    logger.info(error_places)
+    userLog.info(error_places)
    returnInfo = "发现异常地名<br>"
    if len(error_places)>0:
        for t in error_places:
@ -206,7 +200,9 @@ def checkPlaceName(filename):
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","")
            returnInfo+="原文：" + yuanwen + "<br>出现异常地名：**" + keyword + "**！请注意" + "<br>"
            userLog.info(returnInfo)
        yield returnInfo
        logger.info(returnInfo)
    else:
        yield "**未发现发现异常地名**"
        userLog.info("未发现发现异常地名")
        outLog.mark_done(user_id, "checkPlaceName")
--- a/checkRepeatText.py
+++ b/checkRepeatText.py
@ -5,7 +5,7 @@ from langchain_community.document_loaders import TextLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from qwen_agent.agents import Assistant
 import json_repair
-from paddlenlp import Taskflow
+import json
 embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
 device_id=0
 import re
@ -16,41 +16,11 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
 import logging
 import logging.config
 import requests
 from myLogger import outLog
-log_config = {
+outLog.logger = logging.getLogger("checkRepeatText")
-    'version': 1,
+userLog=None
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
 }
 logging.config.dictConfig(log_config)
 logger = logging.getLogger("checkRepeatText")
 def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
@ -110,7 +80,18 @@ def isTitle(paragraph):
 #寻找标题名称
 def findTitleName(docxPath):
    yield '文档相似性检查----检查是否存在详细设计方案'
    loopCount = 0
    while True:
        loopCount+=1
        if(loopCount>=15):
            raise Exception("文档读取超时，或文档存在问题无法读取")
            break
        try:
            document = Document(docxPath)
            break
        except Exception as e:
            time.sleep(1)
            pass
    # 逐段读取docx文档的内容
    titleWords=[]
    firstTitle = 0
@ -161,14 +142,24 @@ def findTitleName(docxPath):
        runList.append(rsp)
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    logger.info(parsed_data)
    if(parsed_data["answer"]=="存在"):
        yield parsed_data["name"]
    else:
        yield "文档相似性检查----未找到与详细设计方案相关内容，无法进行相似性比较"
 #获取文档中 详细设计方案 章节的所有内容
 def getDocxToText(docxPath,titleName,vector_store_path):
    loopCount = 0
    while True:
        loopCount+=1
        if(loopCount>=15):
            raise Exception("文档读取超时，或文档存在问题无法读取")
            break
        try:
            document = Document(docxPath)
            break
        except Exception as e:
            time.sleep(1)
            pass
    # 逐段读取docx文档的内容
    levelList=[]
    words=[]
@ -228,7 +219,9 @@ def getDocxToText(docxPath,titleName,vector_store_path):
 # @app.route('/checkRepeatText/<filename>', methods=['GET'])
-def checkRepeatText(filename):
+def checkRepeatText(filename,user_id):
    global userLog
    userLog=outLog.get_queue(user_id,"checkRepeatText")
    yield "文档相似性检查---启动中...."
    vector_store_path="vector_store"+str(uuid.uuid4())
    for titleName in findTitleName(filename):
@ -239,13 +232,11 @@ def checkRepeatText(filename):
            words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path)
        except Exception as e:
            yield f"文档相似性检查----文档内容获取失败，未找到**{titleName}**相关内容或文档打开失败"
            userLog.warning(e)
            userLog.warning(f"文档相似性检查----文档内容获取失败，未找到**{titleName}**相关内容或文档打开失败")
            outLog.mark_done(user_id, "checkRepeatText")
            return
    # 记录程序开始的时间戳‘
        global device_id
        similarity = Taskflow("text_similarity",device_id=3)
        # device_id+=1
        # if(device_id>1):
        #     device_id=0
        reslist = []
        count = 0
        for i in words:
@ -259,12 +250,23 @@ def checkRepeatText(filename):
                if (textTag.find(tag) >= 0):
                    continue
                try:
-                    res = similarity([[i[i.find('：') + 1:], text[text.find('：') + 1:]]])
+                    url = "http://0.0.0.0:8192/taskflow/checkRepeatText"
                    headers = {"Content-Type": "application/json"}
                    data = {
                        "data": {
                            "text": [[i[i.find('：') + 1:], text[text.find('：') + 1:]]],
                        }
                    }
                    r = requests.post(url=url, headers=headers, data=json.dumps(data))
                    res = json.loads(r.text)
                    # res = similarity([[i[i.find('：') + 1:], text[text.find('：') + 1:]]])
                except Exception as e:
-                    logger.warning("文档相似性检查--发生异常:",e)
+                    userLog.warning("文档相似性检查--发生异常:")
-                    logger.warning(i)
+                    userLog.warning(e)
-                    logger.warning(text)
+                    userLog.warning(i)
-                if (res[0]["similarity"] > 0.90):
+                    userLog.warning(text)
                    continue
                if (res["result"][0]["similarity"] > 0.90):
                    # 判断重复内容是否被放入
                    if (len(reslist) > 0):
                        isExist = False
@ -274,19 +276,20 @@ def checkRepeatText(filename):
                                break
                        if not isExist:
                            # reslist.append({"yuanwen1":i[i.find('：') + 1:],"yuanwen2":text[text.find('：') + 1:],"similarity":res[0]["similarity"]})
-                            reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
+                            userLog.info("【在"+i[:i.find('：')].replace("\n","")+"下包含："+i[i.find('：') + 1:].replace("\n","")+"<br>在"+text[:text.find('：')].replace("\n","")+"**下包含："+text[text.find('：') + 1:].replace("\n","")+"<br>以上两段内容相似度："+'{:.2f}'.format(res["result"][0]["similarity"])+"】")
                            reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]})
                    else:
-                        reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
+                        reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res["result"][0]["similarity"]})
                        # print(i.split("：")[1] + "\n" + text.split("：")[1])
                        userLog.info("【在"+i[:i.find('：')].replace("\n","")+"下包含："+i[i.find('：') + 1:].replace("\n","")+"<br>在"+text[:text.find('：')].replace("\n","")+"**下包含："+text[text.find('：') + 1:].replace("\n","")+"<br>以上两段内容相似度："+'{:.2f}'.format(res["result"][0]["similarity"])+"】")
        # vectorstore.delete(ids=uuids)
        shutil.rmtree(vector_store_path)
        logger.info("已删除")
        logger.info(reslist)
        resInfo=f"对{titleName}章节，发现相似内容：<br>"
        if(len(reslist)>0):
            for res in reslist:
                resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find('：')]+"**下包含："+res["yuanwen1"][res["yuanwen1"].find('：') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find('：')]+"**下包含："+res["yuanwen2"][res["yuanwen2"].find('：') + 1:]+"<br>以上两段内容***相似度***："+'{:.2f}'.format(res['similarity'])+"】<br>"
            yield resInfo
            logger.info(resInfo)
        else:
-            yield "未发现相似内容"
+            yield "**未发现相似内容**"
            userLog.info("文档相似性检查----未发现相似内容**")
        outLog.mark_done(user_id, "checkRepeatText")
--- a/checkTitleName.py
+++ b/checkTitleName.py
@ -1,3 +1,5 @@
 import time
 from docx import Document
 from pprint import pprint
 from qwen_agent.agents import Assistant
@ -6,7 +8,7 @@ import json_repair
 import math
 from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
-
+from myLogger import outLog
 def load_from_xml_v2(baseURI, rels_item_xml):
    """
@ -26,41 +28,9 @@ def load_from_xml_v2(baseURI, rels_item_xml):
 _SerializedRelationships.load_from_xml = load_from_xml_v2
 import logging
 import logging.config
 log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
 }
-logging.config.dictConfig(log_config)
+outLog.logger = logging.getLogger("checkTitleName")
-
+userLog=None
 logger = logging.getLogger("checkCompanyName")
 llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
@ -113,7 +83,18 @@ def isTitle(paragraph):
 #获取文档中 详细设计方案 章节的所有内容
 def getDocxToTitleName(docxPath):
    loopCount = 0
    while True:
        loopCount+=1
        if(loopCount>=15):
            raise Exception("文档读取超时，或文档存在问题无法读取")
            break
        try:
            document = Document(docxPath)
            break
        except Exception as e:
            time.sleep(1)
            pass
    # 逐段读取docx文档的内容
    levelList=[]
    words=[]
@ -130,9 +111,11 @@ def getDocxToTitleName(docxPath):
                words.append(text)
    return words
-def checkTitleName(filename):
+def checkTitleName(filename,user_id):
-
+    global userLog
    userLog=outLog.get_queue(user_id,"checkTitleName")
    yield '文档结构检查----启动中'
    userLog.info("checkTitleName----启动中")
    with open("ce模板.txt", "r",encoding='utf-8') as f:
        gettext = f.readlines()
    count=0
@ -140,8 +123,10 @@ def checkTitleName(filename):
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
-        print(e)
+        userLog.warning(e)
-        yield "文档无法打开，请检查文档内容"
+        yield "文档结构检查----文档无法打开，请检查文档内容"
        outLog.mark_done(user_id, "checkTitleName")
        userLog.warning("checkTitleName----文档无法打开，请检查文档内容")
        return
    for text in gettext:
        count+=1
@ -150,24 +135,25 @@ def checkTitleName(filename):
        '''
        xushang="回答格式{‘name’:‘名称’,'answer'：‘回答’，“标题”：“标题”}请严格按照格式回答问题，不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        userLog.info(f"checkTitleName----结构分析中{count}/{len(gettext)}")
        strword = "\n".join(word)+prompt+xushang
        # print(strword)
        messages = [{'role': 'user', 'content': [{'text':strword}]}]
        runList = []
        cishu = 0
        for rsp in bot.run(messages):
            runList.append(rsp)
            # print(rsp)
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        print(parsed_data)
        if(parsed_data["answer"]=="不存在"):
            reserr.append(text)
    resInfo="文档结构存在异常：<br>"
    if(len(reserr)>0):
        for i in reserr:
            resInfo+="**"+i.replace('\n','')+"**<br>"
-        logger.info(resInfo)
+            userLog.info(resInfo)
        yield resInfo
    else:
        yield "文档结构未发现异常"
        userLog.info("文档结构未发现异常")
        outLog.mark_done(user_id, "checkTitleName")
--- a/main.py
+++ b/main.py
@ -3,16 +3,19 @@ import os
 from checkPlaceName import checkPlaceName
 from checkRepeatText import checkRepeatText
 from checkCompanyName import checkCompanyName
-from checkDocumentError import getDocumentError
+from checkDocumentError import checkDocumentError
 from checkTitleName import checkTitleName
 from flask_cors import CORS
 import qwen_agenttext
 from myLogger import outLog
 import time
 app = Flask(__name__)
 cros = CORS(app)
 UPLOAD_FOLDER = 'uploads'
 usableTag=[0,0,0,0,0,0,0,0]
 if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
@app.route('/upload', methods=['POST'])
 def upload_file():
    if 'file' not in request.files:
@ -24,6 +27,8 @@ def upload_file():
        filename = file.filename
        file.save(os.path.join(UPLOAD_FOLDER, filename))
        return jsonify({"message": "File uploaded successfully"}), 200
@app.route('/stream', methods=["GET", "POST"])
 def stream_numbers():
    context = request.args.get('context')
@ -51,21 +56,25 @@ def stream_numbers():
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(qwen_agenttext.getxinx(context), headers=headers)
@app.route('/sse/checkRepeatText', methods=['GET'])
 def checkRepeatTextWeb():
    filename = request.args.get('filename')
    userId = request.args.get("userId")
-    def generate_checkRepeatText(filename):
+    def generate_checkRepeatText(filename,userId):
        id = 0
-        try:
+        for i in checkRepeatText(filename,userId):
            for i in checkRepeatText(filename):
            yield f"id: {id + 1}\n"
            yield f"event: checkRepeatText\n"
            yield f"data: {i}\n\n"  # 发送完成信号
-        except Exception as e:
+        # except Exception as e:
-            yield f"id: {id+1}\n"
+
-            yield f"event: checkRepeatText\n"
+        #     yield f"id: {id+1}\n"
-            yield f"data: **程序出现异常**\n\n"  # 发送完成信号
+        #     yield f"event: checkRepeatText\n"
        #     yield f"data: **程序出现异常**\n\n"  # 发送完成信号
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
@ -74,19 +83,20 @@ def checkRepeatTextWeb():
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
-    return Response(generate_checkRepeatText(filename), headers=headers)
+    return Response(generate_checkRepeatText(filename,userId), headers=headers)
@app.route('/sse/checkPlaceName', methods=['GET'])
 def checkPlaceNameWebSse():
    filename = request.args.get('filename')
-
+    userId = request.args.get("userId")
-    def generate_checkPlaceName(filename):
+    def generate_checkPlaceName(filename,userId):
        id = 0
-        for i in checkPlaceName(filename):
+        for i in checkPlaceName(filename,userId):
            yield f"id: {id + 1}\n"
            yield f"event: checkPlaceName\n"
            yield f"data: {i}\n\n"  # 发送完成信号
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
@ -95,14 +105,16 @@ def checkPlaceNameWebSse():
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
-    return Response(generate_checkPlaceName(filename), headers=headers)
+    return Response(generate_checkPlaceName(filename,userId), headers=headers)
@app.route('/sse/checkCompanyName', methods=['GET'])
 def checkCompanyNameWebSse():
    filename = request.args.get('filename')
-
+    userId = request.args.get("userId")
-    def generate_checkCompanyName(filename):
+    def generate_checkCompanyName(filename,userId):
        id = 0
-        for i in checkCompanyName(filename):
+        for i in checkCompanyName(filename,userId):
            yield f"id: {id + 1}\n"
            yield f"event: checkCompanyName\n"
            yield f"data: {i}\n\n"  # 发送完成信号
@ -115,17 +127,18 @@ def checkCompanyNameWebSse():
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
-    return Response(generate_checkCompanyName(filename), headers=headers)
+    return Response(generate_checkCompanyName(filename,userId), headers=headers)
@app.route('/sse/checkDocumentErrorWeb', methods=['GET'])
 def checkDocumentErrorWebSse():
    filename = request.args.get('filename')
-
+    userId = request.args.get("userId")
-    def generate_checkDocumentError(filename):
+    def generate_checkDocumentError(filename,userId):
        id = 0
-        for i in getDocumentError(filename):
+        for i in checkDocumentError(filename,userId):
            yield f"id: {id + 1}\n"
-            yield f"event: getDocumentError\n"
+            yield f"event: checkDocumentError\n"
            yield f"data: {i}\n\n"  # 发送完成信号
    headers = {
@ -136,14 +149,16 @@ def checkDocumentErrorWebSse():
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
-    return Response(generate_checkDocumentError(filename), headers=headers)
+    return Response(generate_checkDocumentError(filename,userId), headers=headers)
@app.route('/sse/checkTitleName', methods=['GET'])
 def checkTitleNameWebSse():
    filename = request.args.get('filename')
-
+    userId = request.args.get("userId")
-    def generate_checkTitleName(filename):
+    def generate_checkTitleName(filename,userId):
        id = 0
-        for i in checkTitleName(filename):
+        for i in checkTitleName(filename,userId):
            yield f"id: {id + 1}\n"
            yield f"event: checkTitleName\n"
            yield f"data: {i}\n\n"  # 发送完成信号
@ -156,6 +171,36 @@ def checkTitleNameWebSse():
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
-    return Response(generate_checkTitleName(filename), headers=headers)
+    return Response(generate_checkTitleName(filename,userId), headers=headers)
@app.route('/sse/getLog', methods=['GET'])
def getlog():
    """Stream the queued log lines of one user as Server-Sent Events.

    The user is identified by the ``userId`` query parameter. Every queued
    line is emitted as a ``getlog`` event; once ``outLog.is_done(userId)``
    reports completion a final "task finished" event is sent and the user's
    queue is removed with ``outLog.del_queue``.

    Returns:
        A streaming ``Response`` with ``text/event-stream`` headers.
    """
    userId = request.args.get("userId")

    def generate_getLog(userId):
        # Initial delay before the first poll, kept from the original code
        # (presumably to let the producers register themselves -- confirm).
        time.sleep(1)
        event_id = 0  # renamed from `id` so the builtin is not shadowed
        while True:
            if outLog.is_done(userId):
                # NOTE(review): lines still queued at this point are not sent;
                # confirm whether they should be flushed before finishing.
                break
            pending = outLog.get_queueData(userId)
            if pending:
                event_id += 1
                text = pending.pop(0)
                yield f"id: {event_id}\n"
                yield f"event: getlog\n"
                yield f"data: {text}\n\n"  # one queued log line
            else:
                # Nothing to send yet: pause briefly instead of spinning in a
                # tight loop, which would keep a CPU core busy for as long as
                # the client stays connected.
                time.sleep(0.1)
        yield f"id: {event_id}\n"
        yield f"event: getlog\n"
        yield f"data: 任务结束！！！！！\n\n"  # final "task finished" event
        outLog.del_queue(userId)

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_getLog(userId), headers=headers)
 if __name__ == '__main__':
    app.run(host="0.0.0.0", port=80)
--- a/myLogger.py
+++ b/myLogger.py
@ -0,0 +1,220 @@
 # -*- coding: utf-8 -*-
 """
@author:  bingyl123@163.com
@version: 1.0.0
@file:    OutLog.py
@time:    2023/2/23 20:25
 """
 # import logging
 # import logging.config
 # import re
 # import datetime
 # import queue
 #
 #
 # class OutLog:
 #     _instance = None
 #     logger = None
 #
 #     def __new__(cls):
 #         if cls._instance is None:
 #             cls._instance = super(OutLog, cls).__new__(cls)
 #             cls.logger = logging.getLogger("app")  # 默认logger名称为"app"
 #             cls._instance.queue_dict = {}
 #             cls._instance.done_dict = {}
 #         return cls._instance
 #
 #     def get_queue(self, user_id):
 #         if user_id not in self.queue_dict:
 #             self.queue_dict[user_id] = []
 #             self.done_dict[user_id] = {}  # 初始化为未完成的字典
 #         return self.queue_dict[user_id]
 #
 #     def mark_done(self, user_id, producer_name):
 #         self.done_dict[user_id][producer_name] = True
 #
 #     def is_done(self, user_id):
 #         return all(self.done_dict.get(user_id, {}).values())  # 检查所有生产者是否完成
 #     @staticmethod
 #     def put(item: str, level="INFO"):
 #         dtf = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 #         mq.put(f"{dtf}[{level}]: {item}")
 #
 #     @staticmethod
 #     def debug(item, log=True):
 #         OutLog.put(item, level="DEBUG")
 #         if log:
 #             OutLog._instance.logger.debug(item)
 #
 #     @staticmethod
 #     def info(item, log=True):
 #         OutLog.put(item, level="INFO")
 #         if log:
 #             OutLog._instance.logger.info(item)
 #
 #     @staticmethod
 #     def warning(item, log=True):
 #         OutLog.put(item, level="WARNING")
 #         if log:
 #             OutLog._instance.logger.warning(item)
 #
 #     @staticmethod
 #     def error(item, log=True):
 #         OutLog.put(item, level="ERROR")
 #         if log:
 #             OutLog._instance.logger.error(item)
 #
 #     @staticmethod
 #     def critical(item, log=True):
 #         OutLog.put(item, level="CRITICAL")
 #         if log:
 #             OutLog._instance.logger.critical(item)
 #
 #
 #
 # # 日志配置
 # log_config = {
 #     'version': 1,
 #     'disable_existing_loggers': False,
 #     'formatters': {
 #         'standard': {
 #             'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 #         },
 #     },
 #     'handlers': {
 #         'console': {
 #             'class': 'logging.StreamHandler',
 #             'formatter': 'standard',
 #             'level': logging.INFO,
 #         },
 #         'file': {
 #             'class': 'logging.FileHandler',
 #             'filename': 'Logger.log',
 #             'formatter': 'standard',
 #             'level': logging.WARNING,
 #         },
 #     },
 #     'loggers': {
 #         '': {
 #             'handlers': ['console', 'file'],
 #             'level': logging.WARNING,
 #             'propagate': True,
 #         },
 #     }
 # }
 #
 # logging.config.dictConfig(log_config)
 #
 # outLog = OutLog()  # 获取单例实例
 import logging
 import logging.config
 import datetime
 class OutLog:
    """Singleton holding per-user log queues and per-producer completion flags.

    ``queue_dict`` maps a user id to the list of formatted log lines waiting
    to be consumed. ``done_dict`` maps a user id to a
    ``{producer_name: finished}`` dictionary.
    """

    _instance = None
    logger = None

    def __new__(cls):
        """Return the shared instance, creating and initialising it on first use."""
        if cls._instance is None:
            cls._instance = super(OutLog, cls).__new__(cls)
            cls.logger = logging.getLogger("app")  # default logger name is "app"
            cls._instance.queue_dict = {}
            cls._instance.done_dict = {}
        return cls._instance

    def get_queue(self, user_id, producer_name):
        """Register a producer for a user and return a logger bound to that user.

        Args:
            user_id: key identifying the user's queue.
            producer_name: name of the task that will write to the queue.

        Returns:
            A ``UserLogger`` that appends to the queue of ``user_id``.
        """
        self.queue_dict.setdefault(user_id, [])
        # Record the producer as "not finished". The original guard tested
        # ``user_id not in self.done_dict`` and therefore never ran, so
        # ``is_done`` was vacuously true before any work had been done.
        self.done_dict.setdefault(user_id, {}).setdefault(producer_name, False)
        return self.UserLogger(user_id)

    def get_queueData(self, user_id):
        """Return the live list of pending log lines, or ``None`` if the user has no queue."""
        # The original read ``self.user_id``, which OutLog does not define.
        return self.queue_dict.get(user_id)

    def del_queue(self, user_id):
        """Drop the queue and flags of ``user_id`` once all its producers are done."""
        if self.is_done(user_id):
            # pop() instead of del: is_done() is also true for an unknown
            # user, for which del would raise KeyError.
            self.queue_dict.pop(user_id, None)
            self.done_dict.pop(user_id, None)

    class UserLogger:
        """Logger facade that mirrors every record into one user's queue."""

        # Level names accepted by log(); any other name is queued but not
        # forwarded to the logging module.
        _LEVELS = {
            "DEBUG": logging.DEBUG,
            "INFO": logging.INFO,
            "WARNING": logging.WARNING,
            "ERROR": logging.ERROR,
            "CRITICAL": logging.CRITICAL,
        }

        def __init__(self, user_id):
            self.user_id = user_id
            self.logger = OutLog._instance.logger

        def log(self, item: str, level: str):
            """Append a timestamped entry to the user's queue and forward it to the logger."""
            stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            pending = OutLog._instance.queue_dict.get(self.user_id)
            # The queue may already have been removed by del_queue(); in that
            # case only the regular logger receives the record.
            if pending is not None:
                pending.append(f"{stamp}[{level}]: {item}")  # store in this user's queue
            self._log_to_logger(item, level)

        def _log_to_logger(self, item: str, level: str):
            """Emit ``item`` on the wrapped logger at the level named by ``level``."""
            level_no = self._LEVELS.get(level)
            if level_no is not None:
                self.logger.log(level_no, item)

        def info(self, item: str):
            self.log(item, "INFO")

        def warning(self, item: str):
            self.log(item, "WARNING")

        def debug(self, item: str):
            self.log(item, "DEBUG")

        def error(self, item: str):
            self.log(item, "ERROR")

        def critical(self, item: str):
            self.log(item, "CRITICAL")

    def mark_done(self, user_id, producer_name):
        """Flag ``producer_name`` as finished for ``user_id``."""
        # setdefault keeps this safe when the user was never registered or
        # has already been removed.
        self.done_dict.setdefault(user_id, {})[producer_name] = True

    def is_done(self, user_id):
        """Return True when every registered producer of ``user_id`` has finished.

        Also true for a user with no registered producers.
        """
        return all(self.done_dict.get(user_id, {}).values())  # check whether all producers are done
 # Logging configuration, applied when this module is imported.
 log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        # Console output; the handler itself accepts INFO and above.
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        # File output to 'Logger.log' (relative path); WARNING and above only.
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.WARNING,
        },
    },
    'loggers': {
        # The '' key configures the root logger. Its level is WARNING, so a
        # logger that inherits its level from the root drops INFO/DEBUG
        # records before any handler sees them, and the console handler's
        # INFO threshold has no effect for those loggers.
        # NOTE(review): confirm this is intended.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.WARNING,
            'propagate': True,
        },
    }
 }
 logging.config.dictConfig(log_config)
 outLog = OutLog()  # obtain the singleton instance
--- a/test.py
+++ b/test.py
@ -1,109 +1,79 @@
-import time
+# -*- coding:utf-8 -*-
-import json
+# from spire.doc import *
-import math
+# from spire.doc.common import *
-from flask import Flask,Response,request
+#
-from flask_sse import  sse
+# # 创建一个 Document 对象
-from flask_cors import CORS
+# document = Document()
-import re
+# # 加载一个 Word DOCX 文档
-import qwen_agenttext
+# # document.LoadFromFile("C:\\Users\\gy051\\Desktop\\1223.doc")
-app = Flask(__name__)
+# document.LoadFromFile("D:\\数据集\\数据集\\3.doc")
-cros = CORS(app)
+# print(document.Sections.Count)
-# SSE 推送函数
+# for i in range(document.Sections.Count):
-import paddle;
+#     section=document.Sections[i]
-paddle.device.get_available_device()
+#     for x  in range(section.Paragraphs.Count):
 #         paragraph=section.Paragraphs[x]
 #         print(paragraph.Text)
 #     print("---------------------------------")
 #     # 或加载一个 Word DOC 文档
 # # document.LoadFromFile("1223.xml")
 #
 # # # # 设置是否在 HTML 中嵌入图片
 # # document.HtmlExportOptions.ImageEmbedded = True
 # # # document.XHTMLValidateOption.ImageEmbedded = True
 # # #
 # # # # 设置是否将表单字段导出为纯文本在 HTML 中显示
 # # document.HtmlExportOptions.IsTextInputFormFieldAsText = True
 # # # document.XHTMLValidateOption.IsTextInputFormFieldAsText = True
 # # #
 # # # # 设置是否在 HTML 中导出页眉和页脚
 # # document.HtmlExportOptions.HasHeadersFooters = False
 # # # document.XHTMLValidateOption.HasHeadersFooters = True
 # #
 # # # 将 Word 文档保存为 HTML 文件
 # # document.SaveToFile("1223.html", FileFormat.Html)
 # # #
 # document.Close()
 from bs4 import BeautifulSoup
 # 读取HTML文件
 with open('D:\\models\\1223.html', 'r',encoding="utf-8") as file:
    html_content = file.read()
 # 解析HTML文档
 soup = BeautifulSoup(html_content, 'html.parser')
-# SSE 推送路由
+# 用于存储结果的字典
 headings = {}
 current_heading = None
 # 遍历所有的h1, h2, h3等标题
 for element in soup.find_all(['h1', 'h2', 'h3',"h4","h5","h6"]):
    level = int(element.name[1])  # 获取标题级别
    title = element.get_text(strip=True)  # 获取标题文本
-# @app.route('/register', methods=["GET"])
+    # 设置当前标题
-# def register():
+    current_heading = {
-    # 获取客户端标识符
+        'title': title,
-    # client_id = str(uuid.uuid4())
+        'level': level,
-    #
+        'content': []
-    # # 返回 SSE 响应
+    }
    # return jsonify({"client_id": client_id})
    # 将当前标题添加到字典中
    headings[title] = current_heading
-# SSE 推送路由
+    # 寻找当前标题下的内容
    next_element = element.find_next_sibling()
    while next_element and next_element.name not in ['h1', 'h2', 'h3',"h4","h5","h6"]:
        # 判断内容的标签
        if next_element.name in ['p', 'div']:
            current_heading['content'].append(next_element.get_text(strip=False))
        next_element = next_element.find_next_sibling()
 # 输出结果
 for heading in headings.values():
    print(f"标题: {heading['title']} (级别: {heading['level']})")
    print("内容:")
    for content in heading['content']:
        print(f" - {content}")
    print()
 # @app.route('/sse', methods=['POST'])
 # def stream():
 #     # 获取客户端标识符
 #     client_id = 1
 #     print("client_id", client_id)
 #
 #     def aa():
 #         # 循环发送 SSE 数据
 #         for i in range(10):
 #             data = 'Hello, %s!' % client_id + str(i)
 #             print(data)
 #             sse.publish(data, channel=client_id, type='message')
 #             time.sleep(1)
 #         sse.publish("end", channel=client_id, type='message')
 #
 #     # 返回 SSE 响应
 #     response = Response(aa(), mimetype='text/event-stream')
 #     response.headers.add('Cache-Control', 'no-cache')
 #     response.headers.add('Connection', 'keep-alive')
 #     response.headers.add('X-Accel-Buffering', 'no')
 #     return response
 #
 #
 #
 # @app.route('/stream' ,methods=["GET", "POST"])
 # def stream_numbers():
 #     context= request.args.get('context')
 #
 #
 #     headers = {
 #         "Content-Type": "text/event-stream",
 #         "Cache-Control": "no-cache",
 #         "X-Accel-Buffering": "no",
 #         "Access-Control-Allow-Origin": "*",
 #         "Access-Control-Allow-Methods": "GET,POST",
 #         "Access-Control-Allow-Headers": "x-requested-with,content-type",
 #     }
 #     return Response(generate_numbers(),headers=headers)
 # def generate_numbers():
 #     event_id=0
 #     # for number in range(1, 10):
 #     #     json_data = json.dumps({"number": number})
 #     #     print(json_data)
 #     #     event_id += 1
 #     #     yield f"id: {event_id}\n"
 #     #     yield f"event: time-update\n"
 #     #     yield f"data: {json_data}\n\n"  # 每次生成一个数字就发送
 #     json_data = json.dumps({"number": "done"})
 #     yield f"id: {1}\n"
 #     yield f"event: time-update\n"
 #     yield f"data: 34568\n\n"  # 发送完成信号
 # if __name__ == '__main__':
 #
 #
 #     # 读取文件内容
 #     with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
 #         gettext = f.read()
 #     batchNum=20
 #     sentences = re.split(r'[。\n]', gettext)
 #     # 去掉空字符
 #     sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
 #     # 计算总字符数
 #     total_chars = len(sentences)
 #
 #     # 计算有多少份
 #     num_chunks = math.ceil(total_chars / batchNum)
 #
 #     # 按batchNum字为一份进行处理
 #     chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
 #
 #     # 打印每一份的内容
 #     for i, chunk in enumerate(chunks):
 #         print(f"Chunk {i + 1}:")
 #         print(chunk)
 #         print("-" * 40)
 #
 #     # 打印总份数
 #     print(f"Total chunks: {num_chunks}")
 #     app.run(debug=True,port=80)