Browse Source

first commit

master
zhouhaibin 5 months ago
commit
6639ac75dc
  1. 8
      .idea/.gitignore
  2. 6
      .idea/encodings.xml
  3. 6
      .idea/inspectionProfiles/profiles_settings.xml
  4. 7
      .idea/misc.xml
  5. 8
      .idea/modules.xml
  6. 10
      .idea/python项目39.iml
  7. BIN
      __pycache__/baidusearch.cpython-39.pyc
  8. BIN
      __pycache__/checkCompanyName.cpython-39.pyc
  9. BIN
      __pycache__/checkDocumentError.cpython-39.pyc
  10. BIN
      __pycache__/checkPlaceName.cpython-39.pyc
  11. BIN
      __pycache__/checkRepeatText.cpython-39.pyc
  12. BIN
      __pycache__/json_repair.cpython-39.pyc
  13. BIN
      __pycache__/main.cpython-39.pyc
  14. BIN
      __pycache__/qwen_agenttext.cpython-39.pyc
  15. BIN
      __pycache__/test.cpython-39.pyc
  16. 258
      baidusearch.py
  17. 64
      cewenj.py
  18. 205
      checkCompanyName.py
  19. 1371
      checkCompanyName.txt
  20. 220
      checkDocumentError.py
  21. 212
      checkPlaceName.py
  22. 292
      checkRepeatText.py
  23. 173
      checkTitleName.py
  24. 176
      daijian方案.py
  25. 712
      json_repair.py
  26. 161
      main.py
  27. 132
      qwen_agenttext.py
  28. 109
      test.py
  29. BIN
      workspace/1.png
  30. BIN
      workspace/image14.png
  31. BIN
      workspace/image15.png
  32. BIN
      workspace/image16.png
  33. BIN
      workspace/image17.png
  34. BIN
      workspace/image18.png
  35. BIN
      workspace/image19.png
  36. BIN
      workspace/image20.png
  37. BIN
      workspace/tools/code_interpreter/05613c9c-c910-455d-8c8b-62b7dc243b2a.png
  38. BIN
      workspace/tools/code_interpreter/1560f103-f2dc-49e3-88c2-35f5d500bc1d.png
  39. BIN
      workspace/tools/code_interpreter/4aa3a1fe-7fc2-440f-8bd9-653ee1721776.png
  40. BIN
      workspace/tools/code_interpreter/54b7ad57-9c89-4977-b49a-eaf7e60b9656.png
  41. BIN
      workspace/tools/code_interpreter/c8cba059-ac85-42b0-b197-1c8e1e7182c9.png
  42. 12
      workspace/tools/code_interpreter/kernel_connection_file_0eb57682-3a22-44c8-bedb-a4871b813c3c_19796.json
  43. 12
      workspace/tools/code_interpreter/kernel_connection_file_113f0326-0345-475c-85c1-86af71d668c0_24876.json
  44. 12
      workspace/tools/code_interpreter/kernel_connection_file_599899c4-4f00-44c1-bba5-1bcc31eb535c_12240.json
  45. 12
      workspace/tools/code_interpreter/kernel_connection_file_a3131ded-afec-43fa-95eb-d2f35548a411_39868.json
  46. 12
      workspace/tools/code_interpreter/kernel_connection_file_b4447d65-4542-4bd2-89ff-b33b5fb00ac5_1068.json
  47. 12
      workspace/tools/code_interpreter/kernel_connection_file_d624f7a6-914d-48c1-b902-4e298f92b671_20484.json
  48. 12
      workspace/tools/code_interpreter/kernel_connection_file_ec74ca73-6455-4a78-96b1-542747f19a25_39260.json
  49. 3
      workspace/tools/code_interpreter/launch_kernel_0eb57682-3a22-44c8-bedb-a4871b813c3c_19796.py
  50. 3
      workspace/tools/code_interpreter/launch_kernel_113f0326-0345-475c-85c1-86af71d668c0_24876.py
  51. 3
      workspace/tools/code_interpreter/launch_kernel_599899c4-4f00-44c1-bba5-1bcc31eb535c_12240.py
  52. 3
      workspace/tools/code_interpreter/launch_kernel_a3131ded-afec-43fa-95eb-d2f35548a411_39868.py
  53. 3
      workspace/tools/code_interpreter/launch_kernel_b4447d65-4542-4bd2-89ff-b33b5fb00ac5_1068.py
  54. 3
      workspace/tools/code_interpreter/launch_kernel_d624f7a6-914d-48c1-b902-4e298f92b671_20484.py
  55. 3
      workspace/tools/code_interpreter/launch_kernel_ec74ca73-6455-4a78-96b1-542747f19a25_39260.py
  56. BIN
      workspace/tools/code_interpreter/temp_image.png
  57. 1
      workspace/tools/doc_parser/53dea512c5e030d7ad12f34dceaecc2a3c5bcb058907ae3495d60e5876b079a2_500
  58. 8699
      workspace/tools/simple_doc_parser/53dea512c5e030d7ad12f34dceaecc2a3c5bcb058907ae3495d60e5876b079a2_ori
  59. 140
      代码段存储.py
  60. 118
      文档一二级标题识别与提取.py
  61. 282
      文档图片提取.py
  62. 133
      服务器文件/checkCompanyName.py
  63. 226
      服务器文件/checkDocumentError.py
  64. 153
      服务器文件/checkPlaceName.py
  65. 160
      服务器文件/checkRepeatText.py
  66. 712
      服务器文件/json_repair.py
  67. 45
      服务器文件/main.py

8
.idea/.gitignore

@ -0,0 +1,8 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

6
.idea/encodings.xml

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/ce.txt" charset="GBK" />
</component>
</project>

6
.idea/inspectionProfiles/profiles_settings.xml

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.9 (venv) (2)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (venv) (2)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/python项目39.iml" filepath="$PROJECT_DIR$/.idea/python项目39.iml" />
</modules>
</component>
</project>

10
.idea/python项目39.iml

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

BIN
__pycache__/baidusearch.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/checkCompanyName.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/checkDocumentError.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/checkPlaceName.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/checkRepeatText.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/json_repair.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/main.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/qwen_agenttext.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/test.cpython-39.pyc

Binary file not shown.

258
baidusearch.py

@ -0,0 +1,258 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Charles on 2018/10/10
# Function:
import sys
import requests
from bs4 import BeautifulSoup
ABSTRACT_MAX_LENGTH = 300 # abstract max length
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 '
'(KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/41.0.2228.0 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, '
'like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) '
'Gecko/20070404 K-Ninja/2.1.3',
'Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; '
'en-US) iNet Browser 4.7',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) '
'Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866'
]
# 请求头信息
HEADERS = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Content-Type": "application/x-www-form-urlencoded",
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
"Referer": "https://www.baidu.com/",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9"
}
baidu_host_url = "https://www.baidu.com"
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&wd="
session = requests.Session()
session.headers = HEADERS
def search(keyword, num_results=10, debug=0):
    """
    Search Baidu for *keyword* and collect up to *num_results* entries.

    :param keyword: query string; falsy input aborts the search
    :param num_results: maximum number of result dicts to return
    :param debug: when truthy, print per-page progress information
    :return: list of result dicts (title/abstract/url/rank), or None when
        *keyword* is empty
    """
    if not keyword:
        return None
    collected = []
    page = 1
    # Start from the first results page; each parse returns the next page URL.
    next_page = baidu_search_url + keyword
    while len(collected) < num_results:
        batch, next_page = parse_html(next_page, rank_start=len(collected))
        if batch:
            collected.extend(batch)
            if debug:
                print("---searching[{}], finish parsing page {}, results number={}: ".format(keyword, page, len(batch)))
                for entry in batch:
                    print(str(entry))
        if not next_page:
            # No next-page link: we reached the final results page.
            if debug:
                print(u"already search the last page。")
            break
        page += 1
    if debug:
        print("\n---search [{}] finished. total results number={}".format(keyword, len(collected)))
    if len(collected) > num_results:
        return collected[:num_results]
    return collected
def parse_html(url, rank_start=0, debug=0):
    """
    Fetch one Baidu results page and parse its result entries.

    :param url: results-page URL to fetch
    :param rank_start: rank already assigned to earlier results; each parsed
        entry receives the next consecutive rank
    :param debug: when truthy, print parse exceptions
    :return: (list of result dicts, next-page URL or None); returns
        (None, None) when the page cannot be fetched or parsed at all
    """
    try:
        res = session.get(url=url)
        res.encoding = "utf-8"
        root = BeautifulSoup(res.text, "lxml")
        list_data = []
        div_contents = root.find("div", id="content_left")
        for div in div_contents.contents:
            # Skip children that are not Tag elements (e.g. bare text nodes).
            if type(div) != type(div_contents):
                continue
            class_list = div.get("class", [])
            if not class_list:
                continue
            # Only "c-container" divs are actual result cards.
            if "c-container" not in class_list:
                continue
            title = ''
            # NOTE(review): this local reuse shadows the `url` parameter.
            url = ''
            abstract = ''
            try:
                # Extract title / link / abstract; the HTML shape differs per
                # result template, hence the branches below.
                if "xpath-log" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        if div.a:
                            url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                elif "result-op" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        # abstract = div.text.strip()
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                else:
                    # Remaining templates are distinguished by the "tpl" attribute.
                    if div.get("tpl", "") != "se_com_default":
                        if div.get("tpl", "") == "se_st_com_abstract":
                            if len(div.contents) >= 1:
                                title = div.h3.text.strip()
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                        else:
                            if len(div.contents) >= 2:
                                if div.h3:
                                    title = div.h3.text.strip()
                                    url = div.h3.a['href'].strip()
                                else:
                                    title = div.contents[0].text.strip()
                                    url = div.h3.a['href'].strip()
                                # abstract = div.contents[-1].text
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                    else:
                        if div.h3:
                            title = div.h3.text.strip()
                            url = div.h3.a['href'].strip()
                        else:
                            title = div.contents[0].text.strip()
                            url = div.h3.a['href'].strip()
                        if div.find("div", class_="c-abstract"):
                            abstract = div.find("div", class_="c-abstract").text.strip()
                        elif div.div:
                            abstract = div.div.text.strip()
                        else:
                            abstract = div.text.strip()
            except Exception as e:
                # A malformed card only skips this entry, not the whole page.
                if debug:
                    print("catch exception duration parsing page html, e={}".format(e))
                continue
            # Truncate over-long abstracts.
            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]
            rank_start+=1
            list_data.append({"title": title, "abstract": abstract, "url": url, "rank": rank_start})
        # Locate the next-page button.
        next_btn = root.find_all("a", class_="n")
        # Last page: only a "previous page" link remains, so return data with
        # no next-page URL.
        if len(next_btn) <= 0 or u"上一页" in next_btn[-1].text:
            return list_data, None
        next_url = baidu_host_url + next_btn[-1]["href"]
        return list_data, next_url
    except Exception as e:
        if debug:
            print(u"catch exception duration parsing page html, e:{}".format(e))
        return None, None
def run():
    """
    CLI entry point: read keyword / result count / debug flag from argv,
    falling back to an interactive prompt, then run the search and print
    the results.
    """
    default_keyword = u"长风破浪小武哥"
    num_results = 10
    debug = 0
    prompt = """
baidusearch: not enough arguments
[0]keyword: keyword what you want to search
[1]num_results: number of results
[2]debug: debug switch, 0-close, 1-open, default-0
eg: baidusearch NBA
baidusearch NBA 6
baidusearch NBA 8 1
"""
    # Three args: keyword, result count, debug flag.
    if len(sys.argv) > 3:
        keyword = sys.argv[1]
        try:
            num_results = int(sys.argv[2])
            debug = int(sys.argv[3])
        except:
            # NOTE(review): bare except silently keeps defaults on bad input.
            pass
    elif len(sys.argv) > 1:
        keyword = sys.argv[1]
    else:
        # No args: show usage and ask interactively.
        print(prompt)
        keyword = input("please input keyword: ")
        # sys.exit(1)
    if not keyword:
        keyword = default_keyword
    print("---start search: [{}], expected number of results:[{}].".format(keyword, num_results))
    results = search(keyword, num_results=num_results, debug=debug)
    # search() returns None for empty keywords / total failure.
    if isinstance(results, list):
        print("search results:(total[{}]items.)".format(len(results)))
        for res in results:
            print("{}. {}\n {}\n {}".format(res['rank'], res["title"], res["abstract"], res["url"]))
    else:
        print("start search: [{}] failed.".format(keyword))
if __name__ == '__main__':
run()

64
cewenj.py

@ -0,0 +1,64 @@
# Experiment script: ask a locally-served Qwen model (via qwen-agent) to find
# the chapter name describing project construction, using RAG over a file.
from qwen_agent.agents import Assistant
# from qwen_agent.agents.doc_qa import ParallelDocQA
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
prompt='''
请找是描述项目建设的章节名称
'''
# NOTE(review): the {'file': ''} attachment below is an empty path — presumably
# a real document path should be supplied; confirm before running.
messages = [{'role': 'user', 'content': [{'text': prompt}, {'file': ''}]}]
for rsp in bot.run(messages):
    print(rsp)
# messages = [{'role': 'user', 'content': [{'text':prompt}]}]
# runList=[]
# for rsp in bot.run(messages):
#     print(rsp)
import re
# Disabled experiment: read a .docx, collect non-empty paragraphs, and search
# for paragraphs mentioning a keyword.
# from docx import Document
#
# document = Document('747991ddb29a49da903210959076bb9f.docx')
# # Read the docx content paragraph by paragraph
# levelList = []
# words = []
# addStart = False
# levelText = ""
# i = 0
# for paragraph in document.paragraphs:
#     # Determine this paragraph's heading level
#     # isTitle() stands in here; see the method described below
#     text = paragraph.text
#     if text.strip():  # non-empty check
#         # print("non-empty")
#         words.append(text)
#         # level = isTitle(paragraph)
#         # if(addStart and level=="0"):
#         #     addStart=False
#         # if(level=="0" and text.find("详细设计方案")>=0):
#         #     addStart=True
#         # if level:
#         #     levelList.append("{}:".format(level)+paragraph.text)
#         #     levelText=text
#         # else:
#         #     if addStart:
#         #         if(text.startswith("图") or text.startswith("注:")):
#         #             continue
#         #         i=i+1
#         #         words.append("第{}个段落:".format(i)+text)
#
# # Join all paragraph texts into one newline-separated string
# print(len(words))
# text = '\n'.join(words)
# paragraphs = re.findall(r'.*?' + re.escape('宁波市') + r'.*?\n', text)
# print(paragraphs)
from langchain_community.document_loaders import TextLoader
loader = TextLoader('checkRepeatText.txt')
docs = loader.load()

205
checkCompanyName.py

@ -0,0 +1,205 @@
# -*- coding:utf-8 -*-
import time
from docx import Document
from paddlenlp import Taskflow
from qwen_agent.agents import Assistant
import re
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.

    Replacement for python-docx's loader: relationships whose target is the
    literal 'NULL' (emitted by some document generators) are skipped so such
    .docx files can still be opened instead of raising on load.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken relationships pointing at a non-existent NULL target.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkCompanyName")
prompt = '''
.根据上述文本判断是否为具体的公司或组织名称你可以使用工具利用互联网查询
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校行业类型其他]选项中选择答案,
回答格式[{companyName名称,"回答":"答案"}{companyName名称,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
'''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"
)
def getDocxToTextAll(name):
    """
    Extract all non-empty paragraph text from a .docx file and write it,
    newline-separated, to checkCompanyName.txt (UTF-8).

    :param name: path of the .docx document to read
    :raises: propagates python-docx errors when the file cannot be opened
    """
    document = Document(name)
    # Collect every non-empty paragraph, in document order.
    # (Removed dead locals levelList/addStart/levelText/i kept from an older
    # heading-extraction experiment.)
    words = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            words.append(text)
    # Join with newlines and persist for the later pipeline steps.
    text = '\n'.join(words)
    with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
def companyNameTask(text):
    """
    Generator: mine company/organization names from *text* with PaddleNLP's
    knowledge-mining Taskflow.

    Yields progress strings while working; the FINAL yielded item is the
    de-duplicated list of extracted organization names.
    """
    yield "文档公司或组织名称检查---启动中...."
    wordtag = Taskflow("knowledge_mining",device_id=0)
    batchNum=20  # sentences per Taskflow call
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Total sentence count (variable name says chars, but it counts sentences).
    total_chars = len(sentences)
    # Number of batches.
    num_chunks = math.ceil(total_chars / batchNum)
    # Split into batches of batchNum sentences each.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"
        wenBen=".".join(chunk)
        try:
            res = wordtag(wenBen)
        except Exception as e:
            logging.warning(chunk)
            # NOTE(review): root `logging` is used here instead of the module
            # `logger`, and `e` is passed with no %s placeholder — verify.
            logging.warning("文档公司或组织名称检查---词类分析出错",e)
            continue
        isplace = False
        for zuhe in res[0]['items']:
            # If the previous token was an organization and this one is too,
            # append it to the previous entry (names can span several tokens).
            zhi = zuhe.get("wordtag_label")
            if isplace:
                name = placeList[len(placeList) - 1]
                if zhi.find("组织机构类") >= 0: # or zuhe[1] == "ns"
                    isplace = True
                    new_text = zuhe['item'].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
            if zhi.find("组织机构类") >= 0:
                isplace = True
                new_text = zuhe['item'].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    # All batches processed.
    yield "文档公司或组织名称检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList=list(dict.fromkeys(placeList))
    yield placeList
def checkCompanyName(filename):
    """
    Generator pipeline: extract text from *filename* (.docx), mine company /
    organization names, have the LLM classify them, and yield progress
    strings followed by a final report string.

    :param filename: path of the .docx document to check
    """
    yield f"文档公司或组织名称检查---开始处理文档..."
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        # Consistency fix: use the module logger instead of the root logger.
        logger.warning(e)
        yield "文档公司或组织名称检查---文档无法打开,请检查文档内容"
        return
    with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档公司或组织名称检查---开始解析文档..."
    final_list = []  # robustness: defined even if the task yields no list
    for item in companyNameTask(gettext):
        if isinstance(item, str):
            yield item  # progress message
        else:
            final_list = item  # last yielded item is the name list
    propnStr = ",".join(final_list)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档公司或组织名称检查---结果生成中..."
    cishu = 0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
        yield "文档公司或组织名称检查---结果生成中" + '.' * cishu
        cishu += 1
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = []
    # NOTE(review): '非泛化的公司或组织名称' is not among the answer options
    # listed in the prompt above — confirm which LLM answers should be kept.
    for place in parsed_data:
        try:
            if place['回答'] == '非泛化的公司或组织名称':
                error_places.append(place)
        except Exception as e:
            logger.warning(place)
            # Bug fix: `e` was passed with no %s placeholder and was dropped.
            logger.warning("文档公司或组织名称检查---组织提出出错: %s", e)
            continue
    logger.info(error_places)
    returnInfo = "发现异常公司或组织名称<br>"
    if len(error_places) > 0:
        for t in error_places:
            keyword = t['companyName'].replace("\n", "")
            # Find a paragraph containing the name, for context in the report.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            if paragraphs:
                t["yuanwen"] = paragraphs[0]
                yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
            else:
                # Bug fix: paragraphs[0] raised IndexError when the
                # newline-stripped name was not found verbatim in the text.
                t["yuanwen"] = keyword
                yuanwen = f"**{keyword}**"
            returnInfo += "原文:" + yuanwen + "<br>异常公司或组织名称:**" + keyword + "**!请注意" + "<br>"
        logger.info(returnInfo)
        yield returnInfo
    else:
        yield "**未发现异常公司或组织名称**<br>"

1371
checkCompanyName.txt

File diff suppressed because it is too large

220
checkDocumentError.py

@ -0,0 +1,220 @@
# -*- coding:utf-8 -*-
# from pycorrector import MacBertCorrector
# m = MacBertCorrector("shibing624/macbert4csc-base-chinese")
from qwen_agent.agents import Assistant
from docx import Document
from pprint import pprint
import re
from paddlenlp import Taskflow
import json
import time
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import asyncio
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.

    Replacement for python-docx's loader: relationships whose target is the
    literal 'NULL' (emitted by some document generators) are skipped so such
    .docx files can still be opened instead of raising on load.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken relationships pointing at a non-existent NULL target.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkDocumentError")
llm_cfg = {
# 'model': 'qwen1.5-72b-chat',
'model': "qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
)
# prompt='''
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案,
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# '''
prompt = '''
请回答以上问题[]选项中选择答案,原文内容标点符号保持不变如果有错请给出解析没有错则不用给解析
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}]不做过多的解释,严格按回答格式作答;
'''
def getDocxToTextAll(name):
    """
    Extract all non-empty paragraph text from a .docx file and write it,
    newline-separated, to checkDocumentError.txt (UTF-8).

    :param name: path of the .docx document to read
    :raises: propagates python-docx errors when the file cannot be opened
    """
    document = Document(name)
    # Collect every non-empty paragraph, in document order.
    # (Removed dead locals levelList/addStart/levelText/i kept from an older
    # heading-extraction experiment.)
    words = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            words.append(text)
    # Join with newlines and persist for the later pipeline steps.
    text = '\n'.join(words)
    with open("checkDocumentError.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
def getDocumentError(filename):
    """
    Generator pipeline for typo checking: extract the document text, run the
    correction task, and yield progress strings followed by a final report.

    :param filename: path of the .docx document to check
    """
    yield f"文档纠错---开始处理文档..."
    try:
        getDocxToTextAll(filename)
    except Exception as exc:
        logger.warning(exc)
        yield "文档无法打开,请检查文档内容"
        return
    with open("checkDocumentError.txt", "r", encoding='utf-8') as fh:
        gettext = fh.read()
    yield f"文档纠错---开始解析文档..."
    typos = []
    for step in documentErrorTask(gettext):
        if not isinstance(step, str):
            typos = step      # the final yielded item is the typo list
        else:
            yield step        # progress message
    if typos:
        report = "发现错别字<br>"
        for entry in typos:
            source_text = entry["placeName"].replace("\n", "")
            suggestion = entry["jianyi"].replace("\n", "")
            report += f"原文:{source_text}<br>建议:**{suggestion}**<br>"
        yield report
        logger.info(report)
    else:
        yield "**未发现错别字**"
def documentErrorTask(text):
    """
    Generator: run PaddleNLP text-correction over *text* in sentence batches,
    then ask the LLM to confirm the suspected typos.

    Yields progress strings while working; the FINAL yielded item is the list
    of confirmed typo dicts (keys include "placeName" and "jianyi").
    """
    yield "文档纠错---启动中...."
    corrector = Taskflow("text_correction", device_id=1)
    batchNum = 20  # sentences per corrector call
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    total_chars = len(sentences)  # sentence count (name is historical)
    num_chunks = math.ceil(total_chars / batchNum)
    # Split into batches of batchNum sentences each.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    # (Removed unused local `placeList`.)
    err = []
    for i, chunk in enumerate(chunks):
        yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
        try:
            res = corrector(chunk)
        except Exception as e:
            logger.warning(chunk)
            # Bug fix: `e` was passed with no %s placeholder and was dropped.
            logger.warning("文档纠错--错别字识别出错: %s", e)
            continue
        lines_with_greeting = [place for place in res if len(place['errors']) > 0]
        if len(lines_with_greeting) > 0:
            num = 0
            wenti = []         # numbered questions for the LLM
            keyword_list = []  # original sentences, indexed by question number
            for t in lines_with_greeting:
                temp_errorWords = []
                keyword = t['source']
                keyword_list.append(keyword)
                for item in t["errors"]:
                    for key, value in item['correction'].items():
                        temp_errorWords.append(key)
                wenti.append(
                    "{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
                num += 1
            words = "\n".join(wenti)
            messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
            runList = []
            yield f"文档纠错---内容解析中..."
            cishu = 0
            for rsp in bot.run(messages):
                runList.append(rsp)
                if cishu > 3:
                    cishu = 0
                yield "文档纠错---内容解析中" + '.' * cishu
                cishu += 1
            data = runList[len(runList) - 1][0]["content"]
            parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
            resListerr = []
            for place in parsed_data:
                try:
                    # NOTE(review): comparing 回答 to the empty string looks
                    # suspicious given the prompt asks for an answer — confirm.
                    if place['回答'] == '':
                        # Map the LLM's numeric index back to the sentence.
                        place["placeName"] = keyword_list[int(place["placeName"])]
                        place["jianyi"] = place["解析"]
                        resListerr.append(place)
                except Exception as e:
                    logger.warning(parsed_data)
                    logger.warning(place)
                    # Bug fix: `e` was passed with no %s placeholder and dropped.
                    logger.warning("文档纠错--错别字提取出错: %s", e)
                    continue
            if (len(resListerr) > 0):
                err.extend(resListerr)
    # Bug fix: this completion message previously said 文档地名检查 (place-name
    # check), copy-pasted from checkPlaceName.py.
    yield "文档纠错---文档解析完成"
    yield err

212
checkPlaceName.py

@ -0,0 +1,212 @@
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.

    Replacement for python-docx's loader: relationships whose target is the
    literal 'NULL' (emitted by some document generators) are skipped so such
    .docx files can still be opened instead of raising on load.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken relationships pointing at a non-existent NULL target.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkPlaceName")
prompt='''
.上述文本判断地名是否正确你可以使用工具利用互联网查询你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# prompt='''
# .请回答以上问题,
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# 不做过多的解释,严格按回答格式作答;
# '''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
)
#获取全文内容
def getDocxToTextAll(docxPath):
    """
    Extract all non-empty paragraph text from a .docx file and write it,
    newline-separated, to checkPlaceName.txt (UTF-8).

    :param docxPath: path of the .docx document to read
    :raises: propagates python-docx errors when the file cannot be opened
    """
    document = Document(docxPath)
    # Collect every non-empty paragraph, in document order.
    # (Removed dead locals levelList/addStart/levelText/i kept from an older
    # heading-extraction experiment.)
    words = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            words.append(text)
    # Join with newlines and persist for the later pipeline steps.
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
#得到全文和地名有关的内容
def placeNameTask(text):
    """
    Generator: extract place / organization names from *text* with PaddleNLP's
    NER Taskflow.

    Yields progress strings while working; the FINAL yielded item is the
    de-duplicated list of extracted names.
    """
    yield "文档地名检查---启动中...."
    tagTask = Taskflow("ner",device_id=2)
    batchNum=20  # sentences per NER call
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Total sentence count (variable name says chars, but it counts sentences).
    total_chars = len(sentences)
    # Number of batches.
    num_chunks = math.ceil(total_chars / batchNum)
    # Split into batches of batchNum sentences each.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"
        wenBen=".".join(chunk)
        try:
            res = tagTask(wenBen)
        except Exception as e:
            logger.warning(chunk)
            # NOTE(review): `e` is passed with no %s placeholder — verify.
            logger.warning("文档地名检查---解析地名出错",e)
            continue
        isplace = False
        for zuhe in res:
            # If the previous token was a place/org and this one is too, append
            # it to the previous entry (names can span several tokens).
            if isplace:
                name = placeList[len(placeList) - 1]
                if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0: # or zuhe[1] == "ns"
                    isplace = True
                    new_text = zuhe[0].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
            if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:
                isplace = True
                new_text = zuhe[0].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    # All batches processed.
    yield "文档地名检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList=list(dict.fromkeys(placeList))
    yield placeList
#主方法
def checkPlaceName(filename):
    """
    Generator pipeline: extract text from *filename* (.docx), detect place /
    organization names with NER, have the LLM verify them, and yield progress
    strings followed by a final report string.

    :param filename: path of the .docx document to check
    """
    yield f"文档地名检查---开始处理文档..."
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        logger.warning(e)
        yield "文档地名检查---文档无法打开,请检查文档内容"
        return
    with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档地名检查---开始解析文档..."
    final_list = []  # robustness: defined even if placeNameTask yields no list
    for item in placeNameTask(gettext):
        if isinstance(item, str):
            yield item  # progress message
        else:
            final_list = item  # last yielded item is the name list
    propnStr = ",".join(final_list)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档地名检查---结果生成中..."
    cishu = 0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
        yield "文档地名检查---结果生成中" + '.' * cishu
        cishu += 1
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = []
    # Keep only the names the LLM judged as wrong.
    for place in parsed_data:
        try:
            if place['回答'] == '错误':
                error_places.append(place)
        except Exception as e:
            logger.warning(place)
            # Bug fix: `e` was passed with no %s placeholder and was dropped.
            logger.warning("文档地名检查---组织提出出错: %s", e)
            continue
    logger.info(error_places)
    returnInfo = "发现异常地名<br>"
    if len(error_places) > 0:
        for t in error_places:
            keyword = t['placeName'].replace("\n", "")
            # Find a paragraph containing the name, for context in the report.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            if paragraphs:
                yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
            else:
                # Bug fix: paragraphs[0] raised IndexError when the
                # newline-stripped name was not found verbatim in the text.
                yuanwen = f"**{keyword}**"
            returnInfo += "原文:" + yuanwen + "<br>出现异常地名:**" + keyword + "**!请注意" + "<br>"
        yield returnInfo
        logger.info(returnInfo)
    else:
        # Bug fix: message previously read "未发现发现异常地名" (doubled 发现).
        yield "**未发现异常地名**"

292
checkRepeatText.py

@ -0,0 +1,292 @@
import uuid
from langchain_chroma import Chroma
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qwen_agent.agents import Assistant
import json_repair
from paddlenlp import Taskflow
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
device_id=0
import re
import time
from docx import Document
import shutil
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import logging
import logging.config
# Logging setup: everything at INFO and above goes to both the console and Logger.log.
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        # Root logger: all module loggers propagate here.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkRepeatText")
def load_from_xml_v2(baseURI, rels_item_xml):
    """Patched relationship loader for python-docx.

    Builds a |_SerializedRelationships| collection from *rels_item_xml*,
    skipping relationships whose target is the bogus 'NULL' reference that
    some document generators emit (these crash the stock loader).
    Returns an empty collection when *rels_item_xml* is None.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for rel in parse_xml(rels_item_xml).Relationship_lst:
        # Drop broken relationships pointing at '../NULL' / 'NULL'.
        if rel.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, rel))
    return serialized
_SerializedRelationships.load_from_xml = load_from_xml_v2
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from a '<w:outlineLvl w:val="N"/>' element.

    Args:
        inputXml: raw XML of a paragraph or style element known to contain
            a '<w:outlineLvl' tag.

    Returns:
        The level number as a string (e.g. "0" for a level-1 heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape (SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for plain body text.

    Returns None for blank paragraphs and paragraphs without an outline
    level; otherwise the level as a string ("0" = top-level heading,
    "1" = second level, "2" = third level), taken either from the
    paragraph's own XML or from its style inheritance chain.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None
    # An outline level set directly on the paragraph wins.
    para_xml = paragraph._p.xml
    if '<w:outlineLvl' in para_xml:
        return getOutlineLevel(para_xml)
    # Otherwise walk the style chain (style -> base_style -> ...) looking for one.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style
    # No outline level anywhere: ordinary body text.
    return None
#寻找标题名称
def findTitleName(docxPath):
    """Locate the top-level chapter holding the detailed design content.

    Generator: first yields a progress message, then either the matched
    chapter title or the fixed "not found" message (callers compare against
    that exact string).

    Args:
        docxPath: path to the .docx file to scan.
    """
    yield '文档相似性检查----检查是否存在详细设计方案'
    document = Document(docxPath)
    # Walk the document paragraph by paragraph, collecting level-0 headings.
    titleWords=[]
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            # Heading level of this paragraph (see isTitle()).
            level = isTitle(paragraph)
            if level=="0":
                firstTitle+=1
                secondTitle = 0
                # Attachment chapters ("附件...") are not candidates.
                if(text.find("附件")>=0):
                    continue
                # NOTE(review): "一级标题:".format(firstTitle) has no {} placeholder,
                # so the chapter number is silently dropped — confirm intent.
                titleWords.append("一级标题:".format(firstTitle)+text)
            elif level=="1":
                secondTitle+=1
                sanjiTitle=0
            elif level=="2":
                sanjiTitle += 1
    # Dedicated local LLM endpoint used only for this classification step.
    findTitleName_llm_cfg = {
        #'model': 'qwen1.5-72b-chat',
        'model':"qwen2-72b",
        'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
        # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
                                  name='Assistant',
                                  )
    prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案详细建设方案为最相关的优先选择
类似设计方案服务方案建设方案为次相关次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个{"name":"一级标题名称","answer":"存在"}{"name":"","answer":"不存在"}不做过多的解释,严格按回答格式作答
'''
    messages = [({'role': 'user', 'content': "\n".join(titleWords)+prompt})]
    runList=[]
    for rsp in findTitleName_bot.run(messages):
        runList.append(rsp)
    # The assistant streams partial responses; only the last one is complete.
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    logger.info(parsed_data)
    if(parsed_data["answer"]=="存在"):
        yield parsed_data["name"]
    else:
        yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
#获取文档中 详细设计方案 章节的所有内容
def getDocxToText(docxPath,titleName,vector_store_path):
    """Extract the body text of chapter *titleName* from *docxPath*, index it
    into a Chroma vector store, and return the pieces needed for similarity
    search.

    Args:
        docxPath: path to the .docx file.
        titleName: level-0 chapter title to extract (substring match either way).
        vector_store_path: directory for the throw-away Chroma store.

    Returns:
        (words, uuids, vectorstore): collected paragraph strings, their Chroma
        ids, and the populated vector store.

    Raises:
        Exception: when no qualifying paragraph was collected.
    """
    document = Document(docxPath)
    # Walk the docx paragraph by paragraph.
    levelList=[]
    words=[]
    addStart = False   # True while we are inside the target chapter
    levelText=""       # heading breadcrumb prefixed to each collected paragraph
    i = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            if titleName:
                level = isTitle(paragraph)
                # Any new level-0 heading ends collection of the chapter.
                if(addStart and level=="0"):
                    addStart=False
                # Entering the chapter: a level-0 heading matching titleName.
                if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)):
                    addStart=True
                if level:
                    levelList.append("{}".format(level)+paragraph.text)
                    levelText=f"{int(level)+1}级标题-"+text
                else:
                    if addStart:
                        # NOTE(review): text.startswith("") is always True, so this
                        # `continue` fires for every body paragraph and the append
                        # below looks unreachable — a separator character was likely
                        # lost from this source; confirm against the original file.
                        if(text.startswith("") or text.startswith("注:")):
                            continue
                        # Keep only substantial paragraphs (> 30 chars).
                        if(len(text)>30):
                            i=i+1
                            words.append("{}".format(levelText)+text)
    # Join all collected paragraphs, newline-separated.
    if len(words)==0:
        raise Exception("checkRepeatText,获取长度为0")
    text = '\n'.join(words)
    # Persist the text for the langchain TextLoader.
    # NOTE(review): open() without encoding= depends on the locale default;
    # consider encoding='utf-8'.
    with open("checkRepeatText.txt", 'w', ) as txt_file:
        txt_file.write(text)
    time.sleep(3)
    loader = TextLoader(file_path='checkRepeatText.txt')
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10, add_start_index=True,
                                                   separators=["\n\n", "\n"])
    splits = text_splitter.split_documents(docs)
    uuids = []
    for i in range(len(splits)):
        uuids.append(str(uuid.uuid4()))
    logging.info(f"checkRepeatTextuuidLen{len(uuids)}")
    vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
    vectorstore.add_documents(documents=splits, ids=uuids)
    # Poll until the freshly added documents become searchable.
    while True:
        time.sleep(0.3)
        ress = vectorstore.similarity_search(words[0])
        if (len(ress) > 0):
            break
    return words,uuids,vectorstore
# @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename):
    """Scan the detailed-design chapter of *filename* for near-duplicate passages.

    Generator: yields progress strings for the UI, then a final report listing
    pairs of paragraphs whose text-similarity score exceeds 0.90.
    """
    yield "文档相似性检查---启动中...."
    # One throw-away Chroma store per request so concurrent checks don't collide.
    vector_store_path="vector_store"+str(uuid.uuid4())
    for titleName in findTitleName(filename):
        yield titleName
        if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"):
            try:
                yield "文档相似性检查----文档内容解析中"
                words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path)
            except Exception as e:
                yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败"
                return
            global device_id
            # NOTE(review): device_id is declared global but a fixed device 3 is
            # used here — confirm which GPU this should run on.
            similarity = Taskflow("text_similarity",device_id=3)
            reslist = []
            count = 0
            for i in words:
                count += 1
                yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}"
                result = vectorstore.similarity_search(i)
                # NOTE(review): the empty-string split/find arguments below look
                # like a lost separator character (cf. the commented ':' variant)
                # — verify against the original file.
                textTag = i.split("")[0]
                for content in result:
                    text = content.page_content
                    tag = text.split("")[0].replace('\n', '')
                    # Skip hits from the same heading as the query paragraph.
                    if (textTag.find(tag) >= 0):
                        continue
                    try:
                        res = similarity([[i[i.find('') + 1:], text[text.find('') + 1:]]])
                    except Exception as e:
                        logger.warning("文档相似性检查--发生异常:",e)
                        logger.warning(i)
                        logger.warning(text)
                        # BUG FIX: without this `continue`, `res` below was unbound
                        # (NameError) or stale from the previous pair.
                        continue
                    if (res[0]["similarity"] > 0.90):
                        # Record the pair unless this paragraph was already reported.
                        if (len(reslist) > 0):
                            isExist = False
                            for neirong in reslist:
                                if i in neirong.values():
                                    isExist = True
                                    break
                            if not isExist:
                                reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
                        else:
                            reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
            # Drop the per-request vector store now that scanning is done.
            shutil.rmtree(vector_store_path)
            logger.info("已删除")
            logger.info(reslist)
            resInfo=f"{titleName}章节,发现相似内容:<br>"
            if(len(reslist)>0):
                for res in reslist:
                    resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find('')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find('') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find('')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find('') + 1:]+"<br>以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】<br>"
                yield resInfo
                logger.info(resInfo)
            else:
                yield "未发现相似内容"

173
checkTitleName.py

@ -0,0 +1,173 @@
from docx import Document
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
def load_from_xml_v2(baseURI, rels_item_xml):
    """Patched relationship loader for python-docx.

    Creates a |_SerializedRelationships| collection from *rels_item_xml*,
    dropping relationships that target the bogus 'NULL' reference emitted by
    some generators. An empty collection is returned for a None input.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for rel in parse_xml(rels_item_xml).Relationship_lst:
        # Ignore broken '../NULL' / 'NULL' targets.
        if rel.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, rel))
    return serialized
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
# Logging setup: INFO and above to both the console and Logger.log.
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        # Root logger: all module loggers propagate here.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
}
logging.config.dictConfig(log_config)
# NOTE(review): logger is named "checkCompanyName" although this module checks
# title structure — looks copy-pasted from checkCompanyName.py; confirm.
logger = logging.getLogger("checkCompanyName")
# NOTE(review): hard-coded API key committed to source — rotate it and read it
# from an environment variable instead.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope',  # base_url, also known as api_base
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                )
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from a '<w:outlineLvl w:val="N"/>' element.

    Args:
        inputXml: raw XML of a paragraph or style element known to contain
            a '<w:outlineLvl' tag.

    Returns:
        The level number as a string (e.g. "0" for a level-1 heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape (SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for plain body text.

    None is returned for blank paragraphs and for paragraphs without any
    outline level; otherwise the level is returned as a string ("0" for a
    top-level heading, "1" second level, "2" third level), found either in
    the paragraph's own XML or in its style inheritance chain.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None
    # Outline level set directly on the paragraph takes precedence.
    para_xml = paragraph._p.xml
    if '<w:outlineLvl' in para_xml:
        return getOutlineLevel(para_xml)
    # Otherwise search the style chain (style -> base_style -> ...).
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style
    # Nothing found: ordinary body text.
    return None
#获取文档中 详细设计方案 章节的所有内容
def getDocxToTitleName(docxPath):
    """Return the text of every top-level heading (outline level "0") in *docxPath*.

    Args:
        docxPath: path to a .docx file readable by python-docx.

    Returns:
        list[str]: heading texts in document order.
    """
    document = Document(docxPath)
    words = []
    # Walk every paragraph, keeping only non-empty level-0 headings.
    # (Dead locals copied from a sibling function were removed.)
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            if isTitle(paragraph) == "0":
                words.append(text)
    return words
def checkTitleName(filename):
    """Check the document's top-level headings against the template file
    'ce模板.txt': for each template line, ask the LLM whether a matching
    heading exists and report the ones that don't.

    Generator yielding progress strings and a final markdown report.
    """
    yield '文档结构检查----启动中'
    # One expected heading per template line.
    with open("ce模板.txt", "r",encoding='utf-8') as f:
        gettext = f.readlines()
    count=0
    reserr = []  # template lines with no matching heading
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
        print(e)
        yield "文档无法打开,请检查文档内容"
        return
    for text in gettext:
        count+=1
        # NOTE(review): punctuation appears to have been stripped from this
        # prompt (compare the commented draft in daijian方案.py) — verify the
        # wording; "回到" is likely a typo for "回答".
        prompt = f'''
        \n 这些是文章的标题请问{text}在标题中是否可以配对的若有请指出是哪个标题若没有请回到不存在
        '''
        xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        strword = "\n".join(word)+prompt+xushang
        messages = [{'role': 'user', 'content': [{'text':strword}]}]
        runList = []
        cishu = 0
        for rsp in bot.run(messages):
            runList.append(rsp)
        # Only the last streamed response is complete.
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        print(parsed_data)
        if(parsed_data["answer"]=="不存在"):
            reserr.append(text)
    resInfo="文档结构存在异常:<br>"
    if(len(reserr)>0):
        for i in reserr:
            resInfo+="**"+i.replace('\n','')+"**<br>"
        logger.info(resInfo)
        yield resInfo
    else:
        yield "文档结构未发现异常"

176
daijian方案.py

@ -0,0 +1,176 @@
from docx import Document
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
def load_from_xml_v2(baseURI, rels_item_xml):
    """Patched relationship loader for python-docx.

    Loads a |_SerializedRelationships| collection from *rels_item_xml*,
    filtering out relationships whose target is the invalid 'NULL'
    reference some generators produce. None input yields an empty
    collection.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for rel in parse_xml(rels_item_xml).Relationship_lst:
        # Skip broken '../NULL' / 'NULL' targets.
        if rel.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, rel))
    return serialized
_SerializedRelationships.load_from_xml = load_from_xml_v2
# NOTE(review): hard-coded API key committed to source — rotate it and load it
# from an environment variable instead.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope',  # base_url, also known as api_base
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
# Shared assistant instance used by checkTitleName().
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                )
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from a '<w:outlineLvl w:val="N"/>' element.

    Args:
        inputXml: raw XML of a paragraph or style element known to contain
            a '<w:outlineLvl' tag.

    Returns:
        The level number as a string (e.g. "0" for a level-1 heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape (SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for plain body text.

    Blank paragraphs and paragraphs without an outline level return None;
    otherwise the level comes back as a string ("0" top-level, "1" second,
    "2" third), read from the paragraph XML or its style chain.
    """
    # Blank lines are never headings.
    if not paragraph.text.strip():
        return None
    # Prefer an outline level set directly on the paragraph.
    para_xml = paragraph._p.xml
    if '<w:outlineLvl' in para_xml:
        return getOutlineLevel(para_xml)
    # Fall back to the style inheritance chain.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style
    return None
#获取文档中 详细设计方案 章节的所有内容
def getDocxToTitleName(docxPath):
    """Return the text of every top-level heading (outline level "0") in *docxPath*.

    Args:
        docxPath: path to a .docx file readable by python-docx.

    Returns:
        list[str]: heading texts in document order.
    """
    document = Document(docxPath)
    words = []
    # Keep only non-empty level-0 headings; dead locals copied from a
    # sibling function were removed.
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            if isTitle(paragraph) == "0":
                words.append(text)
    return words
def checkTitleName(filename):
    """Compare the document's top-level headings against the template in
    'ce模板.txt' and report template entries with no matching heading.

    Generator yielding progress strings and a final markdown report.

    NOTE(review): the previous body referenced undefined names (text, count,
    gettext, word) and raised NameError on first call; this restores the
    intended implementation that was kept commented out below it.
    """
    yield '文档结构检查----启动中'
    # One expected heading per template line.
    with open("ce模板.txt", "r",encoding='utf-8') as f:
        gettext = f.readlines()
    count=0
    reserr = []  # template lines with no matching heading
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
        print(e)
        yield "文档无法打开,请检查文档内容"
        return
    for text in gettext:
        count+=1
        prompt = f'''
        \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
        '''
        xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        strword = "\n".join(word)+prompt+xushang
        messages = [{'role': 'user', 'content': [{'text':strword}]}]
        runList = []
        cishu = 0
        for rsp in bot.run(messages):
            runList.append(rsp)
        # Only the last streamed response is complete.
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        print(parsed_data)
        if(parsed_data["answer"]=="不存在"):
            reserr.append(text)
    resInfo="文档结构存在异常:<br>"
    if(len(reserr)>0):
        for i in reserr:
            resInfo+=f"**{i}**<br>"
        yield resInfo
    else:
        yield "文档结构未发现异常"
import logging
# Create a dedicated logger for this script.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.DEBUG)
# Console handler at DEBUG level.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# Attach a formatter to the handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# Register the handler on the logger.
logger.addHandler(ch)
try:
    # Emit one message per severity level (smoke test of the logging setup).
    logger.debug('这是一个调试消息')
    logger.info('这是一个信息消息')
    logger.warning('这是一个警告消息')
    logger.error('这是一个错误消息')
    logger.critical('这是一个致命错误消息')
except Exception as e:
    logger.warning(e)

712
json_repair.py

@ -0,0 +1,712 @@
"""
This module will parse the JSON file following the BNF definition:
<json> ::= <container>
<primitive> ::= <number> | <string> | <boolean>
; Where:
; <number> is a valid real number expressed in one of a number of given formats
; <string> is a string of valid characters enclosed in quotes
; <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
<container> ::= <object> | <array>
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
If something is wrong (a missing parantheses or quotes for example) it will use a few simple heuristics to fix the JSON string:
- Add the missing parentheses if the parser believes that the array or object should be closed
- Quote strings or add missing single quotes
- Adjust whitespaces and remove line breaks
All supported use cases are in the unit tests
"""
import os
import json
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
class StringFileWrapper:
    """Expose a text file descriptor through a read-only, string-like
    indexing interface so the parser can treat files and strings alike."""

    def __init__(self, fd: TextIO) -> None:
        self.fd = fd
        # Total length, computed lazily on the first len() call.
        self.length: int = 0

    def __getitem__(self, index: Union[int, slice]) -> str:
        if isinstance(index, slice):
            # Read the requested window, leaving the cursor at its start.
            self.fd.seek(index.start)
            chunk = self.fd.read(index.stop - index.start)
            self.fd.seek(index.start)
            return chunk
        self.fd.seek(index)
        return self.fd.read(1)

    def __len__(self) -> int:
        if self.length < 1:
            # Remember the cursor, measure the file, then restore the cursor.
            saved = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(saved)
        return self.length
class LoggerConfig:
    """Plain holder for the parser's logging state (no behavior of its own)."""

    def __init__(self, log_level: Optional[str]):
        # Collected log entries appended by the parser.
        self.log: List[Dict[str, str]] = []
        # Number of context characters captured around a logged position.
        self.window: int = 10
        # "none" disables logging entirely.
        self.log_level: str = log_level if log_level else "none"
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
class JSONParser:
    def __init__(
        self,
        json_str: Union[str, StringFileWrapper],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
    ) -> None:
        """Initialize the parser over a string or an open text file.

        Args:
            json_str: the (possibly broken) JSON text to parse.
            json_fd: alternatively, an open file descriptor; when given it
                takes precedence over json_str.
            logging: when truthy, repair decisions are recorded in self.logger.
        """
        # The string to parse
        self.json_str = json_str
        # Alternatively, the file description with a json file in it
        if json_fd:
            # This is a trick we do to treat the file wrapper as an array
            self.json_str = StringFileWrapper(json_fd)
        # Index is our iterator that will keep track of which character we are looking at right now
        self.index: int = 0
        # Stack of parse contexts ("object_key" / "object_value" / "array"),
        # used to manage the special cases of missing quotes in key or value
        self.context: list[str] = []
        # Use this to log the activity, but only if logging is active
        self.logger = LoggerConfig(log_level="info" if logging else None)
    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """Parse the whole input and return the repaired value.

        When more JSON elements follow the first one, they are collected into
        a list. With logging enabled, returns (value, log) instead.
        """
        json = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            json = [json]
            last_index = self.index
            while self.index < len(self.json_str):
                j = self.parse_json()
                if j != "":
                    json.append(j)
                # Guard against an infinite loop when no progress was made.
                if self.index == last_index:
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array
            if len(json) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                json = json[0]
        if self.logger.log_level == "none":
            return json
        else:
            return json, self.logger.log
    def parse_json(
        self,
    ) -> JSONReturnType:
        """Dispatch to the right sub-parser based on the next significant character."""
        while True:
            char = self.get_char_at()
            # This parser will ignore any basic element (string or number) that is not inside an array or object
            is_in_context = len(self.context) > 0
            # False means that we are at the end of the string provided
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # there can be an edge case in which a key is empty and at the end of an object
            # like "key": }. We return an empty string here to close the object properly
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote (or a bare literal)
            # NOTE(review): the "" entry below looks like a lost Unicode-quote
            # character from this source — confirm against the original file.
            elif is_in_context and (char in ['"', "'", ""] or char.isalpha()):
                return self.parse_string()
            # <number> starts with [0-9] or minus
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # If everything else fails, we just ignore and move on
            else:
                self.index += 1
    def parse_object(self) -> Dict[str, Any]:
        """Parse an <object>, repairing missing keys, colons and quotes as needed."""
        # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
        obj = {}
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "}") != "}":
            # This is what we expect to find:
            # <member> ::= <string> ': ' <json>
            # Skip filler whitespaces
            self.skip_whitespaces_at()
            # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                    "info",
                )
                self.index += 1
            # We are now searching for they string key
            # Context is used in the string parser to manage the lack of quotes
            self.set_context("object_key")
            self.skip_whitespaces_at()
            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())
                if key != "" or (key == "" and self.get_char_at() == ":"):
                    # If the string is empty but there is a object divider, we are done here
                    break
            self.skip_whitespaces_at()
            # We reached the end here
            if (self.get_char_at() or "}") == "}":
                continue
            self.skip_whitespaces_at()
            # An extreme case of missing ":" after a key
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                    "info",
                )
            self.index += 1
            self.reset_context()
            self.set_context("object_value")
            # The value can be any valid json
            value = self.parse_json()
            # Reset context since our job is done
            self.reset_context()
            obj[key] = value
            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1
            # Remove trailing spaces
            self.skip_whitespaces_at()
        self.index += 1
        return obj
    def parse_array(self) -> List[Any]:
        """Parse an <array>, tolerating stray '...' entries and a missing ']'."""
        # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
        arr = []
        self.set_context("array")
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()
            # It is possible that parse_json() returns nothing valid, so we stop
            if value == "":
                break
            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it", "info"
                )
            else:
                arr.append(value)
            # skip over whitespace after a value but before closing ]
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()
        # Especially at the end of an LLM generated json you might miss the last "]"
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back", "info"
            )
            self.index -= 1
        self.index += 1
        self.reset_context()
        return arr
    def parse_string(self) -> Union[str, bool, None]:
        """Parse a <string> (or bare literal), repairing missing/doubled quotes.

        Somehow all weird cases in an invalid JSON happen to be resolved in
        this function, so be careful here: the statement order encodes the
        repair heuristics.

        NOTE(review): several empty-string literals below (e.g. in the quote
        lists and the "" comparisons) look like lost Unicode curly-quote
        characters from this source — confirm against the original file.
        """
        # <string> is a string of valid characters enclosed in quotes
        # i.e. { name: "John" }
        # Flag to manage corner cases related to missing starting quote
        missing_quotes = False
        doubled_quotes = False
        lstring_delimiter = rstring_delimiter = '"'
        char = self.get_char_at()
        # A valid string can only start with a valid quote or, in our case, with a literal
        while char and char not in ['"', "'", ""] and not char.isalnum():
            self.index += 1
            char = self.get_char_at()
        if not char:
            # This is an empty string
            return ""
        # Ensuring we use the right delimiter
        if char == "'":
            lstring_delimiter = rstring_delimiter = "'"
        elif char == "":
            lstring_delimiter = ""
            rstring_delimiter = ""
        elif char.isalnum():
            # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
            # But remember, object keys are only of type string
            if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
                value = self.parse_boolean_or_null()
                if value != "":
                    return value
                self.log(
                    "While parsing a string, we found a literal instead of a quote",
                    "info",
                )
            self.log(
                "While parsing a string, we found no starting quote. Will add the quote back",
                "info",
            )
            missing_quotes = True
        if not missing_quotes:
            self.index += 1
        # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
        if self.get_char_at() == lstring_delimiter:
            # If it's an empty key, this was easy
            if self.get_context() == "object_key" and self.get_char_at(1) == ":":
                self.index += 1
                return ""
            # Find the next delimiter
            i = 1
            next_c = self.get_char_at(i)
            while next_c and next_c != rstring_delimiter:
                i += 1
                next_c = self.get_char_at(i)
            # Now check that the next character is also a delimiter to ensure that we have "".....""
            # In that case we ignore this rstring delimiter
            if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a valid starting doubled quote, ignoring it",
                    "info",
                )
                doubled_quotes = True
                self.index += 1
            else:
                # Ok this is not a doubled quote, check if this is an empty string or not
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c.isspace():
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c not in [",", "]", "}"]:
                    self.log(
                        "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                        "info",
                    )
                    self.index += 1
        # Initialize our return value
        string_acc = ""
        # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
        # In that case we need to use the ":|,|}" characters as terminators of the string
        # So this will stop if:
        # * It finds a closing quote
        # * It iterated over the entire sequence
        # * If we are fixing missing quotes in an object, when it finds the special terminators
        char = self.get_char_at()
        while char and char != rstring_delimiter:
            if missing_quotes:
                if self.get_context() == "object_key" and (
                    char == ":" or char.isspace()
                ):
                    self.log(
                        "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                        "info",
                    )
                    break
                elif self.get_context() == "object_value" and char in [",", "}"]:
                    rstring_delimiter_missing = True
                    # check if this is a case in which the closing comma is NOT missing instead
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c != rstring_delimiter:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        i += 1
                        next_c = self.get_char_at(i)
                        # found a delimiter, now we need to check that is followed strictly by a comma or brace
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c and next_c in [",", "}"]:
                            rstring_delimiter_missing = False
                    if rstring_delimiter_missing:
                        self.log(
                            "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                            "info",
                        )
                        break
            string_acc += char
            self.index += 1
            char = self.get_char_at()
            if char and len(string_acc) > 0 and string_acc[-1] == "\\":
                # This is a special case, if people use real strings this might happen
                self.log("Found a stray escape sequence, normalizing it", "info")
                string_acc = string_acc[:-1]
                if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                    escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                    string_acc += escape_seqs.get(char, char) or char
                    self.index += 1
                    char = self.get_char_at()
            # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
            if char == rstring_delimiter:
                # Special case here, in case of double quotes one after another
                if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                    self.log(
                        "While parsing a string, we found a doubled quote, ignoring it",
                        "info",
                    )
                    self.index += 1
                elif missing_quotes and self.get_context() == "object_value":
                    # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        # We found a quote, now let's make sure there's a ":" following
                        i += 1
                        next_c = self.get_char_at(i)
                        # found a delimiter, now we need to check that is followed strictly by a comma or brace
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c and next_c == ":":
                            # Reset the cursor
                            self.index -= 1
                            char = self.get_char_at()
                            self.log(
                                "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                                "info",
                            )
                            break
                else:
                    # Check if eventually there is a rstring delimiter, otherwise we bail
                    i = 1
                    next_c = self.get_char_at(i)
                    check_comma_in_object_value = True
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                        # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                        if check_comma_in_object_value and next_c.isalpha():
                            check_comma_in_object_value = False
                        # If we are in an object context, let's check for the right delimiters
                        if (
                            ("object_key" in self.context and next_c in [":", "}"])
                            or ("object_value" in self.context and next_c == "}")
                            or ("array" in self.context and next_c in ["]", ","])
                            or (
                                check_comma_in_object_value
                                and self.get_context() == "object_value"
                                and next_c == ","
                            )
                        ):
                            break
                        i += 1
                        next_c = self.get_char_at(i)
                    # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                    if next_c == "," and self.get_context() == "object_value":
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != rstring_delimiter:
                            i += 1
                            next_c = self.get_char_at(i)
                        # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c == "}":
                            # OK this is valid then
                            self.log(
                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                                "info",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()
                    elif next_c == rstring_delimiter:
                        if self.get_context() == "object_value":
                            # But this might not be it! This could be just a missing comma
                            # We found a delimiter and we need to check if this is a key
                            # so find a rstring_delimiter and a colon after
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != rstring_delimiter:
                                i += 1
                                next_c = self.get_char_at(i)
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != ":":
                                if next_c in [
                                    lstring_delimiter,
                                    rstring_delimiter,
                                    ",",
                                ]:
                                    break
                                i += 1
                                next_c = self.get_char_at(i)
                            # Only if we fail to find a ':' then we know this is misplaced quote
                            if next_c != ":":
                                self.log(
                                    "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                    "info",
                                )
                                string_acc += str(char)
                                self.index += 1
                                char = self.get_char_at()
        if (
            char
            and missing_quotes
            and self.get_context() == "object_key"
            and char.isspace()
        ):
            self.log(
                "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
                "info",
            )
            self.skip_whitespaces_at()
            if self.get_char_at() not in [":", ","]:
                return ""
        # A fallout of the previous special case in the while loop,
        # we need to update the index only if we had a closing quote
        if char != rstring_delimiter:
            self.log(
                "While parsing a string, we missed the closing quote, ignoring",
                "info",
            )
        else:
            self.index += 1
        return string_acc.rstrip()
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
# <number> is a valid real number expressed in one of a number of given formats
number_str = ""
number_chars = set("0123456789-.eE/,")
char = self.get_char_at()
is_array = self.get_context() == "array"
while char and char in number_chars and (char != "," or not is_array):
number_str += char
self.index += 1
char = self.get_char_at()
if len(number_str) > 1 and number_str[-1] in "-eE/,":
# The number ends with a non valid character for a number/currency, rolling back one
number_str = number_str[:-1]
self.index -= 1
try:
if "," in number_str:
return str(number_str)
if "." in number_str or "e" in number_str or "E" in number_str:
return float(number_str)
elif number_str == "-":
# If there is a stray "-" this will throw an exception, throw away this character
return self.parse_json()
else:
return int(number_str)
except ValueError:
return number_str
def parse_boolean_or_null(self) -> Union[bool, str, None]:
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
starting_index = self.index
char = (self.get_char_at() or "").lower()
value: Optional[Tuple[str, Optional[bool]]]
if char == "t":
value = ("true", True)
elif char == "f":
value = ("false", False)
elif char == "n":
value = ("null", None)
if value:
i = 0
while char and i < len(value[0]) and char == value[0][i]:
i += 1
self.index += 1
char = (self.get_char_at() or "").lower()
if i == len(value[0]):
return value[1]
# If nothing works reset the index before returning
self.index = starting_index
return ""
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
try:
return self.json_str[self.index + count]
except IndexError:
return False
def skip_whitespaces_at(self) -> None:
"""
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
"""
try:
char = self.json_str[self.index]
except IndexError:
return
while char.isspace():
self.index += 1
try:
char = self.json_str[self.index]
except IndexError:
return
def set_context(self, value: str) -> None:
# If a value is provided update the context variable and save in stack
if value:
self.context.append(value)
    def reset_context(self) -> None:
        # Pop the most recently pushed parsing context off the stack.
        self.context.pop()
    def get_context(self) -> str:
        # The current (innermost) parsing context, e.g. "object_key" or "array".
        return self.context[-1]
def log(self, text: str, level: str) -> None:
if level == self.logger.log_level:
context = ""
start = max(self.index - self.logger.window, 0)
end = min(self.index + self.logger.window, len(self.json_str))
context = self.json_str[start:end]
self.logger.log.append(
{
"text": text,
"context": context,
}
)
def repair_json(
    json_str: str = "",
    return_objects: bool = False,
    skip_json_loads: bool = False,
    logging: bool = False,
    json_fd: Optional[TextIO] = None,
    ensure_ascii: bool = True,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Decode a JSON string, repairing it when the strict parse fails.

    Returns the fixed JSON string by default.
    With `return_objects=True` the decoded data structure is returned instead.
    With `skip_json_loads=True` the built-in json.loads() is never attempted.
    With `logging=True` a tuple of (repaired json, list of repair actions) is returned.
    """
    parser = JSONParser(json_str, json_fd, logging)
    if skip_json_loads:
        parsed_json = parser.parse()
    else:
        # Fast path: hand the input to the stdlib first and only fall back
        # to the repairing parser when strict decoding rejects it.
        try:
            if json_fd:
                parsed_json = json.load(json_fd)
            else:
                parsed_json = json.loads(json_str)
        except json.JSONDecodeError:
            parsed_json = parser.parse()
    # Returning the decoded object lets this library serve as a drop-in
    # replacement for the json module.
    if return_objects or logging:
        return parsed_json
    return json.dumps(parsed_json, ensure_ascii=ensure_ascii)
def loads(
    json_str: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Drop-in replacement for `json.loads()` that repairs broken JSON.

    Thin wrapper around `repair_json()` with `return_objects=True`.
    """
    return repair_json(json_str, True, skip_json_loads, logging)
def load(
    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Drop-in replacement for `json.load()` that repairs broken JSON.

    Thin wrapper around `repair_json()` with `json_fd=fd` and
    `return_objects=True`.
    """
    return repair_json("", True, skip_json_loads, logging, fd)
def from_file(
    filename: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Wrapper around `load()` that accepts a filename instead of a file object.

    Uses a context manager so the file is closed even when parsing raises
    (the previous version leaked the descriptor on error).
    """
    with open(filename) as fd:
        return load(fd, skip_json_loads, logging)

161
main.py

@ -0,0 +1,161 @@
from flask import Flask, request, jsonify,Response
import os
from checkPlaceName import checkPlaceName
from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName
from checkDocumentError import getDocumentError
from checkTitleName import checkTitleName
from flask_cors import CORS
import qwen_agenttext
# Flask application serving the document-check SSE endpoints.
app = Flask(__name__)
cros = CORS(app)  # enable cross-origin requests for every route
UPLOAD_FOLDER = 'uploads'
# NOTE(review): usableTag appears unused in this file — confirm before removing.
usableTag=[0,0,0,0,0,0,0,0]
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a single multipart file upload and store it in UPLOAD_FOLDER.

    Returns 400 when the request carries no file part or no usable filename,
    200 on success.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if file:
        # Strip any directory components so a crafted filename such as
        # "../../evil" cannot escape the upload folder (path traversal).
        filename = os.path.basename(file.filename)
        if not filename:
            return jsonify({"error": "No selected file"}), 400
        file.save(os.path.join(UPLOAD_FOLDER, filename))
        return jsonify({"message": "File uploaded successfully"}), 200
    # Previously this path fell through and returned None (HTTP 500).
    return jsonify({"error": "No selected file"}), 400
@app.route('/stream', methods=["GET", "POST"])
def stream_numbers():
    """SSE endpoint: stream the agent's reply for the query in `context`."""
    context = request.args.get('context')
    # Standard SSE headers; proxy buffering is disabled so events flush
    # promptly, and CORS headers allow browser clients from any origin.
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(qwen_agenttext.getxinx(context), headers=headers)
@app.route('/sse/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
    """SSE endpoint: stream progress of the repeated-text check for a file."""
    filename = request.args.get('filename')
    def generate_checkRepeatText(filename):
        # Monotonically increasing SSE event id. The original never
        # incremented it (and shadowed the builtin `id`), so every event
        # was sent as "id: 1".
        event_id = 0
        try:
            for chunk in checkRepeatText(filename):
                event_id += 1
                yield f"id: {event_id}\n"
                yield f"event: checkRepeatText\n"
                yield f"data: {chunk}\n\n"  # push one progress message
        except Exception:
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkRepeatText\n"
            yield f"data: **程序出现异常**\n\n"  # signal failure to the client
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkRepeatText(filename), headers=headers)
@app.route('/sse/checkPlaceName', methods=['GET'])
def checkPlaceNameWebSse():
    """SSE endpoint: stream progress of the place-name check for a file."""
    filename = request.args.get('filename')
    def generate_checkPlaceName(filename):
        # Fix: increment the SSE event id (it was stuck at 1 before, and
        # the old name shadowed the builtin `id`).
        event_id = 0
        for chunk in checkPlaceName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkPlaceName\n"
            yield f"data: {chunk}\n\n"  # push one progress message
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkPlaceName(filename), headers=headers)
@app.route('/sse/checkCompanyName', methods=['GET'])
def checkCompanyNameWebSse():
    """SSE endpoint: stream progress of the company-name check for a file."""
    filename = request.args.get('filename')
    def generate_checkCompanyName(filename):
        # Fix: increment the SSE event id (it was stuck at 1 before).
        event_id = 0
        for chunk in checkCompanyName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkCompanyName\n"
            yield f"data: {chunk}\n\n"  # push one progress message
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkCompanyName(filename), headers=headers)
@app.route('/sse/checkDocumentErrorWeb', methods=['GET'])
def checkDocumentErrorWebSse():
    """SSE endpoint: stream progress of the typo/error check for a file."""
    filename = request.args.get('filename')
    def generate_checkDocumentError(filename):
        # Fix: increment the SSE event id (it was stuck at 1 before).
        event_id = 0
        for chunk in getDocumentError(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: getDocumentError\n"
            yield f"data: {chunk}\n\n"  # push one progress message
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkDocumentError(filename), headers=headers)
@app.route('/sse/checkTitleName', methods=['GET'])
def checkTitleNameWebSse():
    """SSE endpoint: stream progress of the title-name check for a file."""
    filename = request.args.get('filename')
    def generate_checkTitleName(filename):
        # Fix: increment the SSE event id (it was stuck at 1 before).
        event_id = 0
        for chunk in checkTitleName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkTitleName\n"
            yield f"data: {chunk}\n\n"  # push one progress message
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkTitleName(filename), headers=headers)
if __name__ == '__main__':
    # Bind to all interfaces on port 80 (a privileged port — requires root
    # or a reverse proxy in front).
    app.run(host="0.0.0.0",port=80)

132
qwen_agenttext.py

@ -0,0 +1,132 @@
import pprint
import urllib.parse
import json5
from qwen_agent.agents import Assistant
from qwen_agent.tools.base import BaseTool, register_tool
import requests
import baidusearch
import tqdm
# 使用示例
# Step 1 (Optional): Add a custom tool named `my_image_gen`.
@register_tool('my_image_gen')
class MyImageGen(BaseTool):
    """Agent tool: turn an English text description into an image URL."""
    # Shown to the agent so it knows what this tool does.
    description = 'AI painting (image generation) service, input text description, and return the image URL drawn based on text information.'
    # Input schema advertised to the agent.
    parameters = [{
        'name': 'prompt',
        'type': 'string',
        'description': 'Detailed description of the desired image content, in English',
        'required': True
    }]

    def call(self, params: str, **kwargs) -> str:
        # `params` is the JSON argument blob produced by the LLM.
        description = json5.loads(params)['prompt']
        # URL-encode so the description is safe to embed in a request path.
        encoded = urllib.parse.quote(description)
        payload = {'image_url': f'https://image.pollinations.ai/prompt/{encoded}'}
        return json5.dumps(payload, ensure_ascii=False)
@register_tool('chaxun')
class MyImageGen(BaseTool):
    # NOTE(review): this class reuses the name `MyImageGen`, shadowing the
    # image-generation tool above at module level. The framework registers
    # tools by the decorator name ('chaxun'), so it still works, but the
    # class deserves its own name — confirm nothing imports it before renaming.
    # Shown to the agent: "if you don't know the answer, use this tool to search the web".
    description = '如果你不会,请使用此工具进行联网查询'
    # Input schema advertised to the agent.
    parameters = [{
        'name': 'prompt',
        'type': 'string',
        'description': '请你描述需要提问的信息,以此帮助你了解更多的信息',
        'required': True
    }]
    def call(self, params: str, **kwargs) -> str:
        # `params` are the arguments generated by the LLM agent.
        prompt = json5.loads(params)['prompt']
        # URL-encode the query before handing it to the search helper.
        # NOTE(review): baidusearch.search may expect the raw query rather
        # than a URL-encoded one — confirm; encoding here may hurt results.
        prompt = urllib.parse.quote(prompt)
        search_tool = baidusearch.search(prompt, num_results=20)
        print(search_tool)
        # NOTE(review): declared return type is str but this returns whatever
        # baidusearch.search yields — verify against callers.
        return search_tool
# Step 2: Configure the LLM you are using.
# Model configuration: model name, the server hosting it, and/or an api_key.
llm_cfg = {
    # Use the model service provided by DashScope:
    # model: the model name
    # model_server: the server hosting the model
    # api_key: may be set explicitly or taken from the environment
    # SECURITY NOTE(review): a real API key is hard-coded and committed
    # below — rotate it and load it from the DASHSCOPE_API_KEY environment
    # variable instead.
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope', # base_url, also known as api_base
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    # 'api_key': 'YOUR_DASHSCOPE_API_KEY',
    # It will use the `DASHSCOPE_API_KEY' environment variable if 'api_key' is not set here.
    # Use a model service compatible with the OpenAI API, such as vLLM or Ollama:
    # 'model': 'Qwen1.5-7B-Chat',
    # 'model_server': 'http://localhost:8000/v1', # base_url, also known as api_base
    # 'api_key': 'EMPTY',
    # (Optional) LLM hyperparameters for generation:
    'generate_cfg': {
        'top_p': 0.8
    }
}
# Step 3: Create an agent. Here we use the `Assistant` agent as an example, which is capable of using tools and reading files.
# System prompt for the agent (kept in Chinese as it is runtime data).
system_instruction = '''
你是一个乐于助人的助手
收到用户的请求后您应
你应该进行思考判断是否使用工具
如果遇到你不会回答,请使用工具[chaxun]
'''
# Tools the Assistant may call; only the web-search tool is enabled here.
tools = ["chaxun"]  # `code_interpreter` is a built-in tool for executing code.
# Files the assistant could read (disabled).
# files = ['./examples/resource/doc.pdf'] # Give the bot a PDF file to read.
# Instantiate the Assistant used by getxinx() below.
bot = Assistant(llm=llm_cfg,
                system_message=system_instruction,
                function_list=tools,
                # files=files
                )
# Step 4: Run the agent as a chatbot.
messages = []  # This stores the chat history.
def getxinx(context):
    """Run the agent on a single user message, streaming placeholder updates.

    Yields the literal string "请稍等.." once per partial response from the
    bot. NOTE(review): the final answer is never emitted — the streaming
    code that formatted SSE events was commented out in the original;
    confirm whether clients expect real content here.

    Cleanup vs. the original: removed the unused ``event_id`` counter, the
    redundant tuple wrapper around the message dict, and dead commented-out
    code. Behavior toward callers is unchanged.
    """
    history = [{'role': 'user', 'content': context}]
    print(history)
    collected = []  # accumulated partial responses (kept for debugging)
    for partial in bot.run(messages=history):
        collected.append(partial)
        yield "请稍等.."

109
test.py

@ -0,0 +1,109 @@
import time
import json
import math
from flask import Flask,Response,request
from flask_sse import sse
from flask_cors import CORS
import re
import qwen_agenttext
# Experimental Flask app (all routes below are commented out).
app = Flask(__name__)
cros = CORS(app)  # enable CORS on this experimental app
# SSE push helper
# NOTE(review): mid-file import with a stray semicolon — convention is a
# top-of-file import. The device probe's result is discarded; presumably
# this is only a PaddlePaddle availability check — confirm.
import paddle;
paddle.device.get_available_device()
# SSE 推送路由
# @app.route('/register', methods=["GET"])
# def register():
# 获取客户端标识符
# client_id = str(uuid.uuid4())
#
# # 返回 SSE 响应
# return jsonify({"client_id": client_id})
# SSE 推送路由
# @app.route('/sse', methods=['POST'])
# def stream():
# # 获取客户端标识符
# client_id = 1
# print("client_id", client_id)
#
# def aa():
# # 循环发送 SSE 数据
# for i in range(10):
# data = 'Hello, %s!' % client_id + str(i)
# print(data)
# sse.publish(data, channel=client_id, type='message')
# time.sleep(1)
# sse.publish("end", channel=client_id, type='message')
#
# # 返回 SSE 响应
# response = Response(aa(), mimetype='text/event-stream')
# response.headers.add('Cache-Control', 'no-cache')
# response.headers.add('Connection', 'keep-alive')
# response.headers.add('X-Accel-Buffering', 'no')
# return response
#
#
#
# @app.route('/stream' ,methods=["GET", "POST"])
# def stream_numbers():
# context= request.args.get('context')
#
#
# headers = {
# "Content-Type": "text/event-stream",
# "Cache-Control": "no-cache",
# "X-Accel-Buffering": "no",
# "Access-Control-Allow-Origin": "*",
# "Access-Control-Allow-Methods": "GET,POST",
# "Access-Control-Allow-Headers": "x-requested-with,content-type",
# }
# return Response(generate_numbers(),headers=headers)
# def generate_numbers():
# event_id=0
# # for number in range(1, 10):
# # json_data = json.dumps({"number": number})
# # print(json_data)
# # event_id += 1
# # yield f"id: {event_id}\n"
# # yield f"event: time-update\n"
# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送
# json_data = json.dumps({"number": "done"})
# yield f"id: {1}\n"
# yield f"event: time-update\n"
# yield f"data: 34568\n\n" # 发送完成信号
# if __name__ == '__main__':
#
#
# # 读取文件内容
# with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
# gettext = f.read()
# batchNum=20
# sentences = re.split(r'[。\n]', gettext)
# # 去掉空字符
# sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
# # 计算总字符数
# total_chars = len(sentences)
#
# # 计算有多少份
# num_chunks = math.ceil(total_chars / batchNum)
#
# # 按batchNum字为一份进行处理
# chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
#
# # 打印每一份的内容
# for i, chunk in enumerate(chunks):
# print(f"Chunk {i + 1}:")
# print(chunk)
# print("-" * 40)
#
# # 打印总份数
# print(f"Total chunks: {num_chunks}")
# app.run(debug=True,port=80)

BIN
workspace/1.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 420 KiB

BIN
workspace/image14.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 245 KiB

BIN
workspace/image15.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 117 KiB

BIN
workspace/image16.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

BIN
workspace/image17.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

BIN
workspace/image18.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

BIN
workspace/image19.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

BIN
workspace/image20.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

BIN
workspace/tools/code_interpreter/05613c9c-c910-455d-8c8b-62b7dc243b2a.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 211 KiB

BIN
workspace/tools/code_interpreter/1560f103-f2dc-49e3-88c2-35f5d500bc1d.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 916 KiB

BIN
workspace/tools/code_interpreter/4aa3a1fe-7fc2-440f-8bd9-653ee1721776.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 217 KiB

BIN
workspace/tools/code_interpreter/54b7ad57-9c89-4977-b49a-eaf7e60b9656.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 252 KiB

BIN
workspace/tools/code_interpreter/c8cba059-ac85-42b0-b197-1c8e1e7182c9.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 904 KiB

12
workspace/tools/code_interpreter/kernel_connection_file_0eb57682-3a22-44c8-bedb-a4871b813c3c_19796.json

@ -0,0 +1,12 @@
{
"shell_port": 3199,
"iopub_port": 3205,
"stdin_port": 3200,
"control_port": 3201,
"hb_port": 3209,
"ip": "127.0.0.1",
"key": "41711130-ba4287db5e2a6e7b98444c31",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_113f0326-0345-475c-85c1-86af71d668c0_24876.json

@ -0,0 +1,12 @@
{
"shell_port": 36295,
"iopub_port": 36301,
"stdin_port": 36296,
"control_port": 36297,
"hb_port": 36305,
"ip": "127.0.0.1",
"key": "0faec31a-0f91a316abd70cf50f57dbad",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_599899c4-4f00-44c1-bba5-1bcc31eb535c_12240.json

@ -0,0 +1,12 @@
{
"shell_port": 5355,
"iopub_port": 5362,
"stdin_port": 5356,
"control_port": 5358,
"hb_port": 5366,
"ip": "127.0.0.1",
"key": "de89d28a-7beb5da33100363d2c20fd6b",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_a3131ded-afec-43fa-95eb-d2f35548a411_39868.json

@ -0,0 +1,12 @@
{
"shell_port": 3079,
"iopub_port": 3085,
"stdin_port": 3080,
"control_port": 3081,
"hb_port": 3089,
"ip": "127.0.0.1",
"key": "1825b8a3-a33137bc69e3375f26f384a3",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_b4447d65-4542-4bd2-89ff-b33b5fb00ac5_1068.json

@ -0,0 +1,12 @@
{
"shell_port": 36740,
"iopub_port": 36746,
"stdin_port": 36741,
"control_port": 36742,
"hb_port": 36750,
"ip": "127.0.0.1",
"key": "ac6de478-4a3be71d79c2c63da7065148",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_d624f7a6-914d-48c1-b902-4e298f92b671_20484.json

@ -0,0 +1,12 @@
{
"shell_port": 2563,
"iopub_port": 2569,
"stdin_port": 2564,
"control_port": 2565,
"hb_port": 2573,
"ip": "127.0.0.1",
"key": "7e020774-be96933cbe5aaad90c1c9bfc",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_ec74ca73-6455-4a78-96b1-542747f19a25_39260.json

@ -0,0 +1,12 @@
{
"shell_port": 5840,
"iopub_port": 5846,
"stdin_port": 5841,
"control_port": 5842,
"hb_port": 5850,
"ip": "127.0.0.1",
"key": "e4c27d68-1c3a9dfa16551f35481b05b8",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

3
workspace/tools/code_interpreter/launch_kernel_0eb57682-3a22-44c8-bedb-a4871b813c3c_19796.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_113f0326-0345-475c-85c1-86af71d668c0_24876.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_599899c4-4f00-44c1-bba5-1bcc31eb535c_12240.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_a3131ded-afec-43fa-95eb-d2f35548a411_39868.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_b4447d65-4542-4bd2-89ff-b33b5fb00ac5_1068.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_d624f7a6-914d-48c1-b902-4e298f92b671_20484.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_ec74ca73-6455-4a78-96b1-542747f19a25_39260.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

BIN
workspace/tools/code_interpreter/temp_image.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 MiB

1
workspace/tools/doc_parser/53dea512c5e030d7ad12f34dceaecc2a3c5bcb058907ae3495d60e5876b079a2_500

File diff suppressed because one or more lines are too long

8699
workspace/tools/simple_doc_parser/53dea512c5e030d7ad12f34dceaecc2a3c5bcb058907ae3495d60e5876b079a2_ori

File diff suppressed because one or more lines are too long

140
代码段存储.py

@ -0,0 +1,140 @@
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time
import math
# NER pipeline used to tag candidate place/organisation names in the text.
tagTask = Taskflow("ner")
# Prompt appended to the collected names when querying the LLM; asks it to
# label each name in a fixed JSON-like format.
# NOTE(review): the prompt lists four options but says "三种选项" (three) — confirm intent.
prompt='''
.上述文本判断地名是否正确你可以使用工具利用互联网查询你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# prompt='''
# .请回答以上问题,
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# 不做过多的解释,严格按回答格式作答;
# '''
# LLM served from a local OpenAI-compatible endpoint.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
# Assistant instance shared by checkPlaceName() below.
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                # description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
#获取全文内容
def getDocxToTextAll(name):
    """Extract every non-empty paragraph of a .docx file to a text file.

    The paragraphs are joined with newlines and written to
    "checkPlaceName.txt" (UTF-8), which downstream checks read back.

    Cleanup vs. the original: removed the unused locals (levelList,
    addStart, levelText, i).

    Args:
        name: path of the .docx document to read.
    """
    document = Document(name)
    words = []
    # Walk the document paragraph by paragraph, keeping non-blank text.
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            words.append(text)
    print("placeNameTask", len(words))
    # Join all paragraphs into one newline-separated string.
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
#得到全文和地名有关的内容
def placeNameTask(text):
    """Scan *text* with the NER pipeline and collect candidate place names.

    Yields progress strings while working and, as the final item, the
    de-duplicated list of detected organisation / world-region entities.
    """
    batch_size = 20
    # Split on Chinese full stops and newlines, dropping blank fragments.
    pieces = [s.strip() for s in re.split(r'[。\n]', text) if s.strip()]
    total = len(pieces)
    chunk_count = math.ceil(total / batch_size)
    chunks = [pieces[start:start + batch_size] for start in range(0, total, batch_size)]
    found = []
    for idx, chunk in enumerate(chunks):
        yield f"文档地名检查---文档解析进度:{idx + 1}/{chunk_count}"
        print(chunk)
        tags = tagTask(".".join(chunk))
        previous_was_place = False
        for zuhe in tags:
            token, label = zuhe[0], zuhe[1]
            is_place = label.find("组织机构类") >= 0 or label.find("世界地区类") >= 0
            if is_place:
                cleaned = token.replace("\n", "")
                if previous_was_place:
                    # Consecutive entity tokens belong to one name: merge
                    # with the previously collected fragment.
                    found[-1] = found[-1] + cleaned
                else:
                    found.append(cleaned)
                previous_was_place = True
            else:
                previous_was_place = False
        print("-" * 40)
    yield "文档地名检查---文档解析完成"
    # Preserve first-seen order while removing duplicates.
    yield list(dict.fromkeys(found))
#主方法
def checkPlaceName(filename):
    """Top-level place-name audit for a .docx document.

    Streams human-readable progress strings (SSE-friendly) and finally a
    report of suspicious place names, if any.

    Robustness fixes vs. the original: a malformed LLM entry without the
    "回答" key no longer raises KeyError, and a keyword that does not occur
    verbatim in the text no longer raises IndexError.
    """
    yield f"文档地名检查---开始处理文档..."
    getDocxToTextAll(filename)
    with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档地名检查---开始解析文档..."
    final_list = []
    for item in placeNameTask(gettext):
        if isinstance(item, str):
            yield item
        else:
            final_list = item  # the generator's last item is the name list
    propnStr = ",".join(final_list)
    print("placeNameTask", propnStr)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档地名检查---结果生成中..."
    cishu = 0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
        yield "文档地名检查---结果生成中" + '.' * cishu
        cishu += 1
    data = runList[len(runList) - 1][0]["content"]
    print("placeNameTask", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    # Keep only entries the model flagged as wrong; .get guards malformed ones.
    error_places = [place for place in parsed_data
                    if isinstance(place, dict) and place.get('回答') == '错误']
    print("placeNameTask", error_places)
    returnInfo = "发现异常地名<br />"
    if len(error_places) > 0:
        for t in error_places:
            keyword = t.get('placeName', '')
            # Find a paragraph containing the keyword to quote as context.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            # Guard: the keyword may not appear verbatim in the text.
            source_line = paragraphs[0] if paragraphs else keyword
            yuanwen = source_line.replace(keyword, f"**{keyword}**").replace("\n", "")
            returnInfo += "原文:" + yuanwen + "<br />出现异常地名:**" + keyword + "**!请注意" + "<br />"
        yield returnInfo
        print(returnInfo)
    else:
        yield "**未发现发现异常地名**"

118
文档一二级标题识别与提取.py

@ -0,0 +1,118 @@
import re
import time
from docx import Document
from pprint import pprint
# from paddlenlp import Taskflow
#
# similarity = Taskflow("text_similarity", truncation=True, max_length=102400)
def getOutlineLevel(inputXml):
    """Extract the number from the first `<w:outlineLvl w:val="number"/>` tag.

    Args:
        inputXml: raw paragraph/style XML as a string.
    Returns:
        The outline level as a string of digits (e.g. "0" for a level-one
        heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    # Slice out the tag, then pull the first run of digits from it.
    tag = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid-escape-sequence warning "\d" produced.
    return re.search(r"\d+", tag).group()
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for ordinary text.

    "0" means a level-one heading, "1" level-two, "2" level-three.
    """
    # Blank lines can never be headings.
    if not paragraph.text.strip():
        return None
    # Case 1: the outline level is set directly on the paragraph XML.
    xml = paragraph._p.xml
    if '<w:outlineLvl' in xml:
        return getOutlineLevel(xml)
    # Case 2: the level is inherited through the style chain; walk up the
    # base styles until one declares an outline level.
    style = paragraph.style
    while style is not None:
        if '<w:outlineLvl' in style.element.xml:
            return getOutlineLevel(style.element.xml)
        style = style.base_style
    # Neither the paragraph nor any style declares a level.
    return None
def getDocxToText12biaoti(name):
    """Collect numbered level 1-3 headings from a .docx file.

    Headings are numbered "1:", "1.1", "1.1.1" style; level-one headings
    containing "附件" (attachments) are skipped. The numbered list is
    written to "ce1.txt" (UTF-8) and also returned.

    Cleanup vs. the original: removed the unused locals (levelList,
    levelText, i) and replaced the placeholder exception message
    "I know python!" with a meaningful one.

    Raises:
        Exception: when the document contains no recognizable headings.
    """
    document = Document(name)
    words = []
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():  # skip blank paragraphs
            continue
        # isTitle() yields the outline level of the paragraph, if any.
        level = isTitle(paragraph)
        if level == "0":
            firstTitle += 1
            secondTitle = 0
            if text.find("附件") >= 0:
                continue  # skip attachment chapters
            words.append("{}:".format(firstTitle) + text)
        elif level == "1":
            secondTitle += 1
            sanjiTitle = 0
            words.append("{}.{}".format(firstTitle, secondTitle) + text)
        elif level == "2":
            sanjiTitle += 1
            words.append("{}.{}.{}".format(firstTitle, secondTitle, sanjiTitle) + text)
    print(len(words))
    if len(words) == 0:
        raise Exception("no headings recognized in document: {}".format(name))
    text = '\n'.join(words)
    with open("ce1.txt", 'w', encoding="utf-8") as txt_file:
        txt_file.write(text)
    return words
# Scaffolding for comparing the current document's headings against a
# template (the comparison loop below is disabled).
mobanList=[]
dangqianList=[]
errorList =[]
# Read the template headings from a txt file (disabled).
# with open("ce模板.txt", 'r',encoding="utf-8") as txt_file:
#     for i in txt_file:
#         i=re.sub(r'[\t\n]', '', i)
#         mobanList.append(i)
# pprint(mobanList)
# dangqianList=getDocxToText12biaoti("1.docx")
# if len(dangqianList)!=len(mobanList):
#     print("标题数量与模板不一致")
# for num in range(len(mobanList)):
#     moban = mobanList[num]
#     dangqian= dangqianList[num]
#     fenshu=similarity([[dangqian,moban]])
#     pprint(fenshu)
#     if (fenshu[0]["similarity"]<0.85):
#         errorList.append(dangqianList)
# getDocxToText12biaoti("1.docx")
# pprint(errorList)
# Prompt template asking the LLM whether the outline covers a given topic.
prompt = '''{}这是文档大纲,根据大纲分析文档中是否有{}这块内容的描述,若不存在请回答不存在
'''
dagang ="1"
biaozhun="2"
print(prompt.format(dagang, biaozhun))

282
文档图片提取.py

@ -0,0 +1,282 @@
import re
import os
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.parts.image import ImagePart
from qwen_agent.agents import Assistant
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
import shutil
import re
import json_repair
import uuid
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the number from the first `<w:outlineLvl w:val="number"/>` tag.

    Args:
        inputXml: raw paragraph/style XML as a string.
    Returns:
        The outline level as a string of digits (e.g. "0" for a level-one
        heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    # Slice out the tag, then pull the first run of digits from it.
    tag = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid-escape-sequence warning "\d" produced.
    return re.search(r"\d+", tag).group()
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for ordinary text.

    "0" means a level-one heading, "1" level-two, "2" level-three.
    """
    # Blank lines can never be headings.
    if not paragraph.text.strip():
        return None
    # Case 1: the outline level is set directly on the paragraph XML.
    xml = paragraph._p.xml
    if '<w:outlineLvl' in xml:
        return getOutlineLevel(xml)
    # Case 2: the level is inherited through the style chain; walk up the
    # base styles until one declares an outline level.
    style = paragraph.style
    while style is not None:
        if '<w:outlineLvl' in style.element.xml:
            return getOutlineLevel(style.element.xml)
        style = style.base_style
    # Neither the paragraph nor any style declares a level.
    return None
# 该行只能有一个图片
def is_image(graph: Paragraph, doc: Document):
    """Return True when the paragraph embeds at least one real image part."""
    # Resolve each pic element's relationship id to its part, then check type.
    for pic in graph._element.xpath('.//pic:pic'):
        for rel_id in pic.xpath('.//a:blip/@r:embed'):
            if isinstance(doc.part.related_parts[rel_id], ImagePart):
                return True
    return False
# 获取图片(该行只能有一个图片)
def get_ImagePart(graph: Paragraph, doc: Document):
    """Return the first embedded ImagePart of the paragraph, or None."""
    # Resolve each pic element's relationship id to its part, then check type.
    for pic in graph._element.xpath('.//pic:pic'):
        for rel_id in pic.xpath('.//a:blip/@r:embed'):
            candidate = doc.part.related_parts[rel_id]
            if isinstance(candidate, ImagePart):
                return candidate
    return None
# Locate the heading of the target chapter.
def findTitleName(docxPath):
    """Generator: list the level-1 headings of `docxPath` and ask the LLM
    which chapter holds the detailed-design content.

    Yields a progress message first, then either the matching heading text
    or a fixed "not found" message that callers compare against literally.
    """
    yield '文档图片信息检查----检查是否存在详细设计方案'
    document = docx.Document(docxPath)
    # Walk the document paragraph by paragraph, numbering headings per level.
    titleWords = []
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        # Heading level of this paragraph (see isTitle()).
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            level = isTitle(paragraph)
            if level == "0":
                firstTitle += 1
                secondTitle = 0
                # Attachment chapters ("附件") are not candidates.
                if (text.find("附件") >= 0):
                    continue
                # NOTE(review): "一级标题:".format(firstTitle) has no placeholder,
                # so the chapter number is silently dropped — confirm intent.
                titleWords.append("一级标题:".format(firstTitle) + text)
            elif level == "1":
                # Level-2 headings are counted but not sent to the LLM.
                secondTitle += 1
                sanjiTitle = 0
            elif level == "2":
                # Level-3 headings are counted but not sent to the LLM.
                sanjiTitle += 1
    findTitleName_llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
                                  name='Assistant',
                                  )
    # Prompt: pick the most plan-related level-1 heading; the model must answer
    # with one of two fixed JSON shapes ({"name":...,"answer":"存在"/"不存在"}).
    prompt = '''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案详细建设方案为最相关的优先选择
类似设计方案服务方案建设方案为次相关次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个{"name":"一级标题名称","answer":"存在"}{"name":"","answer":"不存在"}不做过多的解释,严格按回答格式作答
'''
    messages = [({'role': 'user', 'content': "\n".join(titleWords) + prompt})]
    runList = []
    for rsp in findTitleName_bot.run(messages):
        runList.append(rsp)
    # The last streamed chunk carries the full answer; strip backticks then
    # repair the (possibly malformed) JSON.
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    print(parsed_data)
    if (parsed_data["answer"] == "存在"):
        print("存在", parsed_data["name"])
        yield parsed_data["name"]
    else:
        print("不存在", parsed_data["name"])
        yield "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"
def saveImage(fileName, titleName, imagePath):
    """Save every image found under the level-1 heading `titleName` into `imagePath`.

    Image files are named "<n>级标题-<heading>_<original name>" after the most
    recent heading preceding the image.

    Args:
        fileName: path of the .docx document.
        titleName: text of the level-1 heading whose images are extracted.
        imagePath: existing directory to write the image files into.
    """
    firstName = ""  # text of the current level-1 heading
    # Fix: initialise so an image appearing before any heading cannot raise
    # UnboundLocalError when building img_name below.
    levelText = ""
    doc = docx.Document(fileName)
    for paragraph in doc.paragraphs:
        text = paragraph.text
        if text.strip():  # non-empty: heading or body text
            level = isTitle(paragraph)
            if level == "0":
                firstName = text
                print(text)
            if level:
                levelText = f"{int(level) + 1}级标题-" + text
        else:
            # Empty text: possibly a table or an inline image.
            r = is_image(paragraph, doc)
            if r and firstName == titleName:
                part = get_ImagePart(paragraph, doc)
                img_name = levelText + "_" + os.path.basename(part.partname)
                with open(f'{imagePath}/{img_name}', "wb") as f:
                    f.write(part.blob)
# After images are saved, upload them to the LLM for analysis.
def checkImageText(filename):
    """Generator: check whether images in the design chapter match the document.

    Yields human-readable progress/result messages. Extracted images are
    written to a per-run temporary directory which is removed at the end.
    """
    llm_cfg_vl = {
        'model': "qwen-vl-max",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    # Vision model: extracts the information contained in each image.
    botImage = Assistant(llm=llm_cfg_vl,
                         name='Assistant',
                         )
    llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    # Text model: decides whether the extracted info is covered by the document.
    bot = Assistant(llm=llm_cfg,
                    name='Assistant',
                    )
    for titleName in findTitleName(filename):
        yield titleName
        if (titleName != "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"):
            yield "文档图片信息检查----文档内容解析中"
            imagePath = "Image" + str(uuid.uuid4())
            os.mkdir(imagePath)
            saveImage(filename, titleName, imagePath)
            imagePathList = os.listdir(imagePath)
            count = 0
            resMap = {}
            for image in imagePathList:
                count += 1
                yield f"文档图片信息检查---当前处理进度{count}/{len(imagePathList)}"
                # Fix: join with the generated directory variable, not the
                # string literal "imagePath" (the literal path never exists).
                outpath = os.path.join(imagePath, image)
                print(outpath)
                messagesImage = [{'role': 'user', "content": [{"image": outpath}, {"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
                runListImage = []
                for rsp in botImage.run(messagesImage):
                    runListImage.append(rsp)
                data = runListImage[len(runListImage) - 1][0]["content"]
                print(str(data))
                prompt = '''
依次上述内容是否与文档有关你只能在[无关有关]选项中选择答案,
按照这样的格式回答[{text内容,"answer":"答案"},{text内容,"answer":"答案"}]不做过多的解释,严格按回答格式作答
'''
                messages = [{'role': 'user', 'content': [{'text': str(data) + prompt}, {"file": filename}]}]
                runList = []
                for rsp in bot.run(messages):
                    runList.append(rsp)
                textdata = runList[len(runList) - 1][0]["content"]
                print(textdata)
                parsed_data = json_repair.loads(textdata)
                print(parsed_data)
                for res in parsed_data:
                    if (res["answer"] == "无关"):
                        # Fix: answer objects carry "text" (per the prompt
                        # format), not "name" — avoid a KeyError in the log.
                        print("无关", res.get("text", ""))
                        prev = resMap.get(image)
                        if prev:
                            # Already recorded for this image: append.
                            resMap[image] = prev + "" + res["text"]
                        else:
                            resMap[image] = res["text"]
            out = ''
            if (len(resMap) > 0):
                # Fix: iterate items() — iterating the dict directly yields
                # only keys and the two-name unpacking raised ValueError.
                for key, value in resMap.items():
                    out += f"{key}图片中,{value}以上内容在文档中未出现相关描述<br>"
                yield out
            else:
                yield "文档图片信息检查----图文符合要求"
            shutil.rmtree(imagePath)
# except Exception as e:
# yield f"文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"
# return
# Ad-hoc manual run: stream progress/result messages for a sample document.
for i in checkImageText("1.docx"):
    print(i)
# import docx
# doc = docx.Document('1.docx')
# dict_rel = doc.part._rels # rels其实是个目录
# for rel in dict_rel:
# rel = dict_rel[rel]
# print("rel", rel.target_ref)
# if "image" in rel.target_ref:
# # create_dir(desc_path)
# img_name = re.findall("/(.*)", rel.target_ref)[0] # windos:/
# print("img_name", img_name)
# word_name = os.path.splitext("1.docx")[0]
# print("word_name", word_name)
# #检查文件路径分隔符(os.sep),并根据不同的操作系统(Windows或Unix/Linux)处理文件名。
# if os.sep in word_name:
# new_name = word_name.split('\\')[-1]
# else:
# new_name = word_name.split('/')[-1]
# img_name = f'{new_name}_{img_name}'
# print(img_name)
# desc_path='workspace'
# with open(f'{desc_path}/{img_name}', "wb") as f:
# f.write(rel.target_part.blob)
# #
# # # prompt='''
# # # .根据上述文本判断,是否为非泛化的公司或组织名称,你可以使用工具利用互联网查询,你只能在[非泛化的公司或组织名称,公益组织,统称,泛化名称,政府单位,机关单位,学校,委员单位]选项中选择答案,回答格式[{“placeName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
# # # '''
# llm_cfg_vl = {
# #'model': 'qwen1.5-72b-chat',qwen2-72b-instruct
# 'model':"qwen-vl-max",
# 'model_server': 'DashScope', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
# }
# botvl = Assistant(llm=llm_cfg_vl,
# name='Assistant',
# # system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"1_image4
# )
# messages = [{'role': 'user', "content": [{"image": "workspace/1.png"},{"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
# runList = []
# for rsp in botvl.run(messages):
# runList.append(rsp)
# print(rsp)
# data = runList[len(runList) - 1][0]["content"]
# print(str(data))

133
服务器文件/checkCompanyName.py

@ -0,0 +1,133 @@
# -*- coding:utf-8 -*-
import time
from docx import Document
from paddlenlp import Taskflow
from qwen_agent.agents import Assistant
import re
import json_repair
wordtag = Taskflow("knowledge_mining")
prompt = '''
.根据上述文本判断是否为具体的公司或组织名称你可以使用工具利用互联网查询
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校行业类型其他]选项中选择答案,
回答格式[{companyName名称,"回答":"答案"}{companyName名称,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
'''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"
)
def getDocxToTextAll(name):
    """Dump all non-empty paragraph text of a .docx into checkCompanyName.txt.

    Args:
        name: path of the .docx document.

    Side effects: overwrites checkCompanyName.txt (UTF-8) in the working
    directory; read back later by read_file_in_batches().
    """
    document = Document(name)
    # Collect every non-blank paragraph in document order
    # (removed the unused levelList/addStart/levelText/i locals).
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    print("checkCompanyName", len(words))
    # Join with newlines and persist for the batch reader.
    text = '\n'.join(words)
    with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
def checkCompanyName(filename):
    """Find concrete company/organisation names in a docx document.

    Returns the list of dicts produced by process_batch (keys include
    'companyName', '回答' and 'yuanwen').
    """
    getDocxToTextAll(filename)  # writes checkCompanyName.txt as a side effect
    start_time = time.time()
    error_places = []
    # Process the dumped text in ~5000-character batches.
    for batch in read_file_in_batches('checkCompanyName.txt'):
        res = process_batch(batch)
        if (len(res) > 0):
            error_places.extend(res)
    print(error_places)
    end_time = time.time()
    # Report wall-clock duration of the whole check.
    elapsed_time = end_time - start_time
    print(f"checkCompanyName程序执行时间: {elapsed_time}")
    return error_places
def read_file_in_batches(file_path, batch_size=5000):
    """Yield the file's text in chunks of at least `batch_size` characters.

    :param file_path: path of the UTF-8 text file to read
    :param batch_size: character threshold at which a chunk is emitted
    :return: generator of text chunks (lines are never split across chunks)
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        pending, size = [], 0
        for row in fh:
            pending.append(row)
            size += len(row)
            # Emit as soon as the accumulated text reaches the threshold.
            if size >= batch_size:
                yield ''.join(pending)
                pending, size = [], 0
        # Flush whatever is left after the final line.
        if pending:
            yield ''.join(pending)
def process_batch(batch):
    """Run NER + LLM verification over one text batch.

    Extracts organisation-name candidates with the paddlenlp knowledge-mining
    tagger, asks the LLM to classify them, and returns the entries classified
    as '具体的公司或组织名称', each annotated with a source paragraph in 'yuanwen'.

    Args:
        batch: one chunk of document text.

    Returns:
        List of result dicts (possibly empty).
    """
    res = wordtag(batch)
    placeList = []
    isplace = False
    for zuhe in res[0]['items']:
        # Merge runs of organisation tokens: if the previous token was an
        # organisation and this one is too, append it to the last entry.
        zhi = zuhe.get("wordtag_label")
        if isplace:
            name = placeList[len(placeList) - 1]
            if zhi.find("组织机构类") >= 0:
                isplace = True
                new_text = zuhe['item'].replace("\n", "")
                placeList[len(placeList) - 1] = name + new_text
                continue
        if zhi.find("组织机构类") >= 0:
            isplace = True
            new_text = zuhe['item'].replace("\n", "")
            placeList.append(new_text)
        else:
            isplace = False
    # De-duplicate while preserving first-seen order.
    placeList = list(dict.fromkeys(placeList))
    placeStr = ",".join(placeList)
    messages = [{'role': 'user', 'content': [{'text': placeStr + prompt}]}]
    print("checkCompanyName", placeStr + prompt)
    runList = []
    for rsp in bot.run(messages):
        runList.append(rsp)
    # Last streamed chunk holds the full answer; strip backticks then repair.
    data = runList[len(runList) - 1][0]["content"]
    print("checkCompanyName", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = [place for place in parsed_data if place['回答'] == '具体的公司或组织名称']
    print("checkCompanyName", error_places)
    for t in error_places:
        keyword = t['companyName']
        # Find a paragraph containing the keyword for context.
        paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', batch)
        # Fix: guard against no match (e.g. keyword only on the final line,
        # which has no trailing newline) instead of raising IndexError.
        t["yuanwen"] = paragraphs[0] if paragraphs else keyword
    return error_places

226
服务器文件/checkDocumentError.py

@ -0,0 +1,226 @@
#-*- coding:utf-8 -*-
# from pycorrector import MacBertCorrector
# m = MacBertCorrector("shibing624/macbert4csc-base-chinese")
from qwen_agent.agents import Assistant
from docx import Document
from pprint import pprint
import re
from paddlenlp import Taskflow
import json
import time
import json_repair
print(json_repair.loads('{"name":""aaaa"}'))
start_time = time.time()
corrector = Taskflow("text_correction")
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
)
# prompt='''
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案,
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# '''
prompt='''
请回答以上问题[]选项中选择答案,原文内容标点符号保持不变如果有错请给出解析没有错则不用给解析
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","jianyi","解析"},{"placeName":"序号","回答":"答案","jianyi","解析"}]不做过多的解释,严格按回答格式作答;
'''
def getDocxToTextAll(name):
    """Dump all non-empty paragraph text of `name` (.docx) into checkDocumentError.txt."""
    docxPath = name
    document = Document(docxPath)
    # Read the document paragraph by paragraph.
    levelList = []     # NOTE(review): unused
    words = []
    addStart = False   # NOTE(review): unused
    levelText = ""     # NOTE(review): unused
    i = 0              # NOTE(review): unused
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # keep only non-blank paragraphs
            words.append(text)
    print("checkDocumentError", len(words))
    # Join paragraphs with newlines and persist for the batch reader.
    text = '\n'.join(words)
    with open("checkDocumentError.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
def getDocumentError(filename):
    """Spell-check a docx document and return the confirmed typo records.

    Returns the list accumulated from process_batch over all text batches.
    """
    getDocxToTextAll(filename)  # writes checkDocumentError.txt
    # Fix: measure from here — the module-level start_time is set at import
    # time, so the reported duration wrongly included unrelated startup work.
    # (Matches the local timing used by checkPlaceName/checkCompanyName.)
    start_time = time.time()
    error_places = []
    # Process the dumped text in ~5000-character batches.
    for batch in read_file_in_batches('checkDocumentError.txt'):
        res = process_batch(batch)
        if (len(res) > 0):
            error_places.extend(res)
    pprint(error_places)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"checkDocumentError程序执行时间: {elapsed_time}")
    return error_places
#
# 过滤掉填充的None(如果有的话)
# chunk = [line for line in chunk if line is not None]
# res = m.correct_batch(sentences)
# print("DocumentError",res)
# lines_with_greeting = [place for place in res if len( place['errors'])>0]
# error_places.extend(lines_with_greeting)
# pprint(error_places)
# if len(lines_with_greeting)>0:
# for t in error_places:
# keyword= t['source']
#
# errorWord=t["errors"]
# # 查找包含关键字的段落
# paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
# t["yuanwen"]=paragraphs[0]
# return error_places
# else:
# return error_places
# return lines_with_greeting
def read_file_in_batches(file_path, batch_size=5000):
    """Yield the file's text in chunks of at least `batch_size` characters.

    :param file_path: path of the UTF-8 text file to read
    :param batch_size: character threshold at which a chunk is emitted
    :return: generator of text chunks (lines are never split across chunks)
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        pending, size = [], 0
        for row in fh:
            pending.append(row)
            size += len(row)
            # Emit as soon as the accumulated text reaches the threshold.
            if size >= batch_size:
                yield ''.join(pending)
                pending, size = [], 0
        # Flush whatever is left after the final line.
        if pending:
            yield ''.join(pending)
def process_batch(batch):
    """Spell-check one text batch and return LLM-confirmed typo records."""
    # Split into sentences on Chinese full stop / newline and drop blanks.
    sentences = re.split(r'[。\n]', batch)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    res = corrector(sentences)
    # Keep only sentences for which the corrector reported candidate errors.
    lines_with_greeting = [place for place in res if len(place['errors']) > 0]
    words = ''
    err = []
    if len(lines_with_greeting) > 0:
        num = 0
        wenti = []          # numbered questions for the LLM
        keyword_list = []   # original sentence per question index
        for t in lines_with_greeting:
            temp_errorWords = []
            keyword = t['source']
            keyword_list.append(keyword)
            # Collect the characters the corrector flagged in this sentence.
            for item in t["errors"]:
                for key, value in item['correction'].items():
                    temp_errorWords.append(key)
            wenti.append("{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
            num += 1
        words = "\n".join(wenti)
        messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
        runList = []
        print(words + prompt)
        for rsp in bot.run(messages):
            runList.append(rsp)
        # Last streamed chunk holds the full answer; strip escapes/backticks.
        data = runList[len(runList) - 1][0]["content"]
        pprint(data)
        parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
        # NOTE(review): the filter compares 回答 to the empty string; a
        # commented-out variant elsewhere used '是' — confirm which is intended.
        # NOTE(review): place["解析"] raises KeyError when the model omits the
        # field (the prompt says it may for correct sentences) — verify.
        err = [
            {**place, "placeName": keyword_list[int(place["placeName"])], "jianyi": place["解析"]}
            for place in parsed_data
            if place['回答'] == ''
        ]
        pprint(err)
    return err
# from flask import Flask, request, jsonify
# import os
# # from checkPlaceName import checkPlaceName
# # from checkRepeatText import checkRepeatText
# # from checkCompanyName import checkCompanyName
# # from documentError import getDocumentError
# app = Flask(__name__)
# UPLOAD_FOLDER = 'uploads'
# if not os.path.exists(UPLOAD_FOLDER):
# os.makedirs(UPLOAD_FOLDER)
# @app.route('/upload', methods=['POST'])
# def upload_file():
# if 'file' not in request.files:
# return jsonify({"error": "No file part"}), 400
# file = request.files['file']
# if file.filename == '':
# return jsonify({"error": "No selected file"}), 400
# if file:
# filename = file.filename
# file.save(os.path.join(UPLOAD_FOLDER,filename))
# return jsonify({"message": "File uploaded successfully"}), 200
# # @app.route('/checkPlaceName/<filename>', methods=['GET'])
# # def checkPlaceNameWeb(filename):
# # return checkPlaceName(filename)
# # @app.route('/checkRepeatText/<filename>', methods=['GET'])
# # def checkRepeatTextWeb(filename):
# # return checkRepeatText(filename)
# # @app.route('/checkCompanyName/<filename>', methods=['GET'])
# # def checkCompanyNameWeb(filename):
# # return checkCompanyName(filename)
# # @app.route('/checkDocumentErrorWeb/<filename>', methods=['GET'])
# # def checkDocumentErrorWeb(filename):
# # return getDocumentError(filename)
# if __name__ == '__main__':
# app.run(host='0.0.0.0',port=80)
# from transformers import AutoTokenizer, AutoModel, GenerationConfig,AutoModelForCausalLM
# import os
# os.environ['NPU_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7'
# os.environ['ASCEND_RT_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7'
# import torch
# import torch_npu
# from torch_npu.contrib import transfer_to_npu
# from accelerate import Accelerator
# # device = 'cpu'
# accelerator = Accelerator()
# # torch_device = "npu" # 0~7
# # torch.npu.set_device(torch.device(torch_device))
# devices = []
# for i in range(8):
# devices.append(f"npu:{i}")
# print(devices)
# torch.npu.set_device(devices)
# torch.npu.set_compile_mode(jit_compile=False)
# model_name_or_path = '/mnt/sdc/qwen/Qwen2-72B-Instruct'
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
# # model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, device_map="auto",torch_dtype=torch.float16)
# model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, device_map=accelerator,torch_dtype=torch.float16).npu().eval()

153
服务器文件/checkPlaceName.py

@ -0,0 +1,153 @@
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time
tagTask = Taskflow("ner")
prompt='''
.上述文本判断地名是否正确你可以使用工具利用互联网查询你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# prompt='''
# .请回答以上问题,
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# 不做过多的解释,严格按回答格式作答;
# '''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
)
# Dump the full document text.
def getDocxToTextAll(name):
    """Dump all non-empty paragraph text of `name` (.docx) into checkPlaceName.txt."""
    docxPath = name
    document = Document(docxPath)
    # Read the document paragraph by paragraph.
    levelList = []     # NOTE(review): unused
    words = []
    addStart = False   # NOTE(review): unused
    levelText = ""     # NOTE(review): unused
    i = 0              # NOTE(review): unused
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # keep only non-blank paragraphs
            words.append(text)
    print("placeNameTask", len(words))
    # Join paragraphs with newlines and persist for the batch reader.
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
# Extract the place-name related content from the text.
def placeNameTask(text):
    """Extract place/organisation name candidates from `text` via paddlenlp NER.

    Consecutive tokens tagged 组织机构类 or 世界地区类 are merged into one
    name; the result is de-duplicated preserving first-seen order.
    """
    res = tagTask(text)
    print(res)
    placeList = []
    isplace = False
    for zuhe in res:
        # If the previous token was an entity and this one is too, merge them
        # into the last stored name instead of starting a new one.
        if isplace:
            name = placeList[len(placeList) - 1]
            if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:
                isplace = True
                new_text = zuhe[0].replace("\n", "")
                placeList[len(placeList) - 1] = name + new_text
                continue
        if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:
            isplace = True
            new_text = zuhe[0].replace("\n", "")
            placeList.append(new_text)
        else:
            isplace = False
    # De-duplicate while keeping first-seen order.
    placeList = list(dict.fromkeys(placeList))
    return placeList
# Entry point for the place-name check.
def checkPlaceName(filename):
    """Check all place names in a docx; returns the entries judged 错误 (wrong).

    Writes checkPlaceName.txt as a side effect and processes it in batches.
    """
    getDocxToTextAll(filename)
    start_time = time.time()
    error_places = []
    # Process the dumped text in ~5000-character batches.
    for batch in read_file_in_batches('checkPlaceName.txt'):
        res = process_batch(batch)
        if (len(res) > 0):
            error_places.extend(res)
    pprint(error_places)
    end_time = time.time()
    # Report wall-clock duration of the check.
    elapsed_time = end_time - start_time
    print(f"checkPlaceName程序执行时间: {elapsed_time}")
    return error_places
def read_file_in_batches(file_path, batch_size=5000):
    """Yield the file's text in chunks of at least `batch_size` characters.

    :param file_path: path of the UTF-8 text file to read
    :param batch_size: character threshold at which a chunk is emitted
    :return: generator of text chunks (lines are never split across chunks)
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        pending, size = [], 0
        for row in fh:
            pending.append(row)
            size += len(row)
            # Emit as soon as the accumulated text reaches the threshold.
            if size >= batch_size:
                yield ''.join(pending)
                pending, size = [], 0
        # Flush whatever is left after the final line.
        if pending:
            yield ''.join(pending)
def process_batch(batch):
    """Verify the place names found in one text batch with the LLM.

    Args:
        batch: one chunk of document text.

    Returns:
        The entries the LLM marked 错误 (wrong), each annotated with a
        containing paragraph in 'yuanwen'. Possibly empty.
    """
    propnList = placeNameTask(batch)
    propnStr = ",".join(propnList)
    print("placeNameTask", propnStr)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    for rsp in bot.run(messages):
        runList.append(rsp)
    # Last streamed chunk holds the full answer; strip backticks then repair.
    data = runList[len(runList) - 1][0]["content"]
    print("placeNameTask", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    for item in parsed_data:
        # Fix: tolerate malformed model output (missing keys) instead of
        # raising KeyError while logging.
        print(f"地名: {item.get('placeName')}, 回答: {item.get('回答')}")
    # Keep only the entries the model judged wrong.
    error_places = [place for place in parsed_data if place.get('回答') == '错误']
    print("placeNameTask", error_places)
    for t in error_places:
        keyword = t['placeName']
        # Find a paragraph containing the keyword for context.
        paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', batch)
        # Fix: guard against no match (keyword only on the final, newline-less
        # line) instead of raising IndexError.
        t["yuanwen"] = paragraphs[0] if paragraphs else keyword
    return error_places

160
服务器文件/checkRepeatText.py

@ -0,0 +1,160 @@
import uuid
from langchain_chroma import Chroma
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from paddlenlp import Taskflow
similarity = Taskflow("text_similarity" , truncation=True,max_length=102400)
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
vector_store_path="vector_store"
vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
import re
import time
from docx import Document
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from a '<w:outlineLvl w:val="number"/>' tag.

    Args:
        inputXml: XML string of a paragraph or style element that contains
            a <w:outlineLvl> tag.

    Returns:
        The level number as a string (e.g. "0" for a first-level heading).

    Note: callers must ensure the tag is present; otherwise re.search
    returns None and .group() raises AttributeError.
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    tag = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid escape sequence "\d+" (SyntaxWarning on 3.12+).
    return re.search(r"\d+", tag).group()
def isTitle(paragraph):
    """Return the outline level of a paragraph, or None for plain body text.

    Returns:
        None for empty paragraphs or text without an outline level,
        "0" for level-1 headings, "1" for level-2, "2" for level-3, etc.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None
    # Outline level set directly on the paragraph's own XML.
    xml = paragraph._p.xml
    if '<w:outlineLvl' in xml:
        return getOutlineLevel(xml)
    # Otherwise walk the style chain (style -> base_style -> ...) for one.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style
    # No outline level found anywhere: ordinary body text.
    return None
# Collect every paragraph inside the "detailed design" chapter of the document.
def getDocxToText(docxPath, titleName):
    """Collect the paragraphs under the level-1 heading `titleName`, persist
    them to checkRepeatText.txt, and index them into the Chroma vector store.

    Returns:
        (words, uuids): the numbered paragraph strings and the vector-store
        ids that were added (callers must delete these afterwards).

    Raises:
        Exception: when no matching paragraph text was collected.
    """
    document = Document(docxPath)
    levelList = []
    words = []
    addStart = False  # True while we are inside the target chapter
    levelText = ""
    i = 0
    for paragraph in document.paragraphs:
        # Heading level of this paragraph (see isTitle()).
        text = paragraph.text
        if text.strip():  # non-empty only
            print("非空")
            if titleName:
                level = isTitle(paragraph)
                # Leaving the chapter: the next level-1 heading ends collection.
                if (addStart and level == "0"):
                    addStart = False
                # Entering the chapter: a level-1 heading containing titleName.
                if (level == "0" and text.find(titleName) >= 0):
                    addStart = True
                if level:
                    levelList.append("{}".format(level) + paragraph.text)
                    levelText = text
                else:
                    if addStart:
                        # NOTE(review): the first startswith argument displays
                        # as an empty string — the intended prefix character
                        # appears lost; confirm against the original file.
                        if (text.startswith("") or text.startswith("注:")):
                            continue
                        i = i + 1
                        # Number each collected body paragraph.
                        words.append("{}个段落:".format(i) + text)
            else:
                words.append(text)
    print("checkRepeatText", len(words))
    if len(words) == 0:
        # NOTE(review): placeholder message — nothing was collected.
        raise Exception("I know python!")
    text = '\n'.join(words)
    # NOTE(review): no encoding given (platform default); the TextLoader below
    # must be able to read this back — consider encoding='utf-8'.
    with open("checkRepeatText.txt", 'w', ) as txt_file:
        txt_file.write(text)
    time.sleep(3)
    loader = TextLoader(file_path='checkRepeatText.txt')
    docs = loader.load()
    # Split into overlapping ~50-char chunks on blank lines / newlines.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10, add_start_index=True,
                                                   separators=["\n\n", "\n"])
    splits = text_splitter.split_documents(docs)
    uuids = []
    print(len(splits))
    # One fresh id per chunk so the vectors can be deleted afterwards.
    for i in range(len(splits)):
        uuids.append(str(uuid.uuid4()))
    print(len(uuids))
    vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
    vectorstore.add_documents(documents=splits, ids=uuids)
    # Poll until the store is queryable (indexing is asynchronous).
    while True:
        time.sleep(0.3)
        ress = vectorstore.similarity_search(words[0])
        if (len(ress) > 0):
            break
    return words, uuids
# (disabled) Flask route: @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename, titleName):
    """Find near-duplicate paragraphs inside the `titleName` chapter.

    Each paragraph is searched against the vector store; candidate pairs with
    text_similarity > 0.95 are collected as {"yuanwen1": ..., "yuanwen2": ...}.
    The temporarily indexed vectors are always removed in `finally`.
    """
    words, uuids = getDocxToText(filename, titleName)
    try:
        reslist = []
        count = 0
        for i in words:
            count += 1
            result = vectorstore.similarity_search(i)
            # NOTE(review): the split/find separators below display as empty
            # strings — the original delimiter (likely "、" or ":") appears
            # lost in this copy; an empty separator would raise ValueError.
            textTag = i.split("")[0]
            print(i)
            for content in result:
                text = content.page_content
                tag = text.split("")[0].replace('\n', '')
                # Skip hits that come from the same numbered paragraph.
                if (textTag.find(tag) >= 0):
                    continue
                res = similarity([[i[i.find('') + 1:], text[text.find('') + 1:]]])
                print(res[0]["similarity"])
                if (res[0]["similarity"] > 0.95):
                    # Record the pair unless this paragraph is already stored.
                    if (len(reslist) > 0):
                        isExist = False
                        for neirong in reslist:
                            if i[i.find('') + 1:] in neirong.values():
                                isExist = True
                                break
                        if not isExist:
                            reslist.append({"yuanwen1": i[i.find('') + 1:], "yuanwen2": text[text.find('') + 1:]})
                            print(reslist)
                    else:
                        reslist.append({"yuanwen1": i[i.find('') + 1:], "yuanwen2": text[text.find('') + 1:]})
                    print(i.split("")[1] + "\n" + text.split("")[1])
    except Exception as e:
        print("发生异常:", e)
    finally:
        # Always clean the temporary vectors out of the store.
        vectorstore.delete(ids=uuids)
        print("已删除")
    print(reslist)
    return reslist

712
服务器文件/json_repair.py

@ -0,0 +1,712 @@
"""
This module will parse the JSON file following the BNF definition:
<json> ::= <container>
<primitive> ::= <number> | <string> | <boolean>
; Where:
; <number> is a valid real number expressed in one of a number of given formats
; <string> is a string of valid characters enclosed in quotes
; <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
<container> ::= <object> | <array>
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
If something is wrong (a missing parantheses or quotes for example) it will use a few simple heuristics to fix the JSON string:
- Add the missing parentheses if the parser believes that the array or object should be closed
- Quote strings or add missing single quotes
- Adjust whitespaces and remove line breaks
All supported use cases are in the unit tests
"""
import os
import json
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
class StringFileWrapper:
    # Wraps a text file descriptor so it can be indexed like a string,
    # letting the parser treat files and in-memory strings uniformly.
    def __init__(self, fd: TextIO) -> None:
        # The underlying text file object.
        self.fd = fd
        # Cached file length; 0 means "not computed yet" (see __len__).
        self.length: int = 0

    def __getitem__(self, index: Union[int, slice]) -> str:
        """Read one character (int index) or a range (slice) from the file."""
        if isinstance(index, slice):
            # NOTE(review): assumes a plain slice with non-None start/stop and
            # no step; the position is restored to the slice start afterwards.
            self.fd.seek(index.start)
            value = self.fd.read(index.stop - index.start)
            self.fd.seek(index.start)
            return value
        else:
            self.fd.seek(index)
            return self.fd.read(1)

    def __len__(self) -> int:
        """Total file length in characters, computed once and cached."""
        if self.length < 1:
            # Remember the position, seek to EOF to measure, then restore.
            current_position = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(current_position)
        return self.length
class LoggerConfig:
    """Holds the parser's logging state: entries, context window, and level."""

    def __init__(self, log_level: Optional[str]):
        # Collected log entries (message + context snippets).
        self.log: List[Dict[str, str]] = []
        # Number of context characters captured around each logged position.
        self.window: int = 10
        # "none" disables logging entirely.
        self.log_level: str = log_level or "none"
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
class JSONParser:
    def __init__(
        self,
        json_str: Union[str, StringFileWrapper],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
    ) -> None:
        # The string to parse.
        self.json_str = json_str
        # Alternatively, a file descriptor with the json in it; wrapping it
        # lets the rest of the parser index it exactly like a string.
        if json_fd:
            self.json_str = StringFileWrapper(json_fd)
        # Iterator index: which character we are looking at right now.
        self.index: int = 0
        # Parsing-context stack ("object_key" / "object_value" / "array"),
        # used to handle the special cases of missing quotes in keys/values.
        self.context: list[str] = []
        # Activity log; only populated when logging is active.
        self.logger = LoggerConfig(log_level="info" if logging else None)
    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """Parse the whole input; return the value (plus the log when logging is on)."""
        json = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            # More content follows the first value: collect every remaining
            # top-level element into a list.
            json = [json]
            last_index = self.index
            while self.index < len(self.json_str):
                j = self.parse_json()
                if j != "":
                    json.append(j)
                if self.index == last_index:
                    # Nothing was consumed: advance one character so the
                    # loop cannot spin forever on unparseable input.
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array.
            if len(json) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                json = json[0]
        if self.logger.log_level == "none":
            return json
        else:
            return json, self.logger.log
    def parse_json(
        self,
    ) -> JSONReturnType:
        """Parse and return the next JSON value starting at self.index."""
        while True:
            char = self.get_char_at()
            # Basic elements (string/number) are only parsed when we are
            # inside an array or object; bare ones at top level are ignored.
            is_in_context = len(self.context) > 0
            # False means that we are at the end of the string provided.
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # Edge case: an empty key at the end of an object, like "key": }.
            # Return an empty string so the object closes properly.
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote (or a bare literal when quotes are missing)
            elif is_in_context and (char in ['"', "'", ""] or char.isalpha()):
                return self.parse_string()
            # <number> starts with [0-9], minus, or a dot
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # If everything else fails, we just ignore and move on.
            else:
                self.index += 1
    def parse_object(self) -> Dict[str, Any]:
        # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; a sequence of members
        obj = {}
        # Stop at the closing brace or when the whole input is consumed
        # (get_char_at() returns False at EOF, which the "}" default absorbs).
        while (self.get_char_at() or "}") != "}":
            # Expected next: <member> ::= <string> ': ' <json>
            # Skip filler whitespace.
            self.skip_whitespaces_at()
            # LLM quirk: a ":" appearing before any key is skipped over.
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                    "info",
                )
                self.index += 1
            # Now searching for the string key; the context tells the string
            # parser how to cope with missing quotes.
            self.set_context("object_key")
            self.skip_whitespaces_at()
            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())
                # An empty key is acceptable only if the divider follows.
                if key != "" or (key == "" and self.get_char_at() == ":"):
                    break
            self.skip_whitespaces_at()
            # The object ended right after the key: nothing more to add.
            if (self.get_char_at() or "}") == "}":
                continue
            self.skip_whitespaces_at()
            # An extreme case of missing ":" after a key — log and carry on.
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                    "info",
                )
            self.index += 1
            self.reset_context()
            self.set_context("object_value")
            # The value can be any valid json.
            value = self.parse_json()
            # Reset context since our job is done.
            self.reset_context()
            obj[key] = value
            # Consume the member separator (or a stray quote) if present.
            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1
            # Remove trailing spaces.
            self.skip_whitespaces_at()
        # Step over the closing "}".
        self.index += 1
        return obj
    def parse_array(self) -> List[Any]:
        # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; values separated by commas
        arr = []
        self.set_context("array")
        # Stop at the closing bracket or when the whole input is consumed
        # (get_char_at() returns False at EOF, which the "]" default absorbs).
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()
            # parse_json() may return nothing valid; stop the array then.
            if value == "":
                break
            # LLM quirk: a stray "..." element is logged and dropped.
            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it", "info"
                )
            else:
                arr.append(value)
            # Skip whitespace and commas after a value, before the closing ].
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()
        # Especially at the end of an LLM-generated json the final "]" may be
        # missing; log it and step back so the net advance below is zero.
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back", "info"
            )
            self.index -= 1
        self.index += 1
        self.reset_context()
        return arr
def parse_string(self) -> Union[str, bool, None]:
    """Parse a <string> (or a literal reached from string context).

    <string> is a string of valid characters enclosed in quotes, i.e. { name: "John" }.
    Somehow all weird cases in an invalid JSON happen to be resolved in this
    function, so be careful here: it also repairs missing/doubled/misplaced
    quotes and stray escape sequences.

    Fix: the smart-quote delimiters had been mangled into empty strings
    ('char == ""' can never match a real character, making that branch dead);
    restored the Unicode left/right double quotes so LLM output using
    typographic quotes is handled again.
    """
    # Flag to manage corner cases related to missing starting quote
    missing_quotes = False
    doubled_quotes = False
    lstring_delimiter = rstring_delimiter = '"'
    char = self.get_char_at()
    # A valid string can only start with a valid quote or, in our case, with a literal
    while char and char not in ['"', "'", "“"] and not char.isalnum():
        self.index += 1
        char = self.get_char_at()
    if not char:
        # This is an empty string
        return ""
    # Ensuring we use the right delimiter
    if char == "'":
        lstring_delimiter = rstring_delimiter = "'"
    elif char == "“":
        # Typographic ("smart") quotes open with “ and close with ”
        lstring_delimiter = "“"
        rstring_delimiter = "”"
    elif char.isalnum():
        # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
        # But remember, object keys are only of type string
        if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
            value = self.parse_boolean_or_null()
            if value != "":
                return value
            self.log(
                "While parsing a string, we found a literal instead of a quote",
                "info",
            )
        self.log(
            "While parsing a string, we found no starting quote. Will add the quote back",
            "info",
        )
        missing_quotes = True
    if not missing_quotes:
        self.index += 1
    # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
    if self.get_char_at() == lstring_delimiter:
        # If it's an empty key, this was easy
        if self.get_context() == "object_key" and self.get_char_at(1) == ":":
            self.index += 1
            return ""
        # Find the next delimiter
        i = 1
        next_c = self.get_char_at(i)
        while next_c and next_c != rstring_delimiter:
            i += 1
            next_c = self.get_char_at(i)
        # Now check that the next character is also a delimiter to ensure that we have "".....""
        # In that case we ignore this rstring delimiter
        if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
            self.log(
                "While parsing a string, we found a valid starting doubled quote, ignoring it",
                "info",
            )
            doubled_quotes = True
            self.index += 1
        else:
            # Ok this is not a doubled quote, check if this is an empty string or not
            i = 1
            next_c = self.get_char_at(i)
            while next_c and next_c.isspace():
                i += 1
                next_c = self.get_char_at(i)
            if next_c not in [",", "]", "}"]:
                self.log(
                    "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                    "info",
                )
                self.index += 1
    # Initialize our return value
    string_acc = ""
    # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
    # In that case we need to use the ":|,|}" characters as terminators of the string
    # So this will stop if:
    # * It finds a closing quote
    # * It iterated over the entire sequence
    # * If we are fixing missing quotes in an object, when it finds the special terminators
    char = self.get_char_at()
    while char and char != rstring_delimiter:
        if missing_quotes:
            if self.get_context() == "object_key" and (
                char == ":" or char.isspace()
            ):
                self.log(
                    "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                    "info",
                )
                break
            elif self.get_context() == "object_value" and char in [",", "}"]:
                rstring_delimiter_missing = True
                # check if this is a case in which the closing comma is NOT missing instead
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c != rstring_delimiter:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    i += 1
                    next_c = self.get_char_at(i)
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c and next_c in [",", "}"]:
                        rstring_delimiter_missing = False
                if rstring_delimiter_missing:
                    self.log(
                        "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                        "info",
                    )
                    break
        string_acc += char
        self.index += 1
        char = self.get_char_at()
        if char and len(string_acc) > 0 and string_acc[-1] == "\\":
            # This is a special case, if people use real strings this might happen
            self.log("Found a stray escape sequence, normalizing it", "info")
            string_acc = string_acc[:-1]
            if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                string_acc += escape_seqs.get(char, char) or char
                self.index += 1
                char = self.get_char_at()
        # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
        if char == rstring_delimiter:
            # Special case here, in case of double quotes one after another
            if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a doubled quote, ignoring it",
                    "info",
                )
                self.index += 1
            elif missing_quotes and self.get_context() == "object_value":
                # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    # We found a quote, now let's make sure there's a ":" following
                    i += 1
                    next_c = self.get_char_at(i)
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c and next_c == ":":
                        # Reset the cursor
                        self.index -= 1
                        char = self.get_char_at()
                        self.log(
                            "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                            "info",
                        )
                        break
            else:
                # Check if eventually there is a rstring delimiter, otherwise we bail
                i = 1
                next_c = self.get_char_at(i)
                check_comma_in_object_value = True
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                    # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                    if check_comma_in_object_value and next_c.isalpha():
                        check_comma_in_object_value = False
                    # If we are in an object context, let's check for the right delimiters
                    if (
                        ("object_key" in self.context and next_c in [":", "}"])
                        or ("object_value" in self.context and next_c == "}")
                        or ("array" in self.context and next_c in ["]", ","])
                        or (
                            check_comma_in_object_value
                            and self.get_context() == "object_value"
                            and next_c == ","
                        )
                    ):
                        break
                    i += 1
                    next_c = self.get_char_at(i)
                # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                if next_c == "," and self.get_context() == "object_value":
                    i += 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c != rstring_delimiter:
                        i += 1
                        next_c = self.get_char_at(i)
                    # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                    i += 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c == "}":
                        # OK this is valid then
                        self.log(
                            "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                            "info",
                        )
                        string_acc += str(char)
                        self.index += 1
                        char = self.get_char_at()
                elif next_c == rstring_delimiter:
                    if self.get_context() == "object_value":
                        # But this might not be it! This could be just a missing comma
                        # We found a delimiter and we need to check if this is a key
                        # so find a rstring_delimiter and a colon after
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != rstring_delimiter:
                            i += 1
                            next_c = self.get_char_at(i)
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != ":":
                            if next_c in [
                                lstring_delimiter,
                                rstring_delimiter,
                                ",",
                            ]:
                                break
                            i += 1
                            next_c = self.get_char_at(i)
                        # Only if we fail to find a ':' then we know this is misplaced quote
                        if next_c != ":":
                            self.log(
                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                "info",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()
    if (
        char
        and missing_quotes
        and self.get_context() == "object_key"
        and char.isspace()
    ):
        self.log(
            "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
            "info",
        )
        self.skip_whitespaces_at()
        if self.get_char_at() not in [":", ","]:
            return ""
    # A fallout of the previous special case in the while loop,
    # we need to update the index only if we had a closing quote
    if char != rstring_delimiter:
        self.log(
            "While parsing a string, we missed the closing quote, ignoring",
            "info",
        )
    else:
        self.index += 1
    return string_acc.rstrip()
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
    """Parse a <number>: a real number in one of several accepted formats.

    Commas stop the scan inside arrays (they are separators there) but are
    kept elsewhere so currency-style values like "1,234" survive as strings.
    """
    allowed = set("0123456789-.eE/,")
    in_array = self.get_context() == "array"
    acc = ""
    ch = self.get_char_at()
    while ch and ch in allowed and not (in_array and ch == ","):
        acc += ch
        self.index += 1
        ch = self.get_char_at()
    if len(acc) > 1 and acc[-1] in "-eE/,":
        # The number ends with a non valid character for a number/currency,
        # rolling back one
        acc = acc[:-1]
        self.index -= 1
    try:
        if "," in acc:
            # Comma-grouped values are not valid JSON numbers; keep as text
            return str(acc)
        if "." in acc or "e" in acc or "E" in acc:
            return float(acc)
        if acc == "-":
            # A stray "-" is not a number; throw it away and re-parse
            return self.parse_json()
        return int(acc)
    except ValueError:
        return acc
def parse_boolean_or_null(self) -> Union[bool, str, None]:
    """Parse one of the literal strings 'true', 'false' or 'null' (unquoted).

    Returns the corresponding Python value on a full match, or "" (with the
    cursor reset) when no literal is present.

    Fix: `value` was only annotated, never assigned, so any character other
    than t/f/n made `if value:` raise UnboundLocalError instead of falling
    through to the "" return. Initialize it to None.
    """
    starting_index = self.index
    char = (self.get_char_at() or "").lower()
    value: Optional[Tuple[str, Optional[bool]]] = None
    if char == "t":
        value = ("true", True)
    elif char == "f":
        value = ("false", False)
    elif char == "n":
        value = ("null", None)
    if value:
        i = 0
        # Advance as long as the input keeps matching the expected literal
        while char and i < len(value[0]) and char == value[0][i]:
            i += 1
            self.index += 1
            char = (self.get_char_at() or "").lower()
        if i == len(value[0]):
            return value[1]
    # If nothing works reset the index before returning
    self.index = starting_index
    return ""
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
    """Return the character at offset `count` from the cursor, or False past the end."""
    # EAFP on purpose: in CPython a try/except around the indexing is cheaper
    # than an explicit bounds check because the in-bounds case dominates.
    pos = self.index + count
    try:
        return self.json_str[pos]
    except IndexError:
        return False
def skip_whitespaces_at(self) -> None:
    """Advance the cursor past any run of whitespace characters.

    Syntactic sugar used throughout the parser; stops silently at end of input.
    """
    while True:
        try:
            if not self.json_str[self.index].isspace():
                return
        except IndexError:
            # Ran off the end of the input; nothing left to skip
            return
        self.index += 1
def set_context(self, value: str) -> None:
    """Push a parsing context onto the stack; empty values are ignored."""
    if not value:
        return
    self.context.append(value)
def reset_context(self) -> None:
    """Discard the parsing context most recently pushed by set_context()."""
    del self.context[-1]
def get_context(self) -> str:
    """Return the parsing context currently on top of the stack."""
    return self.context[-1]
def log(self, text: str, level: str) -> None:
    """Append a repair note plus a window of surrounding input to the logger.

    Entries are recorded only when `level` matches the logger's configured
    log level; each entry carries the message and a slice of json_str
    centered on the current cursor position.
    """
    if level != self.logger.log_level:
        return
    window = self.logger.window
    start = max(self.index - window, 0)
    end = min(self.index + window, len(self.json_str))
    self.logger.log.append(
        {"text": text, "context": self.json_str[start:end]}
    )
def repair_json(
    json_str: str = "",
    return_objects: bool = False,
    skip_json_loads: bool = False,
    logging: bool = False,
    json_fd: Optional[TextIO] = None,
    ensure_ascii: bool = True,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
    It will return the fixed string by default.
    When `return_objects=True` is passed, it will return the decoded data structure instead.
    When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
    When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
    """
    parser = JSONParser(json_str, json_fd, logging)
    if skip_json_loads:
        parsed = parser.parse()
    else:
        # Fast path: try the stdlib decoder first; only repair on failure
        try:
            parsed = json.load(json_fd) if json_fd else json.loads(json_str)
        except json.JSONDecodeError:
            parsed = parser.parse()
    # Returning the decoded object makes this library usable as a drop-in
    # replacement for the standard json module
    if return_objects or logging:
        return parsed
    return json.dumps(parsed, ensure_ascii=ensure_ascii)
def loads(
    json_str: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Drop-in analogue of `json.loads()` that repairs broken JSON on the way.

    Thin convenience wrapper: delegates to `repair_json()` with
    `return_objects=True` so the decoded structure is returned.
    """
    return repair_json(
        json_str=json_str,
        return_objects=True,
        skip_json_loads=skip_json_loads,
        logging=logging,
    )
def load(
    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Drop-in analogue of `json.load()` that repairs broken JSON on the way.

    Thin convenience wrapper: delegates to `repair_json()` with `json_fd=fd`
    and `return_objects=True` so the decoded structure is returned.
    """
    return repair_json(
        json_fd=fd,
        return_objects=True,
        skip_json_loads=skip_json_loads,
        logging=logging,
    )
def from_file(
    filename: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    This function is a wrapper around `load()` so you can pass the filename as string.

    Fix: the file handle is now managed by a context manager, so it is closed
    even when load() raises (the previous open/close pair leaked the handle
    on any exception).
    """
    with open(filename) as fd:
        return load(fd, skip_json_loads, logging)

45
服务器文件/main.py

@ -0,0 +1,45 @@
from flask import Flask, request, jsonify
import os
from checkPlaceName import checkPlaceName
# from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName
from checkDocumentError import getDocumentError
app = Flask(__name__)

# Directory where uploaded documents are stored, relative to the working directory.
UPLOAD_FOLDER = 'uploads'
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
# (another process could create the directory between the two calls).
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a single multipart file upload ('file' field) and store it in UPLOAD_FOLDER.

    Returns 400 when the field is missing or no file was selected, 200 on success.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if file:
        # Fix: basename() strips client-supplied directory components, so a
        # filename like "../../etc/passwd" cannot escape UPLOAD_FOLDER.
        # NOTE(review): on POSIX this does not strip Windows-style "..\\"
        # prefixes — consider werkzeug.utils.secure_filename for full coverage.
        filename = os.path.basename(file.filename)
        file.save(os.path.join(UPLOAD_FOLDER, filename))
        return jsonify({"message": "File uploaded successfully"}), 200
@app.route('/getDocumentError', methods=['GET'])
def getDocumentErrorWeb():
    """Run the document-error check on the file named by the ?filename= query parameter."""
    target = request.args.get('filename')
    return getDocumentError(target)
@app.route('/checkPlaceName', methods=['GET'])
def checkPlaceNameWeb():
    """Run the place-name check on the file named by the ?filename= query parameter."""
    target = request.args.get('filename')
    return checkPlaceName(target)
@app.route('/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
    """Run the repeated-text check on ?filename= within section ?sectionName=.

    Fix: the module-level `from checkRepeatText import checkRepeatText` is
    commented out at the top of this file, so this route raised NameError at
    request time. Import locally so the route works without affecting module
    import for the other routes.
    """
    from checkRepeatText import checkRepeatText
    filename = request.args.get('filename')
    sectionName = request.args.get('sectionName')
    return checkRepeatText(filename, sectionName)
@app.route('/checkCompanyName', methods=['GET'])
def checkCompanyNameWeb():
    """Run the company-name check on the file named by the ?filename= query parameter."""
    target = request.args.get('filename')
    return checkCompanyName(target)
@app.route('/test/<filename>', methods=['GET'])
def test(filename):
    """Echo the path parameter back; a minimal liveness/routing check."""
    return filename
if __name__ == '__main__':
    # Flask development server, reachable from any interface; port 80 needs
    # elevated privileges on most systems.
    # NOTE(review): 0.0.0.0 exposes the dev server to the whole network —
    # confirm a production WSGI server is used for real deployments.
    app.run(host="0.0.0.0",port=80)
Loading…
Cancel
Save