Browse Source

first commit

master
zhouhaibin 5 months ago
commit
6639ac75dc
  1. 8
      .idea/.gitignore
  2. 6
      .idea/encodings.xml
  3. 6
      .idea/inspectionProfiles/profiles_settings.xml
  4. 7
      .idea/misc.xml
  5. 8
      .idea/modules.xml
  6. 10
      .idea/python项目39.iml
  7. BIN
      __pycache__/baidusearch.cpython-39.pyc
  8. BIN
      __pycache__/checkCompanyName.cpython-39.pyc
  9. BIN
      __pycache__/checkDocumentError.cpython-39.pyc
  10. BIN
      __pycache__/checkPlaceName.cpython-39.pyc
  11. BIN
      __pycache__/checkRepeatText.cpython-39.pyc
  12. BIN
      __pycache__/json_repair.cpython-39.pyc
  13. BIN
      __pycache__/main.cpython-39.pyc
  14. BIN
      __pycache__/qwen_agenttext.cpython-39.pyc
  15. BIN
      __pycache__/test.cpython-39.pyc
  16. 258
      baidusearch.py
  17. 64
      cewenj.py
  18. 205
      checkCompanyName.py
  19. 1371
      checkCompanyName.txt
  20. 220
      checkDocumentError.py
  21. 212
      checkPlaceName.py
  22. 292
      checkRepeatText.py
  23. 173
      checkTitleName.py
  24. 176
      daijian方案.py
  25. 712
      json_repair.py
  26. 161
      main.py
  27. 132
      qwen_agenttext.py
  28. 109
      test.py
  29. BIN
      workspace/1.png
  30. BIN
      workspace/image14.png
  31. BIN
      workspace/image15.png
  32. BIN
      workspace/image16.png
  33. BIN
      workspace/image17.png
  34. BIN
      workspace/image18.png
  35. BIN
      workspace/image19.png
  36. BIN
      workspace/image20.png
  37. BIN
      workspace/tools/code_interpreter/05613c9c-c910-455d-8c8b-62b7dc243b2a.png
  38. BIN
      workspace/tools/code_interpreter/1560f103-f2dc-49e3-88c2-35f5d500bc1d.png
  39. BIN
      workspace/tools/code_interpreter/4aa3a1fe-7fc2-440f-8bd9-653ee1721776.png
  40. BIN
      workspace/tools/code_interpreter/54b7ad57-9c89-4977-b49a-eaf7e60b9656.png
  41. BIN
      workspace/tools/code_interpreter/c8cba059-ac85-42b0-b197-1c8e1e7182c9.png
  42. 12
      workspace/tools/code_interpreter/kernel_connection_file_0eb57682-3a22-44c8-bedb-a4871b813c3c_19796.json
  43. 12
      workspace/tools/code_interpreter/kernel_connection_file_113f0326-0345-475c-85c1-86af71d668c0_24876.json
  44. 12
      workspace/tools/code_interpreter/kernel_connection_file_599899c4-4f00-44c1-bba5-1bcc31eb535c_12240.json
  45. 12
      workspace/tools/code_interpreter/kernel_connection_file_a3131ded-afec-43fa-95eb-d2f35548a411_39868.json
  46. 12
      workspace/tools/code_interpreter/kernel_connection_file_b4447d65-4542-4bd2-89ff-b33b5fb00ac5_1068.json
  47. 12
      workspace/tools/code_interpreter/kernel_connection_file_d624f7a6-914d-48c1-b902-4e298f92b671_20484.json
  48. 12
      workspace/tools/code_interpreter/kernel_connection_file_ec74ca73-6455-4a78-96b1-542747f19a25_39260.json
  49. 3
      workspace/tools/code_interpreter/launch_kernel_0eb57682-3a22-44c8-bedb-a4871b813c3c_19796.py
  50. 3
      workspace/tools/code_interpreter/launch_kernel_113f0326-0345-475c-85c1-86af71d668c0_24876.py
  51. 3
      workspace/tools/code_interpreter/launch_kernel_599899c4-4f00-44c1-bba5-1bcc31eb535c_12240.py
  52. 3
      workspace/tools/code_interpreter/launch_kernel_a3131ded-afec-43fa-95eb-d2f35548a411_39868.py
  53. 3
      workspace/tools/code_interpreter/launch_kernel_b4447d65-4542-4bd2-89ff-b33b5fb00ac5_1068.py
  54. 3
      workspace/tools/code_interpreter/launch_kernel_d624f7a6-914d-48c1-b902-4e298f92b671_20484.py
  55. 3
      workspace/tools/code_interpreter/launch_kernel_ec74ca73-6455-4a78-96b1-542747f19a25_39260.py
  56. BIN
      workspace/tools/code_interpreter/temp_image.png
  57. 1
      workspace/tools/doc_parser/53dea512c5e030d7ad12f34dceaecc2a3c5bcb058907ae3495d60e5876b079a2_500
  58. 8699
      workspace/tools/simple_doc_parser/53dea512c5e030d7ad12f34dceaecc2a3c5bcb058907ae3495d60e5876b079a2_ori
  59. 140
      代码段存储.py
  60. 118
      文档一二级标题识别与提取.py
  61. 282
      文档图片提取.py
  62. 133
      服务器文件/checkCompanyName.py
  63. 226
      服务器文件/checkDocumentError.py
  64. 153
      服务器文件/checkPlaceName.py
  65. 160
      服务器文件/checkRepeatText.py
  66. 712
      服务器文件/json_repair.py
  67. 45
      服务器文件/main.py

8
.idea/.gitignore

@ -0,0 +1,8 @@
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

6
.idea/encodings.xml

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/ce.txt" charset="GBK" />
</component>
</project>

6
.idea/inspectionProfiles/profiles_settings.xml

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.9 (venv) (2)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (venv) (2)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/python项目39.iml" filepath="$PROJECT_DIR$/.idea/python项目39.iml" />
</modules>
</component>
</project>

10
.idea/python项目39.iml

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

BIN
__pycache__/baidusearch.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/checkCompanyName.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/checkDocumentError.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/checkPlaceName.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/checkRepeatText.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/json_repair.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/main.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/qwen_agenttext.cpython-39.pyc

Binary file not shown.

BIN
__pycache__/test.cpython-39.pyc

Binary file not shown.

258
baidusearch.py

@ -0,0 +1,258 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Charles on 2018/10/10
# Function:
import sys
import requests
from bs4 import BeautifulSoup
ABSTRACT_MAX_LENGTH = 300 # abstract max length
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
' Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 '
'(KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/41.0.2228.0 Safari/537.36',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, '
'like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) '
'Gecko/20070404 K-Ninja/2.1.3',
'Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; '
'en-US) iNet Browser 4.7',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) '
'Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866'
]
# 请求头信息
HEADERS = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Content-Type": "application/x-www-form-urlencoded",
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
"Referer": "https://www.baidu.com/",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9"
}
baidu_host_url = "https://www.baidu.com"
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&wd="
session = requests.Session()
session.headers = HEADERS
def search(keyword, num_results=10, debug=0):
    """
    Search Baidu for *keyword* and collect up to *num_results* entries.

    :param keyword: query string; falsy input aborts the search
    :param num_results: maximum number of result dicts to return
    :param debug: when truthy, print per-page progress information
    :return: list of result dicts (title/abstract/url/rank), or None when
        *keyword* is empty
    """
    if not keyword:
        return None
    collected = []
    page = 1
    # Start from the first results page; each parse returns the next page URL.
    next_page = baidu_search_url + keyword
    while len(collected) < num_results:
        batch, next_page = parse_html(next_page, rank_start=len(collected))
        if batch:
            collected.extend(batch)
            if debug:
                print("---searching[{}], finish parsing page {}, results number={}: ".format(keyword, page, len(batch)))
                for entry in batch:
                    print(str(entry))
        if not next_page:
            # No next-page link: we reached the final results page.
            if debug:
                print(u"already search the last page。")
            break
        page += 1
    if debug:
        print("\n---search [{}] finished. total results number={}".format(keyword, len(collected)))
    if len(collected) > num_results:
        return collected[:num_results]
    return collected
def parse_html(url, rank_start=0, debug=0):
    """
    Fetch one Baidu results page and parse its result entries.

    :param url: results-page URL to fetch
    :param rank_start: rank already assigned to earlier results; each parsed
        entry receives the next consecutive rank
    :param debug: when truthy, print parse exceptions
    :return: (list of result dicts, next-page URL or None); returns
        (None, None) when the page cannot be fetched or parsed at all
    """
    try:
        res = session.get(url=url)
        res.encoding = "utf-8"
        root = BeautifulSoup(res.text, "lxml")
        list_data = []
        div_contents = root.find("div", id="content_left")
        for div in div_contents.contents:
            # Skip children that are not Tag elements (e.g. bare text nodes).
            if type(div) != type(div_contents):
                continue
            class_list = div.get("class", [])
            if not class_list:
                continue
            # Only "c-container" divs are actual result cards.
            if "c-container" not in class_list:
                continue
            title = ''
            # NOTE(review): this local reuse shadows the `url` parameter.
            url = ''
            abstract = ''
            try:
                # Extract title / link / abstract; the HTML shape differs per
                # result template, hence the branches below.
                if "xpath-log" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        if div.a:
                            url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                elif "result-op" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        # abstract = div.text.strip()
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                else:
                    # Remaining templates are distinguished by the "tpl" attribute.
                    if div.get("tpl", "") != "se_com_default":
                        if div.get("tpl", "") == "se_st_com_abstract":
                            if len(div.contents) >= 1:
                                title = div.h3.text.strip()
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                        else:
                            if len(div.contents) >= 2:
                                if div.h3:
                                    title = div.h3.text.strip()
                                    url = div.h3.a['href'].strip()
                                else:
                                    title = div.contents[0].text.strip()
                                    url = div.h3.a['href'].strip()
                                # abstract = div.contents[-1].text
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                    else:
                        if div.h3:
                            title = div.h3.text.strip()
                            url = div.h3.a['href'].strip()
                        else:
                            title = div.contents[0].text.strip()
                            url = div.h3.a['href'].strip()
                        if div.find("div", class_="c-abstract"):
                            abstract = div.find("div", class_="c-abstract").text.strip()
                        elif div.div:
                            abstract = div.div.text.strip()
                        else:
                            abstract = div.text.strip()
            except Exception as e:
                # A malformed card only skips this entry, not the whole page.
                if debug:
                    print("catch exception duration parsing page html, e={}".format(e))
                continue
            # Truncate over-long abstracts.
            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]
            rank_start+=1
            list_data.append({"title": title, "abstract": abstract, "url": url, "rank": rank_start})
        # Locate the next-page button.
        next_btn = root.find_all("a", class_="n")
        # Last page: only a "previous page" link remains, so return data with
        # no next-page URL.
        if len(next_btn) <= 0 or u"上一页" in next_btn[-1].text:
            return list_data, None
        next_url = baidu_host_url + next_btn[-1]["href"]
        return list_data, next_url
    except Exception as e:
        if debug:
            print(u"catch exception duration parsing page html, e:{}".format(e))
        return None, None
def run():
    """
    CLI entry point: read keyword / result count / debug flag from argv,
    falling back to an interactive prompt, then run the search and print
    the results.
    """
    default_keyword = u"长风破浪小武哥"
    num_results = 10
    debug = 0
    prompt = """
baidusearch: not enough arguments
[0]keyword: keyword what you want to search
[1]num_results: number of results
[2]debug: debug switch, 0-close, 1-open, default-0
eg: baidusearch NBA
baidusearch NBA 6
baidusearch NBA 8 1
"""
    # Three args: keyword, result count, debug flag.
    if len(sys.argv) > 3:
        keyword = sys.argv[1]
        try:
            num_results = int(sys.argv[2])
            debug = int(sys.argv[3])
        except:
            # NOTE(review): bare except silently keeps defaults on bad input.
            pass
    elif len(sys.argv) > 1:
        keyword = sys.argv[1]
    else:
        # No args: show usage and ask interactively.
        print(prompt)
        keyword = input("please input keyword: ")
        # sys.exit(1)
    if not keyword:
        keyword = default_keyword
    print("---start search: [{}], expected number of results:[{}].".format(keyword, num_results))
    results = search(keyword, num_results=num_results, debug=debug)
    # search() returns None for empty keywords / total failure.
    if isinstance(results, list):
        print("search results:(total[{}]items.)".format(len(results)))
        for res in results:
            print("{}. {}\n {}\n {}".format(res['rank'], res["title"], res["abstract"], res["url"]))
    else:
        print("start search: [{}] failed.".format(keyword))
if __name__ == '__main__':
run()

64
cewenj.py

@ -0,0 +1,64 @@
# Experiment script: ask a locally-served Qwen model (via qwen-agent) to find
# the chapter name describing project construction, using RAG over a file.
from qwen_agent.agents import Assistant
# from qwen_agent.agents.doc_qa import ParallelDocQA
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
prompt='''
请找是描述项目建设的章节名称
'''
# NOTE(review): the {'file': ''} attachment below is an empty path — presumably
# a real document path should be supplied; confirm before running.
messages = [{'role': 'user', 'content': [{'text': prompt}, {'file': ''}]}]
for rsp in bot.run(messages):
    print(rsp)
# messages = [{'role': 'user', 'content': [{'text':prompt}]}]
# runList=[]
# for rsp in bot.run(messages):
#     print(rsp)
import re
# Disabled experiment: read a .docx, collect non-empty paragraphs, and search
# for paragraphs mentioning a keyword.
# from docx import Document
#
# document = Document('747991ddb29a49da903210959076bb9f.docx')
# # Read the docx content paragraph by paragraph
# levelList = []
# words = []
# addStart = False
# levelText = ""
# i = 0
# for paragraph in document.paragraphs:
#     # Determine this paragraph's heading level
#     # isTitle() stands in here; see the method described below
#     text = paragraph.text
#     if text.strip():  # non-empty check
#         # print("non-empty")
#         words.append(text)
#         # level = isTitle(paragraph)
#         # if(addStart and level=="0"):
#         #     addStart=False
#         # if(level=="0" and text.find("详细设计方案")>=0):
#         #     addStart=True
#         # if level:
#         #     levelList.append("{}:".format(level)+paragraph.text)
#         #     levelText=text
#         # else:
#         #     if addStart:
#         #         if(text.startswith("图") or text.startswith("注:")):
#         #             continue
#         #         i=i+1
#         #         words.append("第{}个段落:".format(i)+text)
#
# # Join all paragraph texts into one newline-separated string
# print(len(words))
# text = '\n'.join(words)
# paragraphs = re.findall(r'.*?' + re.escape('宁波市') + r'.*?\n', text)
# print(paragraphs)
from langchain_community.document_loaders import TextLoader
loader = TextLoader('checkRepeatText.txt')
docs = loader.load()

205
checkCompanyName.py

@ -0,0 +1,205 @@
# -*- coding:utf-8 -*-
import time
from docx import Document
from paddlenlp import Taskflow
from qwen_agent.agents import Assistant
import re
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.

    Replacement for python-docx's loader: relationships whose target is the
    literal 'NULL' (emitted by some document generators) are skipped so such
    .docx files can still be opened instead of raising on load.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken relationships pointing at a non-existent NULL target.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkCompanyName")
prompt = '''
.根据上述文本判断是否为具体的公司或组织名称你可以使用工具利用互联网查询
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校行业类型其他]选项中选择答案,
回答格式[{companyName名称,"回答":"答案"}{companyName名称,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
'''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"
)
def getDocxToTextAll(name):
    """
    Extract all non-empty paragraph text from a .docx file and write it,
    newline-separated, to checkCompanyName.txt (UTF-8).

    :param name: path of the .docx document to read
    :raises: propagates python-docx errors when the file cannot be opened
    """
    document = Document(name)
    # Collect every non-empty paragraph, in document order.
    # (Removed dead locals levelList/addStart/levelText/i kept from an older
    # heading-extraction experiment.)
    words = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            words.append(text)
    # Join with newlines and persist for the later pipeline steps.
    text = '\n'.join(words)
    with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
def companyNameTask(text):
    """
    Generator: mine company/organization names from *text* with PaddleNLP's
    knowledge-mining Taskflow.

    Yields progress strings while working; the FINAL yielded item is the
    de-duplicated list of extracted organization names.
    """
    yield "文档公司或组织名称检查---启动中...."
    wordtag = Taskflow("knowledge_mining",device_id=0)
    batchNum=20  # sentences per Taskflow call
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Total sentence count (variable name says chars, but it counts sentences).
    total_chars = len(sentences)
    # Number of batches.
    num_chunks = math.ceil(total_chars / batchNum)
    # Split into batches of batchNum sentences each.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"
        wenBen=".".join(chunk)
        try:
            res = wordtag(wenBen)
        except Exception as e:
            logging.warning(chunk)
            # NOTE(review): root `logging` is used here instead of the module
            # `logger`, and `e` is passed with no %s placeholder — verify.
            logging.warning("文档公司或组织名称检查---词类分析出错",e)
            continue
        isplace = False
        for zuhe in res[0]['items']:
            # If the previous token was an organization and this one is too,
            # append it to the previous entry (names can span several tokens).
            zhi = zuhe.get("wordtag_label")
            if isplace:
                name = placeList[len(placeList) - 1]
                if zhi.find("组织机构类") >= 0: # or zuhe[1] == "ns"
                    isplace = True
                    new_text = zuhe['item'].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
            if zhi.find("组织机构类") >= 0:
                isplace = True
                new_text = zuhe['item'].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    # All batches processed.
    yield "文档公司或组织名称检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList=list(dict.fromkeys(placeList))
    yield placeList
def checkCompanyName(filename):
    """
    Generator pipeline: extract text from *filename* (.docx), mine company /
    organization names, have the LLM classify them, and yield progress
    strings followed by a final report string.

    :param filename: path of the .docx document to check
    """
    yield f"文档公司或组织名称检查---开始处理文档..."
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        # Consistency fix: use the module logger instead of the root logger.
        logger.warning(e)
        yield "文档公司或组织名称检查---文档无法打开,请检查文档内容"
        return
    with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档公司或组织名称检查---开始解析文档..."
    final_list = []  # robustness: defined even if the task yields no list
    for item in companyNameTask(gettext):
        if isinstance(item, str):
            yield item  # progress message
        else:
            final_list = item  # last yielded item is the name list
    propnStr = ",".join(final_list)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档公司或组织名称检查---结果生成中..."
    cishu = 0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
        yield "文档公司或组织名称检查---结果生成中" + '.' * cishu
        cishu += 1
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = []
    # NOTE(review): '非泛化的公司或组织名称' is not among the answer options
    # listed in the prompt above — confirm which LLM answers should be kept.
    for place in parsed_data:
        try:
            if place['回答'] == '非泛化的公司或组织名称':
                error_places.append(place)
        except Exception as e:
            logger.warning(place)
            # Bug fix: `e` was passed with no %s placeholder and was dropped.
            logger.warning("文档公司或组织名称检查---组织提出出错: %s", e)
            continue
    logger.info(error_places)
    returnInfo = "发现异常公司或组织名称<br>"
    if len(error_places) > 0:
        for t in error_places:
            keyword = t['companyName'].replace("\n", "")
            # Find a paragraph containing the name, for context in the report.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            if paragraphs:
                t["yuanwen"] = paragraphs[0]
                yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
            else:
                # Bug fix: paragraphs[0] raised IndexError when the
                # newline-stripped name was not found verbatim in the text.
                t["yuanwen"] = keyword
                yuanwen = f"**{keyword}**"
            returnInfo += "原文:" + yuanwen + "<br>异常公司或组织名称:**" + keyword + "**!请注意" + "<br>"
        logger.info(returnInfo)
        yield returnInfo
    else:
        yield "**未发现异常公司或组织名称**<br>"

1371
checkCompanyName.txt

File diff suppressed because it is too large

220
checkDocumentError.py

@ -0,0 +1,220 @@
# -*- coding:utf-8 -*-
# from pycorrector import MacBertCorrector
# m = MacBertCorrector("shibing624/macbert4csc-base-chinese")
from qwen_agent.agents import Assistant
from docx import Document
from pprint import pprint
import re
from paddlenlp import Taskflow
import json
import time
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import asyncio
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.

    Replacement for python-docx's loader: relationships whose target is the
    literal 'NULL' (emitted by some document generators) are skipped so such
    .docx files can still be opened instead of raising on load.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken relationships pointing at a non-existent NULL target.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkDocumentError")
llm_cfg = {
# 'model': 'qwen1.5-72b-chat',
'model': "qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
)
# prompt='''
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案,
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# '''
prompt = '''
请回答以上问题[]选项中选择答案,原文内容标点符号保持不变如果有错请给出解析没有错则不用给解析
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}]不做过多的解释,严格按回答格式作答;
'''
def getDocxToTextAll(name):
    """
    Extract all non-empty paragraph text from a .docx file and write it,
    newline-separated, to checkDocumentError.txt (UTF-8).

    :param name: path of the .docx document to read
    :raises: propagates python-docx errors when the file cannot be opened
    """
    document = Document(name)
    # Collect every non-empty paragraph, in document order.
    # (Removed dead locals levelList/addStart/levelText/i kept from an older
    # heading-extraction experiment.)
    words = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            words.append(text)
    # Join with newlines and persist for the later pipeline steps.
    text = '\n'.join(words)
    with open("checkDocumentError.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
def getDocumentError(filename):
    """
    Generator pipeline for typo checking: extract the document text, run the
    correction task, and yield progress strings followed by a final report.

    :param filename: path of the .docx document to check
    """
    yield f"文档纠错---开始处理文档..."
    try:
        getDocxToTextAll(filename)
    except Exception as exc:
        logger.warning(exc)
        yield "文档无法打开,请检查文档内容"
        return
    with open("checkDocumentError.txt", "r", encoding='utf-8') as fh:
        gettext = fh.read()
    yield f"文档纠错---开始解析文档..."
    typos = []
    for step in documentErrorTask(gettext):
        if not isinstance(step, str):
            typos = step      # the final yielded item is the typo list
        else:
            yield step        # progress message
    if typos:
        report = "发现错别字<br>"
        for entry in typos:
            source_text = entry["placeName"].replace("\n", "")
            suggestion = entry["jianyi"].replace("\n", "")
            report += f"原文:{source_text}<br>建议:**{suggestion}**<br>"
        yield report
        logger.info(report)
    else:
        yield "**未发现错别字**"
def documentErrorTask(text):
    """
    Generator: run PaddleNLP text-correction over *text* in sentence batches,
    then ask the LLM to confirm the suspected typos.

    Yields progress strings while working; the FINAL yielded item is the list
    of confirmed typo dicts (keys include "placeName" and "jianyi").
    """
    yield "文档纠错---启动中...."
    corrector = Taskflow("text_correction", device_id=1)
    batchNum = 20  # sentences per corrector call
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    total_chars = len(sentences)  # sentence count (name is historical)
    num_chunks = math.ceil(total_chars / batchNum)
    # Split into batches of batchNum sentences each.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    # (Removed unused local `placeList`.)
    err = []
    for i, chunk in enumerate(chunks):
        yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
        try:
            res = corrector(chunk)
        except Exception as e:
            logger.warning(chunk)
            # Bug fix: `e` was passed with no %s placeholder and was dropped.
            logger.warning("文档纠错--错别字识别出错: %s", e)
            continue
        lines_with_greeting = [place for place in res if len(place['errors']) > 0]
        if len(lines_with_greeting) > 0:
            num = 0
            wenti = []         # numbered questions for the LLM
            keyword_list = []  # original sentences, indexed by question number
            for t in lines_with_greeting:
                temp_errorWords = []
                keyword = t['source']
                keyword_list.append(keyword)
                for item in t["errors"]:
                    for key, value in item['correction'].items():
                        temp_errorWords.append(key)
                wenti.append(
                    "{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
                num += 1
            words = "\n".join(wenti)
            messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
            runList = []
            yield f"文档纠错---内容解析中..."
            cishu = 0
            for rsp in bot.run(messages):
                runList.append(rsp)
                if cishu > 3:
                    cishu = 0
                yield "文档纠错---内容解析中" + '.' * cishu
                cishu += 1
            data = runList[len(runList) - 1][0]["content"]
            parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
            resListerr = []
            for place in parsed_data:
                try:
                    # NOTE(review): comparing 回答 to the empty string looks
                    # suspicious given the prompt asks for an answer — confirm.
                    if place['回答'] == '':
                        # Map the LLM's numeric index back to the sentence.
                        place["placeName"] = keyword_list[int(place["placeName"])]
                        place["jianyi"] = place["解析"]
                        resListerr.append(place)
                except Exception as e:
                    logger.warning(parsed_data)
                    logger.warning(place)
                    # Bug fix: `e` was passed with no %s placeholder and dropped.
                    logger.warning("文档纠错--错别字提取出错: %s", e)
                    continue
            if (len(resListerr) > 0):
                err.extend(resListerr)
    # Bug fix: this completion message previously said 文档地名检查 (place-name
    # check), copy-pasted from checkPlaceName.py.
    yield "文档纠错---文档解析完成"
    yield err

212
checkPlaceName.py

@ -0,0 +1,212 @@
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.

    Replacement for python-docx's loader: relationships whose target is the
    literal 'NULL' (emitted by some document generators) are skipped so such
    .docx files can still be opened instead of raising on load.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken relationships pointing at a non-existent NULL target.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
log_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'standard': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'formatter': 'standard',
'level': logging.INFO,
},
'file': {
'class': 'logging.FileHandler',
'filename': 'Logger.log',
'formatter': 'standard',
'level': logging.INFO,
},
},
'loggers': {
'': {
'handlers': ['console', 'file'],
'level': logging.INFO,
'propagate': True,
},
}
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkPlaceName")
prompt='''
.上述文本判断地名是否正确你可以使用工具利用互联网查询你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# prompt='''
# .请回答以上问题,
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# 不做过多的解释,严格按回答格式作答;
# '''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
)
#获取全文内容
def getDocxToTextAll(docxPath):
    """
    Extract all non-empty paragraph text from a .docx file and write it,
    newline-separated, to checkPlaceName.txt (UTF-8).

    :param docxPath: path of the .docx document to read
    :raises: propagates python-docx errors when the file cannot be opened
    """
    document = Document(docxPath)
    # Collect every non-empty paragraph, in document order.
    # (Removed dead locals levelList/addStart/levelText/i kept from an older
    # heading-extraction experiment.)
    words = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            words.append(text)
    # Join with newlines and persist for the later pipeline steps.
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
#得到全文和地名有关的内容
def placeNameTask(text):
    """
    Generator: extract place / organization names from *text* with PaddleNLP's
    NER Taskflow.

    Yields progress strings while working; the FINAL yielded item is the
    de-duplicated list of extracted names.
    """
    yield "文档地名检查---启动中...."
    tagTask = Taskflow("ner",device_id=2)
    batchNum=20  # sentences per NER call
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Total sentence count (variable name says chars, but it counts sentences).
    total_chars = len(sentences)
    # Number of batches.
    num_chunks = math.ceil(total_chars / batchNum)
    # Split into batches of batchNum sentences each.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"
        wenBen=".".join(chunk)
        try:
            res = tagTask(wenBen)
        except Exception as e:
            logger.warning(chunk)
            # NOTE(review): `e` is passed with no %s placeholder — verify.
            logger.warning("文档地名检查---解析地名出错",e)
            continue
        isplace = False
        for zuhe in res:
            # If the previous token was a place/org and this one is too, append
            # it to the previous entry (names can span several tokens).
            if isplace:
                name = placeList[len(placeList) - 1]
                if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0: # or zuhe[1] == "ns"
                    isplace = True
                    new_text = zuhe[0].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
            if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:
                isplace = True
                new_text = zuhe[0].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    # All batches processed.
    yield "文档地名检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList=list(dict.fromkeys(placeList))
    yield placeList
#主方法
def checkPlaceName(filename):
    """
    Generator pipeline: extract text from *filename* (.docx), detect place /
    organization names with NER, have the LLM verify them, and yield progress
    strings followed by a final report string.

    :param filename: path of the .docx document to check
    """
    yield f"文档地名检查---开始处理文档..."
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        logger.warning(e)
        yield "文档地名检查---文档无法打开,请检查文档内容"
        return
    with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档地名检查---开始解析文档..."
    final_list = []  # robustness: defined even if placeNameTask yields no list
    for item in placeNameTask(gettext):
        if isinstance(item, str):
            yield item  # progress message
        else:
            final_list = item  # last yielded item is the name list
    propnStr = ",".join(final_list)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档地名检查---结果生成中..."
    cishu = 0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
        yield "文档地名检查---结果生成中" + '.' * cishu
        cishu += 1
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = []
    # Keep only the names the LLM judged as wrong.
    for place in parsed_data:
        try:
            if place['回答'] == '错误':
                error_places.append(place)
        except Exception as e:
            logger.warning(place)
            # Bug fix: `e` was passed with no %s placeholder and was dropped.
            logger.warning("文档地名检查---组织提出出错: %s", e)
            continue
    logger.info(error_places)
    returnInfo = "发现异常地名<br>"
    if len(error_places) > 0:
        for t in error_places:
            keyword = t['placeName'].replace("\n", "")
            # Find a paragraph containing the name, for context in the report.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            if paragraphs:
                yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
            else:
                # Bug fix: paragraphs[0] raised IndexError when the
                # newline-stripped name was not found verbatim in the text.
                yuanwen = f"**{keyword}**"
            returnInfo += "原文:" + yuanwen + "<br>出现异常地名:**" + keyword + "**!请注意" + "<br>"
        yield returnInfo
        logger.info(returnInfo)
    else:
        # Bug fix: message previously read "未发现发现异常地名" (doubled 发现).
        yield "**未发现异常地名**"

292
checkRepeatText.py

@ -0,0 +1,292 @@
import uuid
from langchain_chroma import Chroma
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qwen_agent.agents import Assistant
import json_repair
from paddlenlp import Taskflow
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
device_id=0
import re
import time
from docx import Document
import shutil
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
import logging
import logging.config
# Logging setup: everything at INFO and above goes to both the console and Logger.log.
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        # Root logger: all module loggers propagate here.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
}
logging.config.dictConfig(log_config)
logger = logging.getLogger("checkRepeatText")
def load_from_xml_v2(baseURI, rels_item_xml):
    """Patched relationship loader for python-docx.

    Builds a |_SerializedRelationships| collection from *rels_item_xml*,
    skipping relationships whose target is the bogus 'NULL' reference that
    some document generators emit (these crash the stock loader).
    Returns an empty collection when *rels_item_xml* is None.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for rel in parse_xml(rels_item_xml).Relationship_lst:
        # Drop broken relationships pointing at '../NULL' / 'NULL'.
        if rel.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, rel))
    return serialized
_SerializedRelationships.load_from_xml = load_from_xml_v2
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from a '<w:outlineLvl w:val="N"/>' element.

    Args:
        inputXml: raw XML of a paragraph or style element known to contain
            a '<w:outlineLvl' tag.

    Returns:
        The level number as a string (e.g. "0" for a level-1 heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape (SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for plain body text.

    Returns None for blank paragraphs and paragraphs without an outline
    level; otherwise the level as a string ("0" = top-level heading,
    "1" = second level, "2" = third level), taken either from the
    paragraph's own XML or from its style inheritance chain.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None
    # An outline level set directly on the paragraph wins.
    para_xml = paragraph._p.xml
    if '<w:outlineLvl' in para_xml:
        return getOutlineLevel(para_xml)
    # Otherwise walk the style chain (style -> base_style -> ...) looking for one.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style
    # No outline level anywhere: ordinary body text.
    return None
#寻找标题名称
def findTitleName(docxPath):
    """Locate the top-level chapter holding the detailed design content.

    Generator: first yields a progress message, then either the matched
    chapter title or the fixed "not found" message (callers compare against
    that exact string).

    Args:
        docxPath: path to the .docx file to scan.
    """
    yield '文档相似性检查----检查是否存在详细设计方案'
    document = Document(docxPath)
    # Walk the document paragraph by paragraph, collecting level-0 headings.
    titleWords=[]
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            # Heading level of this paragraph (see isTitle()).
            level = isTitle(paragraph)
            if level=="0":
                firstTitle+=1
                secondTitle = 0
                # Attachment chapters ("附件...") are not candidates.
                if(text.find("附件")>=0):
                    continue
                # NOTE(review): "一级标题:".format(firstTitle) has no {} placeholder,
                # so the chapter number is silently dropped — confirm intent.
                titleWords.append("一级标题:".format(firstTitle)+text)
            elif level=="1":
                secondTitle+=1
                sanjiTitle=0
            elif level=="2":
                sanjiTitle += 1
    # Dedicated local LLM endpoint used only for this classification step.
    findTitleName_llm_cfg = {
        #'model': 'qwen1.5-72b-chat',
        'model':"qwen2-72b",
        'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
        # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
                                  name='Assistant',
                                  )
    prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案详细建设方案为最相关的优先选择
类似设计方案服务方案建设方案为次相关次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个{"name":"一级标题名称","answer":"存在"}{"name":"","answer":"不存在"}不做过多的解释,严格按回答格式作答
'''
    messages = [({'role': 'user', 'content': "\n".join(titleWords)+prompt})]
    runList=[]
    for rsp in findTitleName_bot.run(messages):
        runList.append(rsp)
    # The assistant streams partial responses; only the last one is complete.
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    logger.info(parsed_data)
    if(parsed_data["answer"]=="存在"):
        yield parsed_data["name"]
    else:
        yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
#获取文档中 详细设计方案 章节的所有内容
def getDocxToText(docxPath,titleName,vector_store_path):
    """Extract the body text of chapter *titleName* from *docxPath*, index it
    into a Chroma vector store, and return the pieces needed for similarity
    search.

    Args:
        docxPath: path to the .docx file.
        titleName: level-0 chapter title to extract (substring match either way).
        vector_store_path: directory for the throw-away Chroma store.

    Returns:
        (words, uuids, vectorstore): collected paragraph strings, their Chroma
        ids, and the populated vector store.

    Raises:
        Exception: when no qualifying paragraph was collected.
    """
    document = Document(docxPath)
    # Walk the docx paragraph by paragraph.
    levelList=[]
    words=[]
    addStart = False   # True while we are inside the target chapter
    levelText=""       # heading breadcrumb prefixed to each collected paragraph
    i = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            if titleName:
                level = isTitle(paragraph)
                # Any new level-0 heading ends collection of the chapter.
                if(addStart and level=="0"):
                    addStart=False
                # Entering the chapter: a level-0 heading matching titleName.
                if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)):
                    addStart=True
                if level:
                    levelList.append("{}".format(level)+paragraph.text)
                    levelText=f"{int(level)+1}级标题-"+text
                else:
                    if addStart:
                        # NOTE(review): text.startswith("") is always True, so this
                        # `continue` fires for every body paragraph and the append
                        # below looks unreachable — a separator character was likely
                        # lost from this source; confirm against the original file.
                        if(text.startswith("") or text.startswith("注:")):
                            continue
                        # Keep only substantial paragraphs (> 30 chars).
                        if(len(text)>30):
                            i=i+1
                            words.append("{}".format(levelText)+text)
    # Join all collected paragraphs, newline-separated.
    if len(words)==0:
        raise Exception("checkRepeatText,获取长度为0")
    text = '\n'.join(words)
    # Persist the text for the langchain TextLoader.
    # NOTE(review): open() without encoding= depends on the locale default;
    # consider encoding='utf-8'.
    with open("checkRepeatText.txt", 'w', ) as txt_file:
        txt_file.write(text)
    time.sleep(3)
    loader = TextLoader(file_path='checkRepeatText.txt')
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10, add_start_index=True,
                                                   separators=["\n\n", "\n"])
    splits = text_splitter.split_documents(docs)
    uuids = []
    for i in range(len(splits)):
        uuids.append(str(uuid.uuid4()))
    logging.info(f"checkRepeatTextuuidLen{len(uuids)}")
    vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
    vectorstore.add_documents(documents=splits, ids=uuids)
    # Poll until the freshly added documents become searchable.
    while True:
        time.sleep(0.3)
        ress = vectorstore.similarity_search(words[0])
        if (len(ress) > 0):
            break
    return words,uuids,vectorstore
# @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename):
    """Scan the detailed-design chapter of *filename* for near-duplicate passages.

    Generator: yields progress strings for the UI, then a final report listing
    pairs of paragraphs whose text-similarity score exceeds 0.90.
    """
    yield "文档相似性检查---启动中...."
    # One throw-away Chroma store per request so concurrent checks don't collide.
    vector_store_path="vector_store"+str(uuid.uuid4())
    for titleName in findTitleName(filename):
        yield titleName
        if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"):
            try:
                yield "文档相似性检查----文档内容解析中"
                words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path)
            except Exception as e:
                yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败"
                return
            global device_id
            # NOTE(review): device_id is declared global but a fixed device 3 is
            # used here — confirm which GPU this should run on.
            similarity = Taskflow("text_similarity",device_id=3)
            reslist = []
            count = 0
            for i in words:
                count += 1
                yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}"
                result = vectorstore.similarity_search(i)
                # NOTE(review): the empty-string split/find arguments below look
                # like a lost separator character (cf. the commented ':' variant)
                # — verify against the original file.
                textTag = i.split("")[0]
                for content in result:
                    text = content.page_content
                    tag = text.split("")[0].replace('\n', '')
                    # Skip hits from the same heading as the query paragraph.
                    if (textTag.find(tag) >= 0):
                        continue
                    try:
                        res = similarity([[i[i.find('') + 1:], text[text.find('') + 1:]]])
                    except Exception as e:
                        logger.warning("文档相似性检查--发生异常:",e)
                        logger.warning(i)
                        logger.warning(text)
                        # BUG FIX: without this `continue`, `res` below was unbound
                        # (NameError) or stale from the previous pair.
                        continue
                    if (res[0]["similarity"] > 0.90):
                        # Record the pair unless this paragraph was already reported.
                        if (len(reslist) > 0):
                            isExist = False
                            for neirong in reslist:
                                if i in neirong.values():
                                    isExist = True
                                    break
                            if not isExist:
                                reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
                        else:
                            reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
            # Drop the per-request vector store now that scanning is done.
            shutil.rmtree(vector_store_path)
            logger.info("已删除")
            logger.info(reslist)
            resInfo=f"{titleName}章节,发现相似内容:<br>"
            if(len(reslist)>0):
                for res in reslist:
                    resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find('')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find('') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find('')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find('') + 1:]+"<br>以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】<br>"
                yield resInfo
                logger.info(resInfo)
            else:
                yield "未发现相似内容"

173
checkTitleName.py

@ -0,0 +1,173 @@
from docx import Document
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
def load_from_xml_v2(baseURI, rels_item_xml):
    """Patched relationship loader for python-docx.

    Creates a |_SerializedRelationships| collection from *rels_item_xml*,
    dropping relationships that target the bogus 'NULL' reference emitted by
    some generators. An empty collection is returned for a None input.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for rel in parse_xml(rels_item_xml).Relationship_lst:
        # Ignore broken '../NULL' / 'NULL' targets.
        if rel.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, rel))
    return serialized
_SerializedRelationships.load_from_xml = load_from_xml_v2
import logging
import logging.config
# Logging setup: INFO and above to both the console and Logger.log.
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        # Root logger: all module loggers propagate here.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
}
logging.config.dictConfig(log_config)
# NOTE(review): logger is named "checkCompanyName" although this module checks
# title structure — looks copy-pasted from checkCompanyName.py; confirm.
logger = logging.getLogger("checkCompanyName")
# NOTE(review): hard-coded API key committed to source — rotate it and read it
# from an environment variable instead.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope',  # base_url, also known as api_base
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                )
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from a '<w:outlineLvl w:val="N"/>' element.

    Args:
        inputXml: raw XML of a paragraph or style element known to contain
            a '<w:outlineLvl' tag.

    Returns:
        The level number as a string (e.g. "0" for a level-1 heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape (SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for plain body text.

    None is returned for blank paragraphs and for paragraphs without any
    outline level; otherwise the level is returned as a string ("0" for a
    top-level heading, "1" second level, "2" third level), found either in
    the paragraph's own XML or in its style inheritance chain.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None
    # Outline level set directly on the paragraph takes precedence.
    para_xml = paragraph._p.xml
    if '<w:outlineLvl' in para_xml:
        return getOutlineLevel(para_xml)
    # Otherwise search the style chain (style -> base_style -> ...).
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style
    # Nothing found: ordinary body text.
    return None
#获取文档中 详细设计方案 章节的所有内容
def getDocxToTitleName(docxPath):
    """Return the text of every top-level heading (outline level "0") in *docxPath*.

    Args:
        docxPath: path to a .docx file readable by python-docx.

    Returns:
        list[str]: heading texts in document order.
    """
    document = Document(docxPath)
    words = []
    # Walk every paragraph, keeping only non-empty level-0 headings.
    # (Dead locals copied from a sibling function were removed.)
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            if isTitle(paragraph) == "0":
                words.append(text)
    return words
def checkTitleName(filename):
    """Check the document's top-level headings against the template file
    'ce模板.txt': for each template line, ask the LLM whether a matching
    heading exists and report the ones that don't.

    Generator yielding progress strings and a final markdown report.
    """
    yield '文档结构检查----启动中'
    # One expected heading per template line.
    with open("ce模板.txt", "r",encoding='utf-8') as f:
        gettext = f.readlines()
    count=0
    reserr = []  # template lines with no matching heading
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
        print(e)
        yield "文档无法打开,请检查文档内容"
        return
    for text in gettext:
        count+=1
        # NOTE(review): punctuation appears to have been stripped from this
        # prompt (compare the commented draft in daijian方案.py) — verify the
        # wording; "回到" is likely a typo for "回答".
        prompt = f'''
        \n 这些是文章的标题请问{text}在标题中是否可以配对的若有请指出是哪个标题若没有请回到不存在
        '''
        xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        strword = "\n".join(word)+prompt+xushang
        messages = [{'role': 'user', 'content': [{'text':strword}]}]
        runList = []
        cishu = 0
        for rsp in bot.run(messages):
            runList.append(rsp)
        # Only the last streamed response is complete.
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        print(parsed_data)
        if(parsed_data["answer"]=="不存在"):
            reserr.append(text)
    resInfo="文档结构存在异常:<br>"
    if(len(reserr)>0):
        for i in reserr:
            resInfo+="**"+i.replace('\n','')+"**<br>"
        logger.info(resInfo)
        yield resInfo
    else:
        yield "文档结构未发现异常"

176
daijian方案.py

@ -0,0 +1,176 @@
from docx import Document
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import math
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
def load_from_xml_v2(baseURI, rels_item_xml):
    """Patched relationship loader for python-docx.

    Loads a |_SerializedRelationships| collection from *rels_item_xml*,
    filtering out relationships whose target is the invalid 'NULL'
    reference some generators produce. None input yields an empty
    collection.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for rel in parse_xml(rels_item_xml).Relationship_lst:
        # Skip broken '../NULL' / 'NULL' targets.
        if rel.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, rel))
    return serialized
_SerializedRelationships.load_from_xml = load_from_xml_v2
# NOTE(review): hard-coded API key committed to source — rotate it and load it
# from an environment variable instead.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope',  # base_url, also known as api_base
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
# Shared assistant instance used by checkTitleName().
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                )
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from a '<w:outlineLvl w:val="N"/>' element.

    Args:
        inputXml: raw XML of a paragraph or style element known to contain
            a '<w:outlineLvl' tag.

    Returns:
        The level number as a string (e.g. "0" for a level-1 heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape (SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for plain body text.

    Blank paragraphs and paragraphs without an outline level return None;
    otherwise the level comes back as a string ("0" top-level, "1" second,
    "2" third), read from the paragraph XML or its style chain.
    """
    # Blank lines are never headings.
    if not paragraph.text.strip():
        return None
    # Prefer an outline level set directly on the paragraph.
    para_xml = paragraph._p.xml
    if '<w:outlineLvl' in para_xml:
        return getOutlineLevel(para_xml)
    # Fall back to the style inheritance chain.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style
    return None
#获取文档中 详细设计方案 章节的所有内容
def getDocxToTitleName(docxPath):
    """Return the text of every top-level heading (outline level "0") in *docxPath*.

    Args:
        docxPath: path to a .docx file readable by python-docx.

    Returns:
        list[str]: heading texts in document order.
    """
    document = Document(docxPath)
    words = []
    # Keep only non-empty level-0 headings; dead locals copied from a
    # sibling function were removed.
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            if isTitle(paragraph) == "0":
                words.append(text)
    return words
def checkTitleName(filename):
    """Compare the document's top-level headings against the template in
    'ce模板.txt' and report template entries with no matching heading.

    Generator yielding progress strings and a final markdown report.

    NOTE(review): the previous body referenced undefined names (text, count,
    gettext, word) and raised NameError on first call; this restores the
    intended implementation that was kept commented out below it.
    """
    yield '文档结构检查----启动中'
    # One expected heading per template line.
    with open("ce模板.txt", "r",encoding='utf-8') as f:
        gettext = f.readlines()
    count=0
    reserr = []  # template lines with no matching heading
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
        print(e)
        yield "文档无法打开,请检查文档内容"
        return
    for text in gettext:
        count+=1
        prompt = f'''
        \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
        '''
        xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        strword = "\n".join(word)+prompt+xushang
        messages = [{'role': 'user', 'content': [{'text':strword}]}]
        runList = []
        cishu = 0
        for rsp in bot.run(messages):
            runList.append(rsp)
        # Only the last streamed response is complete.
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        print(parsed_data)
        if(parsed_data["answer"]=="不存在"):
            reserr.append(text)
    resInfo="文档结构存在异常:<br>"
    if(len(reserr)>0):
        for i in reserr:
            resInfo+=f"**{i}**<br>"
        yield resInfo
    else:
        yield "文档结构未发现异常"
import logging
# Create a dedicated logger for this script.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.DEBUG)
# Console handler at DEBUG level.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# Attach a formatter to the handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# Register the handler on the logger.
logger.addHandler(ch)
try:
    # Emit one message per severity level (smoke test of the logging setup).
    logger.debug('这是一个调试消息')
    logger.info('这是一个信息消息')
    logger.warning('这是一个警告消息')
    logger.error('这是一个错误消息')
    logger.critical('这是一个致命错误消息')
except Exception as e:
    logger.warning(e)

712
json_repair.py

@ -0,0 +1,712 @@
"""
This module will parse the JSON file following the BNF definition:
<json> ::= <container>
<primitive> ::= <number> | <string> | <boolean>
; Where:
; <number> is a valid real number expressed in one of a number of given formats
; <string> is a string of valid characters enclosed in quotes
; <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
<container> ::= <object> | <array>
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
If something is wrong (a missing parantheses or quotes for example) it will use a few simple heuristics to fix the JSON string:
- Add the missing parentheses if the parser believes that the array or object should be closed
- Quote strings or add missing single quotes
- Adjust whitespaces and remove line breaks
All supported use cases are in the unit tests
"""
import os
import json
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
class StringFileWrapper:
    """Expose a text file descriptor through a read-only, string-like
    indexing interface so the parser can treat files and strings alike."""

    def __init__(self, fd: TextIO) -> None:
        self.fd = fd
        # Total length, computed lazily on the first len() call.
        self.length: int = 0

    def __getitem__(self, index: Union[int, slice]) -> str:
        if isinstance(index, slice):
            # Read the requested window, leaving the cursor at its start.
            self.fd.seek(index.start)
            chunk = self.fd.read(index.stop - index.start)
            self.fd.seek(index.start)
            return chunk
        self.fd.seek(index)
        return self.fd.read(1)

    def __len__(self) -> int:
        if self.length < 1:
            # Remember the cursor, measure the file, then restore the cursor.
            saved = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(saved)
        return self.length
class LoggerConfig:
    """Plain holder for the parser's logging state (no behavior of its own)."""

    def __init__(self, log_level: Optional[str]):
        # Collected log entries appended by the parser.
        self.log: List[Dict[str, str]] = []
        # Number of context characters captured around a logged position.
        self.window: int = 10
        # "none" disables logging entirely.
        self.log_level: str = log_level if log_level else "none"
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
class JSONParser:
    def __init__(
        self,
        json_str: Union[str, StringFileWrapper],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
    ) -> None:
        """Initialize the parser over a string or an open text file.

        Args:
            json_str: the (possibly broken) JSON text to parse.
            json_fd: alternatively, an open file descriptor; when given it
                takes precedence over json_str.
            logging: when truthy, repair decisions are recorded in self.logger.
        """
        # The string to parse
        self.json_str = json_str
        # Alternatively, the file description with a json file in it
        if json_fd:
            # This is a trick we do to treat the file wrapper as an array
            self.json_str = StringFileWrapper(json_fd)
        # Index is our iterator that will keep track of which character we are looking at right now
        self.index: int = 0
        # Stack of parse contexts ("object_key" / "object_value" / "array"),
        # used to manage the special cases of missing quotes in key or value
        self.context: list[str] = []
        # Use this to log the activity, but only if logging is active
        self.logger = LoggerConfig(log_level="info" if logging else None)
    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """Parse the whole input and return the repaired value.

        When more JSON elements follow the first one, they are collected into
        a list. With logging enabled, returns (value, log) instead.
        """
        json = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            json = [json]
            last_index = self.index
            while self.index < len(self.json_str):
                j = self.parse_json()
                if j != "":
                    json.append(j)
                # Guard against an infinite loop when no progress was made.
                if self.index == last_index:
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array
            if len(json) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                json = json[0]
        if self.logger.log_level == "none":
            return json
        else:
            return json, self.logger.log
    def parse_json(
        self,
    ) -> JSONReturnType:
        """Dispatch to the right sub-parser based on the next significant character."""
        while True:
            char = self.get_char_at()
            # This parser will ignore any basic element (string or number) that is not inside an array or object
            is_in_context = len(self.context) > 0
            # False means that we are at the end of the string provided
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # there can be an edge case in which a key is empty and at the end of an object
            # like "key": }. We return an empty string here to close the object properly
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote (or a bare literal)
            # NOTE(review): the "" entry below looks like a lost Unicode-quote
            # character from this source — confirm against the original file.
            elif is_in_context and (char in ['"', "'", ""] or char.isalpha()):
                return self.parse_string()
            # <number> starts with [0-9] or minus
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # If everything else fails, we just ignore and move on
            else:
                self.index += 1
    def parse_object(self) -> Dict[str, Any]:
        """Parse an <object>, repairing missing keys, colons and quotes as needed."""
        # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
        obj = {}
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "}") != "}":
            # This is what we expect to find:
            # <member> ::= <string> ': ' <json>
            # Skip filler whitespaces
            self.skip_whitespaces_at()
            # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                    "info",
                )
                self.index += 1
            # We are now searching for they string key
            # Context is used in the string parser to manage the lack of quotes
            self.set_context("object_key")
            self.skip_whitespaces_at()
            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())
                if key != "" or (key == "" and self.get_char_at() == ":"):
                    # If the string is empty but there is a object divider, we are done here
                    break
            self.skip_whitespaces_at()
            # We reached the end here
            if (self.get_char_at() or "}") == "}":
                continue
            self.skip_whitespaces_at()
            # An extreme case of missing ":" after a key
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                    "info",
                )
            self.index += 1
            self.reset_context()
            self.set_context("object_value")
            # The value can be any valid json
            value = self.parse_json()
            # Reset context since our job is done
            self.reset_context()
            obj[key] = value
            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1
            # Remove trailing spaces
            self.skip_whitespaces_at()
        self.index += 1
        return obj
    def parse_array(self) -> List[Any]:
        """Parse an <array>, tolerating stray '...' entries and a missing ']'."""
        # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
        arr = []
        self.set_context("array")
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()
            # It is possible that parse_json() returns nothing valid, so we stop
            if value == "":
                break
            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it", "info"
                )
            else:
                arr.append(value)
            # skip over whitespace after a value but before closing ]
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()
        # Especially at the end of an LLM generated json you might miss the last "]"
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back", "info"
            )
            self.index -= 1
        self.index += 1
        self.reset_context()
        return arr
    def parse_string(self) -> Union[str, bool, None]:
        """Parse a <string> (or bare literal), repairing missing/doubled quotes.

        Somehow all weird cases in an invalid JSON happen to be resolved in
        this function, so be careful here: the statement order encodes the
        repair heuristics.

        NOTE(review): several empty-string literals below (e.g. in the quote
        lists and the "" comparisons) look like lost Unicode curly-quote
        characters from this source — confirm against the original file.
        """
        # <string> is a string of valid characters enclosed in quotes
        # i.e. { name: "John" }
        # Flag to manage corner cases related to missing starting quote
        missing_quotes = False
        doubled_quotes = False
        lstring_delimiter = rstring_delimiter = '"'
        char = self.get_char_at()
        # A valid string can only start with a valid quote or, in our case, with a literal
        while char and char not in ['"', "'", ""] and not char.isalnum():
            self.index += 1
            char = self.get_char_at()
        if not char:
            # This is an empty string
            return ""
        # Ensuring we use the right delimiter
        if char == "'":
            lstring_delimiter = rstring_delimiter = "'"
        elif char == "":
            lstring_delimiter = ""
            rstring_delimiter = ""
        elif char.isalnum():
            # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
            # But remember, object keys are only of type string
            if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
                value = self.parse_boolean_or_null()
                if value != "":
                    return value
                self.log(
                    "While parsing a string, we found a literal instead of a quote",
                    "info",
                )
            self.log(
                "While parsing a string, we found no starting quote. Will add the quote back",
                "info",
            )
            missing_quotes = True
        if not missing_quotes:
            self.index += 1
        # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
        if self.get_char_at() == lstring_delimiter:
            # If it's an empty key, this was easy
            if self.get_context() == "object_key" and self.get_char_at(1) == ":":
                self.index += 1
                return ""
            # Find the next delimiter
            i = 1
            next_c = self.get_char_at(i)
            while next_c and next_c != rstring_delimiter:
                i += 1
                next_c = self.get_char_at(i)
            # Now check that the next character is also a delimiter to ensure that we have "".....""
            # In that case we ignore this rstring delimiter
            if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a valid starting doubled quote, ignoring it",
                    "info",
                )
                doubled_quotes = True
                self.index += 1
            else:
                # Ok this is not a doubled quote, check if this is an empty string or not
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c.isspace():
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c not in [",", "]", "}"]:
                    self.log(
                        "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                        "info",
                    )
                    self.index += 1
        # Initialize our return value
        string_acc = ""
        # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
        # In that case we need to use the ":|,|}" characters as terminators of the string
        # So this will stop if:
        # * It finds a closing quote
        # * It iterated over the entire sequence
        # * If we are fixing missing quotes in an object, when it finds the special terminators
        char = self.get_char_at()
        while char and char != rstring_delimiter:
            if missing_quotes:
                if self.get_context() == "object_key" and (
                    char == ":" or char.isspace()
                ):
                    self.log(
                        "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                        "info",
                    )
                    break
                elif self.get_context() == "object_value" and char in [",", "}"]:
                    rstring_delimiter_missing = True
                    # check if this is a case in which the closing comma is NOT missing instead
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c != rstring_delimiter:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        i += 1
                        next_c = self.get_char_at(i)
                        # found a delimiter, now we need to check that is followed strictly by a comma or brace
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c and next_c in [",", "}"]:
                            rstring_delimiter_missing = False
                    if rstring_delimiter_missing:
                        self.log(
                            "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                            "info",
                        )
                        break
            string_acc += char
            self.index += 1
            char = self.get_char_at()
            if char and len(string_acc) > 0 and string_acc[-1] == "\\":
                # This is a special case, if people use real strings this might happen
                self.log("Found a stray escape sequence, normalizing it", "info")
                string_acc = string_acc[:-1]
                if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                    escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                    string_acc += escape_seqs.get(char, char) or char
                    self.index += 1
                    char = self.get_char_at()
            # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
            if char == rstring_delimiter:
                # Special case here, in case of double quotes one after another
                if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                    self.log(
                        "While parsing a string, we found a doubled quote, ignoring it",
                        "info",
                    )
                    self.index += 1
                elif missing_quotes and self.get_context() == "object_value":
                    # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        # We found a quote, now let's make sure there's a ":" following
                        i += 1
                        next_c = self.get_char_at(i)
                        # found a delimiter, now we need to check that is followed strictly by a comma or brace
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c and next_c == ":":
                            # Reset the cursor
                            self.index -= 1
                            char = self.get_char_at()
                            self.log(
                                "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                                "info",
                            )
                            break
                else:
                    # Check if eventually there is a rstring delimiter, otherwise we bail
                    i = 1
                    next_c = self.get_char_at(i)
                    check_comma_in_object_value = True
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                        # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                        if check_comma_in_object_value and next_c.isalpha():
                            check_comma_in_object_value = False
                        # If we are in an object context, let's check for the right delimiters
                        if (
                            ("object_key" in self.context and next_c in [":", "}"])
                            or ("object_value" in self.context and next_c == "}")
                            or ("array" in self.context and next_c in ["]", ","])
                            or (
                                check_comma_in_object_value
                                and self.get_context() == "object_value"
                                and next_c == ","
                            )
                        ):
                            break
                        i += 1
                        next_c = self.get_char_at(i)
                    # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                    if next_c == "," and self.get_context() == "object_value":
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != rstring_delimiter:
                            i += 1
                            next_c = self.get_char_at(i)
                        # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c == "}":
                            # OK this is valid then
                            self.log(
                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                                "info",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()
                    elif next_c == rstring_delimiter:
                        if self.get_context() == "object_value":
                            # But this might not be it! This could be just a missing comma
                            # We found a delimiter and we need to check if this is a key
                            # so find a rstring_delimiter and a colon after
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != rstring_delimiter:
                                i += 1
                                next_c = self.get_char_at(i)
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != ":":
                                if next_c in [
                                    lstring_delimiter,
                                    rstring_delimiter,
                                    ",",
                                ]:
                                    break
                                i += 1
                                next_c = self.get_char_at(i)
                            # Only if we fail to find a ':' then we know this is misplaced quote
                            if next_c != ":":
                                self.log(
                                    "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                    "info",
                                )
                                string_acc += str(char)
                                self.index += 1
                                char = self.get_char_at()
        if (
            char
            and missing_quotes
            and self.get_context() == "object_key"
            and char.isspace()
        ):
            self.log(
                "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
                "info",
            )
            self.skip_whitespaces_at()
            if self.get_char_at() not in [":", ","]:
                return ""
        # A fallout of the previous special case in the while loop,
        # we need to update the index only if we had a closing quote
        if char != rstring_delimiter:
            self.log(
                "While parsing a string, we missed the closing quote, ignoring",
                "info",
            )
        else:
            self.index += 1
        return string_acc.rstrip()
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
# <number> is a valid real number expressed in one of a number of given formats
number_str = ""
number_chars = set("0123456789-.eE/,")
char = self.get_char_at()
is_array = self.get_context() == "array"
while char and char in number_chars and (char != "," or not is_array):
number_str += char
self.index += 1
char = self.get_char_at()
if len(number_str) > 1 and number_str[-1] in "-eE/,":
# The number ends with a non valid character for a number/currency, rolling back one
number_str = number_str[:-1]
self.index -= 1
try:
if "," in number_str:
return str(number_str)
if "." in number_str or "e" in number_str or "E" in number_str:
return float(number_str)
elif number_str == "-":
# If there is a stray "-" this will throw an exception, throw away this character
return self.parse_json()
else:
return int(number_str)
except ValueError:
return number_str
def parse_boolean_or_null(self) -> Union[bool, str, None]:
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
starting_index = self.index
char = (self.get_char_at() or "").lower()
value: Optional[Tuple[str, Optional[bool]]]
if char == "t":
value = ("true", True)
elif char == "f":
value = ("false", False)
elif char == "n":
value = ("null", None)
if value:
i = 0
while char and i < len(value[0]) and char == value[0][i]:
i += 1
self.index += 1
char = (self.get_char_at() or "").lower()
if i == len(value[0]):
return value[1]
# If nothing works reset the index before returning
self.index = starting_index
return ""
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
try:
return self.json_str[self.index + count]
except IndexError:
return False
def skip_whitespaces_at(self) -> None:
"""
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
"""
try:
char = self.json_str[self.index]
except IndexError:
return
while char.isspace():
self.index += 1
try:
char = self.json_str[self.index]
except IndexError:
return
def set_context(self, value: str) -> None:
# If a value is provided update the context variable and save in stack
if value:
self.context.append(value)
    def reset_context(self) -> None:
        # Pop the most recently pushed parsing context off the stack.
        self.context.pop()
    def get_context(self) -> str:
        # The current (innermost) parsing context, e.g. "object_key" or "array".
        return self.context[-1]
def log(self, text: str, level: str) -> None:
if level == self.logger.log_level:
context = ""
start = max(self.index - self.logger.window, 0)
end = min(self.index + self.logger.window, len(self.json_str))
context = self.json_str[start:end]
self.logger.log.append(
{
"text": text,
"context": context,
}
)
def repair_json(
    json_str: str = "",
    return_objects: bool = False,
    skip_json_loads: bool = False,
    logging: bool = False,
    json_fd: Optional[TextIO] = None,
    ensure_ascii: bool = True,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Decode a JSON string, repairing it when the strict parse fails.

    Returns the fixed JSON string by default.
    With `return_objects=True` the decoded data structure is returned instead.
    With `skip_json_loads=True` the built-in json.loads() is never attempted.
    With `logging=True` a tuple of (repaired json, list of repair actions) is returned.
    """
    parser = JSONParser(json_str, json_fd, logging)
    if skip_json_loads:
        parsed_json = parser.parse()
    else:
        # Fast path: hand the input to the stdlib first and only fall back
        # to the repairing parser when strict decoding rejects it.
        try:
            if json_fd:
                parsed_json = json.load(json_fd)
            else:
                parsed_json = json.loads(json_str)
        except json.JSONDecodeError:
            parsed_json = parser.parse()
    # Returning the decoded object lets this library serve as a drop-in
    # replacement for the json module.
    if return_objects or logging:
        return parsed_json
    return json.dumps(parsed_json, ensure_ascii=ensure_ascii)
def loads(
    json_str: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Drop-in replacement for `json.loads()` that repairs broken JSON.

    Thin wrapper around `repair_json()` with `return_objects=True`.
    """
    return repair_json(json_str, True, skip_json_loads, logging)
def load(
    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Drop-in replacement for `json.load()` that repairs broken JSON.

    Thin wrapper around `repair_json()` with `json_fd=fd` and
    `return_objects=True`.
    """
    return repair_json("", True, skip_json_loads, logging, fd)
def from_file(
    filename: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Wrapper around `load()` that accepts a filename instead of a file object.

    Uses a context manager so the file is closed even when parsing raises
    (the previous version leaked the descriptor on error).
    """
    with open(filename) as fd:
        return load(fd, skip_json_loads, logging)

161
main.py

@ -0,0 +1,161 @@
from flask import Flask, request, jsonify,Response
import os
from checkPlaceName import checkPlaceName
from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName
from checkDocumentError import getDocumentError
from checkTitleName import checkTitleName
from flask_cors import CORS
import qwen_agenttext
# Flask application serving the document-check SSE endpoints.
app = Flask(__name__)
cros = CORS(app)  # enable cross-origin requests for every route
UPLOAD_FOLDER = 'uploads'
# NOTE(review): usableTag appears unused in this file — confirm before removing.
usableTag=[0,0,0,0,0,0,0,0]
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a single multipart file upload and store it in UPLOAD_FOLDER.

    Returns 400 when the request carries no file part or no usable filename,
    200 on success.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if file:
        # Strip any directory components so a crafted filename such as
        # "../../evil" cannot escape the upload folder (path traversal).
        filename = os.path.basename(file.filename)
        if not filename:
            return jsonify({"error": "No selected file"}), 400
        file.save(os.path.join(UPLOAD_FOLDER, filename))
        return jsonify({"message": "File uploaded successfully"}), 200
    # Previously this path fell through and returned None (HTTP 500).
    return jsonify({"error": "No selected file"}), 400
@app.route('/stream', methods=["GET", "POST"])
def stream_numbers():
    """SSE endpoint: stream the agent's reply for the query in `context`."""
    context = request.args.get('context')
    # Standard SSE headers; proxy buffering is disabled so events flush
    # promptly, and CORS headers allow browser clients from any origin.
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(qwen_agenttext.getxinx(context), headers=headers)
@app.route('/sse/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
    """SSE endpoint: stream progress of the repeated-text check for a file."""
    filename = request.args.get('filename')
    def generate_checkRepeatText(filename):
        # Monotonically increasing SSE event id. The original never
        # incremented it (and shadowed the builtin `id`), so every event
        # was sent as "id: 1".
        event_id = 0
        try:
            for chunk in checkRepeatText(filename):
                event_id += 1
                yield f"id: {event_id}\n"
                yield f"event: checkRepeatText\n"
                yield f"data: {chunk}\n\n"  # push one progress message
        except Exception:
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkRepeatText\n"
            yield f"data: **程序出现异常**\n\n"  # signal failure to the client
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkRepeatText(filename), headers=headers)
@app.route('/sse/checkPlaceName', methods=['GET'])
def checkPlaceNameWebSse():
    """SSE endpoint: stream progress of the place-name check for a file."""
    filename = request.args.get('filename')
    def generate_checkPlaceName(filename):
        # Fix: increment the SSE event id (it was stuck at 1 before, and
        # the old name shadowed the builtin `id`).
        event_id = 0
        for chunk in checkPlaceName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkPlaceName\n"
            yield f"data: {chunk}\n\n"  # push one progress message
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkPlaceName(filename), headers=headers)
@app.route('/sse/checkCompanyName', methods=['GET'])
def checkCompanyNameWebSse():
    """SSE endpoint: stream progress of the company-name check for a file."""
    filename = request.args.get('filename')
    def generate_checkCompanyName(filename):
        # Fix: increment the SSE event id (it was stuck at 1 before).
        event_id = 0
        for chunk in checkCompanyName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkCompanyName\n"
            yield f"data: {chunk}\n\n"  # push one progress message
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkCompanyName(filename), headers=headers)
@app.route('/sse/checkDocumentErrorWeb', methods=['GET'])
def checkDocumentErrorWebSse():
    """SSE endpoint: stream progress of the typo/error check for a file."""
    filename = request.args.get('filename')
    def generate_checkDocumentError(filename):
        # Fix: increment the SSE event id (it was stuck at 1 before).
        event_id = 0
        for chunk in getDocumentError(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: getDocumentError\n"
            yield f"data: {chunk}\n\n"  # push one progress message
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkDocumentError(filename), headers=headers)
@app.route('/sse/checkTitleName', methods=['GET'])
def checkTitleNameWebSse():
    """SSE endpoint: stream progress of the title-name check for a file."""
    filename = request.args.get('filename')
    def generate_checkTitleName(filename):
        # Fix: increment the SSE event id (it was stuck at 1 before).
        event_id = 0
        for chunk in checkTitleName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkTitleName\n"
            yield f"data: {chunk}\n\n"  # push one progress message
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkTitleName(filename), headers=headers)
if __name__ == '__main__':
    # Bind to all interfaces on port 80 (a privileged port — requires root
    # or a reverse proxy in front).
    app.run(host="0.0.0.0",port=80)

132
qwen_agenttext.py

@ -0,0 +1,132 @@
import pprint
import urllib.parse
import json5
from qwen_agent.agents import Assistant
from qwen_agent.tools.base import BaseTool, register_tool
import requests
import baidusearch
import tqdm
# 使用示例
# Step 1 (Optional): Add a custom tool named `my_image_gen`.
@register_tool('my_image_gen')
class MyImageGen(BaseTool):
    """Agent tool: turn an English text description into an image URL."""
    # Shown to the agent so it knows what this tool does.
    description = 'AI painting (image generation) service, input text description, and return the image URL drawn based on text information.'
    # Input schema advertised to the agent.
    parameters = [{
        'name': 'prompt',
        'type': 'string',
        'description': 'Detailed description of the desired image content, in English',
        'required': True
    }]

    def call(self, params: str, **kwargs) -> str:
        # `params` is the JSON argument blob produced by the LLM.
        description = json5.loads(params)['prompt']
        # URL-encode so the description is safe to embed in a request path.
        encoded = urllib.parse.quote(description)
        payload = {'image_url': f'https://image.pollinations.ai/prompt/{encoded}'}
        return json5.dumps(payload, ensure_ascii=False)
@register_tool('chaxun')
class MyImageGen(BaseTool):
    # NOTE(review): this class reuses the name `MyImageGen`, shadowing the
    # image-generation tool above at module level. The framework registers
    # tools by the decorator name ('chaxun'), so it still works, but the
    # class deserves its own name — confirm nothing imports it before renaming.
    # Shown to the agent: "if you don't know the answer, use this tool to search the web".
    description = '如果你不会,请使用此工具进行联网查询'
    # Input schema advertised to the agent.
    parameters = [{
        'name': 'prompt',
        'type': 'string',
        'description': '请你描述需要提问的信息,以此帮助你了解更多的信息',
        'required': True
    }]
    def call(self, params: str, **kwargs) -> str:
        # `params` are the arguments generated by the LLM agent.
        prompt = json5.loads(params)['prompt']
        # URL-encode the query before handing it to the search helper.
        # NOTE(review): baidusearch.search may expect the raw query rather
        # than a URL-encoded one — confirm; encoding here may hurt results.
        prompt = urllib.parse.quote(prompt)
        search_tool = baidusearch.search(prompt, num_results=20)
        print(search_tool)
        # NOTE(review): declared return type is str but this returns whatever
        # baidusearch.search yields — verify against callers.
        return search_tool
# Step 2: Configure the LLM you are using.
# Model configuration: model name, the server hosting it, and/or an api_key.
llm_cfg = {
    # Use the model service provided by DashScope:
    # model: the model name
    # model_server: the server hosting the model
    # api_key: may be set explicitly or taken from the environment
    # SECURITY NOTE(review): a real API key is hard-coded and committed
    # below — rotate it and load it from the DASHSCOPE_API_KEY environment
    # variable instead.
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope', # base_url, also known as api_base
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    # 'api_key': 'YOUR_DASHSCOPE_API_KEY',
    # It will use the `DASHSCOPE_API_KEY' environment variable if 'api_key' is not set here.
    # Use a model service compatible with the OpenAI API, such as vLLM or Ollama:
    # 'model': 'Qwen1.5-7B-Chat',
    # 'model_server': 'http://localhost:8000/v1', # base_url, also known as api_base
    # 'api_key': 'EMPTY',
    # (Optional) LLM hyperparameters for generation:
    'generate_cfg': {
        'top_p': 0.8
    }
}
# Step 3: Create an agent. Here we use the `Assistant` agent as an example, which is capable of using tools and reading files.
# System prompt for the agent (kept in Chinese as it is runtime data).
system_instruction = '''
你是一个乐于助人的助手
收到用户的请求后您应
你应该进行思考判断是否使用工具
如果遇到你不会回答,请使用工具[chaxun]
'''
# Tools the Assistant may call; only the web-search tool is enabled here.
tools = ["chaxun"]  # `code_interpreter` is a built-in tool for executing code.
# Files the assistant could read (disabled).
# files = ['./examples/resource/doc.pdf'] # Give the bot a PDF file to read.
# Instantiate the Assistant used by getxinx() below.
bot = Assistant(llm=llm_cfg,
                system_message=system_instruction,
                function_list=tools,
                # files=files
                )
# Step 4: Run the agent as a chatbot.
messages = []  # This stores the chat history.
def getxinx(context):
    """Run the agent on a single user message, streaming placeholder updates.

    Yields the literal string "请稍等.." once per partial response from the
    bot. NOTE(review): the final answer is never emitted — the streaming
    code that formatted SSE events was commented out in the original;
    confirm whether clients expect real content here.

    Cleanup vs. the original: removed the unused ``event_id`` counter, the
    redundant tuple wrapper around the message dict, and dead commented-out
    code. Behavior toward callers is unchanged.
    """
    history = [{'role': 'user', 'content': context}]
    print(history)
    collected = []  # accumulated partial responses (kept for debugging)
    for partial in bot.run(messages=history):
        collected.append(partial)
        yield "请稍等.."

109
test.py

@ -0,0 +1,109 @@
import time
import json
import math
from flask import Flask,Response,request
from flask_sse import sse
from flask_cors import CORS
import re
import qwen_agenttext
# Experimental Flask app (all routes below are commented out).
app = Flask(__name__)
cros = CORS(app)  # enable CORS on this experimental app
# SSE push helper
# NOTE(review): mid-file import with a stray semicolon — convention is a
# top-of-file import. The device probe's result is discarded; presumably
# this is only a PaddlePaddle availability check — confirm.
import paddle;
paddle.device.get_available_device()
# SSE 推送路由
# @app.route('/register', methods=["GET"])
# def register():
# 获取客户端标识符
# client_id = str(uuid.uuid4())
#
# # 返回 SSE 响应
# return jsonify({"client_id": client_id})
# SSE 推送路由
# @app.route('/sse', methods=['POST'])
# def stream():
# # 获取客户端标识符
# client_id = 1
# print("client_id", client_id)
#
# def aa():
# # 循环发送 SSE 数据
# for i in range(10):
# data = 'Hello, %s!' % client_id + str(i)
# print(data)
# sse.publish(data, channel=client_id, type='message')
# time.sleep(1)
# sse.publish("end", channel=client_id, type='message')
#
# # 返回 SSE 响应
# response = Response(aa(), mimetype='text/event-stream')
# response.headers.add('Cache-Control', 'no-cache')
# response.headers.add('Connection', 'keep-alive')
# response.headers.add('X-Accel-Buffering', 'no')
# return response
#
#
#
# @app.route('/stream' ,methods=["GET", "POST"])
# def stream_numbers():
# context= request.args.get('context')
#
#
# headers = {
# "Content-Type": "text/event-stream",
# "Cache-Control": "no-cache",
# "X-Accel-Buffering": "no",
# "Access-Control-Allow-Origin": "*",
# "Access-Control-Allow-Methods": "GET,POST",
# "Access-Control-Allow-Headers": "x-requested-with,content-type",
# }
# return Response(generate_numbers(),headers=headers)
# def generate_numbers():
# event_id=0
# # for number in range(1, 10):
# # json_data = json.dumps({"number": number})
# # print(json_data)
# # event_id += 1
# # yield f"id: {event_id}\n"
# # yield f"event: time-update\n"
# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送
# json_data = json.dumps({"number": "done"})
# yield f"id: {1}\n"
# yield f"event: time-update\n"
# yield f"data: 34568\n\n" # 发送完成信号
# if __name__ == '__main__':
#
#
# # 读取文件内容
# with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
# gettext = f.read()
# batchNum=20
# sentences = re.split(r'[。\n]', gettext)
# # 去掉空字符
# sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
# # 计算总字符数
# total_chars = len(sentences)
#
# # 计算有多少份
# num_chunks = math.ceil(total_chars / batchNum)
#
# # 按batchNum字为一份进行处理
# chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
#
# # 打印每一份的内容
# for i, chunk in enumerate(chunks):
# print(f"Chunk {i + 1}:")
# print(chunk)
# print("-" * 40)
#
# # 打印总份数
# print(f"Total chunks: {num_chunks}")
# app.run(debug=True,port=80)

BIN
workspace/1.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 420 KiB

BIN
workspace/image14.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 245 KiB

BIN
workspace/image15.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 117 KiB

BIN
workspace/image16.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

BIN
workspace/image17.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

BIN
workspace/image18.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

BIN
workspace/image19.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

BIN
workspace/image20.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

BIN
workspace/tools/code_interpreter/05613c9c-c910-455d-8c8b-62b7dc243b2a.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 211 KiB

BIN
workspace/tools/code_interpreter/1560f103-f2dc-49e3-88c2-35f5d500bc1d.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 916 KiB

BIN
workspace/tools/code_interpreter/4aa3a1fe-7fc2-440f-8bd9-653ee1721776.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 217 KiB

BIN
workspace/tools/code_interpreter/54b7ad57-9c89-4977-b49a-eaf7e60b9656.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 252 KiB

BIN
workspace/tools/code_interpreter/c8cba059-ac85-42b0-b197-1c8e1e7182c9.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 904 KiB

12
workspace/tools/code_interpreter/kernel_connection_file_0eb57682-3a22-44c8-bedb-a4871b813c3c_19796.json

@ -0,0 +1,12 @@
{
"shell_port": 3199,
"iopub_port": 3205,
"stdin_port": 3200,
"control_port": 3201,
"hb_port": 3209,
"ip": "127.0.0.1",
"key": "41711130-ba4287db5e2a6e7b98444c31",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_113f0326-0345-475c-85c1-86af71d668c0_24876.json

@ -0,0 +1,12 @@
{
"shell_port": 36295,
"iopub_port": 36301,
"stdin_port": 36296,
"control_port": 36297,
"hb_port": 36305,
"ip": "127.0.0.1",
"key": "0faec31a-0f91a316abd70cf50f57dbad",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_599899c4-4f00-44c1-bba5-1bcc31eb535c_12240.json

@ -0,0 +1,12 @@
{
"shell_port": 5355,
"iopub_port": 5362,
"stdin_port": 5356,
"control_port": 5358,
"hb_port": 5366,
"ip": "127.0.0.1",
"key": "de89d28a-7beb5da33100363d2c20fd6b",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_a3131ded-afec-43fa-95eb-d2f35548a411_39868.json

@ -0,0 +1,12 @@
{
"shell_port": 3079,
"iopub_port": 3085,
"stdin_port": 3080,
"control_port": 3081,
"hb_port": 3089,
"ip": "127.0.0.1",
"key": "1825b8a3-a33137bc69e3375f26f384a3",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_b4447d65-4542-4bd2-89ff-b33b5fb00ac5_1068.json

@ -0,0 +1,12 @@
{
"shell_port": 36740,
"iopub_port": 36746,
"stdin_port": 36741,
"control_port": 36742,
"hb_port": 36750,
"ip": "127.0.0.1",
"key": "ac6de478-4a3be71d79c2c63da7065148",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_d624f7a6-914d-48c1-b902-4e298f92b671_20484.json

@ -0,0 +1,12 @@
{
"shell_port": 2563,
"iopub_port": 2569,
"stdin_port": 2564,
"control_port": 2565,
"hb_port": 2573,
"ip": "127.0.0.1",
"key": "7e020774-be96933cbe5aaad90c1c9bfc",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

12
workspace/tools/code_interpreter/kernel_connection_file_ec74ca73-6455-4a78-96b1-542747f19a25_39260.json

@ -0,0 +1,12 @@
{
"shell_port": 5840,
"iopub_port": 5846,
"stdin_port": 5841,
"control_port": 5842,
"hb_port": 5850,
"ip": "127.0.0.1",
"key": "e4c27d68-1c3a9dfa16551f35481b05b8",
"transport": "tcp",
"signature_scheme": "hmac-sha256",
"kernel_name": ""
}

3
workspace/tools/code_interpreter/launch_kernel_0eb57682-3a22-44c8-bedb-a4871b813c3c_19796.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_113f0326-0345-475c-85c1-86af71d668c0_24876.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_599899c4-4f00-44c1-bba5-1bcc31eb535c_12240.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_a3131ded-afec-43fa-95eb-d2f35548a411_39868.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_b4447d65-4542-4bd2-89ff-b33b5fb00ac5_1068.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_d624f7a6-914d-48c1-b902-4e298f92b671_20484.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

3
workspace/tools/code_interpreter/launch_kernel_ec74ca73-6455-4a78-96b1-542747f19a25_39260.py

@ -0,0 +1,3 @@
from ipykernel import kernelapp as app
app.launch_new_instance()

BIN
workspace/tools/code_interpreter/temp_image.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 MiB

1
workspace/tools/doc_parser/53dea512c5e030d7ad12f34dceaecc2a3c5bcb058907ae3495d60e5876b079a2_500

File diff suppressed because one or more lines are too long

8699
workspace/tools/simple_doc_parser/53dea512c5e030d7ad12f34dceaecc2a3c5bcb058907ae3495d60e5876b079a2_ori

File diff suppressed because one or more lines are too long

140
代码段存储.py

@ -0,0 +1,140 @@
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time
import math
# NER pipeline used to tag candidate place/organisation names in the text.
tagTask = Taskflow("ner")
# Prompt appended to the collected names when querying the LLM; asks it to
# label each name in a fixed JSON-like format.
# NOTE(review): the prompt lists four options but says "三种选项" (three) — confirm intent.
prompt='''
.上述文本判断地名是否正确你可以使用工具利用互联网查询你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# prompt='''
# .请回答以上问题,
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# 不做过多的解释,严格按回答格式作答;
# '''
# LLM served from a local OpenAI-compatible endpoint.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
# Assistant instance shared by checkPlaceName() below.
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                # description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
#获取全文内容
def getDocxToTextAll(name):
    """Extract every non-empty paragraph of a .docx file to a text file.

    The paragraphs are joined with newlines and written to
    "checkPlaceName.txt" (UTF-8), which downstream checks read back.

    Cleanup vs. the original: removed the unused locals (levelList,
    addStart, levelText, i).

    Args:
        name: path of the .docx document to read.
    """
    document = Document(name)
    words = []
    # Walk the document paragraph by paragraph, keeping non-blank text.
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            words.append(text)
    print("placeNameTask", len(words))
    # Join all paragraphs into one newline-separated string.
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
#得到全文和地名有关的内容
def placeNameTask(text):
    """Scan *text* with the NER pipeline and collect candidate place names.

    Yields progress strings while working and, as the final item, the
    de-duplicated list of detected organisation / world-region entities.
    """
    batch_size = 20
    # Split on Chinese full stops and newlines, dropping blank fragments.
    pieces = [s.strip() for s in re.split(r'[。\n]', text) if s.strip()]
    total = len(pieces)
    chunk_count = math.ceil(total / batch_size)
    chunks = [pieces[start:start + batch_size] for start in range(0, total, batch_size)]
    found = []
    for idx, chunk in enumerate(chunks):
        yield f"文档地名检查---文档解析进度:{idx + 1}/{chunk_count}"
        print(chunk)
        tags = tagTask(".".join(chunk))
        previous_was_place = False
        for zuhe in tags:
            token, label = zuhe[0], zuhe[1]
            is_place = label.find("组织机构类") >= 0 or label.find("世界地区类") >= 0
            if is_place:
                cleaned = token.replace("\n", "")
                if previous_was_place:
                    # Consecutive entity tokens belong to one name: merge
                    # with the previously collected fragment.
                    found[-1] = found[-1] + cleaned
                else:
                    found.append(cleaned)
                previous_was_place = True
            else:
                previous_was_place = False
        print("-" * 40)
    yield "文档地名检查---文档解析完成"
    # Preserve first-seen order while removing duplicates.
    yield list(dict.fromkeys(found))
#主方法
def checkPlaceName(filename):
    """Top-level place-name audit for a .docx document.

    Streams human-readable progress strings (SSE-friendly) and finally a
    report of suspicious place names, if any.

    Robustness fixes vs. the original: a malformed LLM entry without the
    "回答" key no longer raises KeyError, and a keyword that does not occur
    verbatim in the text no longer raises IndexError.
    """
    yield f"文档地名检查---开始处理文档..."
    getDocxToTextAll(filename)
    with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档地名检查---开始解析文档..."
    final_list = []
    for item in placeNameTask(gettext):
        if isinstance(item, str):
            yield item
        else:
            final_list = item  # the generator's last item is the name list
    propnStr = ",".join(final_list)
    print("placeNameTask", propnStr)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档地名检查---结果生成中..."
    cishu = 0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
        yield "文档地名检查---结果生成中" + '.' * cishu
        cishu += 1
    data = runList[len(runList) - 1][0]["content"]
    print("placeNameTask", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    # Keep only entries the model flagged as wrong; .get guards malformed ones.
    error_places = [place for place in parsed_data
                    if isinstance(place, dict) and place.get('回答') == '错误']
    print("placeNameTask", error_places)
    returnInfo = "发现异常地名<br />"
    if len(error_places) > 0:
        for t in error_places:
            keyword = t.get('placeName', '')
            # Find a paragraph containing the keyword to quote as context.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            # Guard: the keyword may not appear verbatim in the text.
            source_line = paragraphs[0] if paragraphs else keyword
            yuanwen = source_line.replace(keyword, f"**{keyword}**").replace("\n", "")
            returnInfo += "原文:" + yuanwen + "<br />出现异常地名:**" + keyword + "**!请注意" + "<br />"
        yield returnInfo
        print(returnInfo)
    else:
        yield "**未发现发现异常地名**"

118
文档一二级标题识别与提取.py

@ -0,0 +1,118 @@
import re
import time
from docx import Document
from pprint import pprint
# from paddlenlp import Taskflow
#
# similarity = Taskflow("text_similarity", truncation=True, max_length=102400)
def getOutlineLevel(inputXml):
    """Extract the number from the first `<w:outlineLvl w:val="number"/>` tag.

    Args:
        inputXml: raw paragraph/style XML as a string.
    Returns:
        The outline level as a string of digits (e.g. "0" for a level-one
        heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    # Slice out the tag, then pull the first run of digits from it.
    tag = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid-escape-sequence warning "\d" produced.
    return re.search(r"\d+", tag).group()
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for ordinary text.

    "0" means a level-one heading, "1" level-two, "2" level-three.
    """
    # Blank lines can never be headings.
    if not paragraph.text.strip():
        return None
    # Case 1: the outline level is set directly on the paragraph XML.
    xml = paragraph._p.xml
    if '<w:outlineLvl' in xml:
        return getOutlineLevel(xml)
    # Case 2: the level is inherited through the style chain; walk up the
    # base styles until one declares an outline level.
    style = paragraph.style
    while style is not None:
        if '<w:outlineLvl' in style.element.xml:
            return getOutlineLevel(style.element.xml)
        style = style.base_style
    # Neither the paragraph nor any style declares a level.
    return None
def getDocxToText12biaoti(name):
    """Collect numbered level 1-3 headings from a .docx file.

    Headings are numbered "1:", "1.1", "1.1.1" style; level-one headings
    containing "附件" (attachments) are skipped. The numbered list is
    written to "ce1.txt" (UTF-8) and also returned.

    Cleanup vs. the original: removed the unused locals (levelList,
    levelText, i) and replaced the placeholder exception message
    "I know python!" with a meaningful one.

    Raises:
        Exception: when the document contains no recognizable headings.
    """
    document = Document(name)
    words = []
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():  # skip blank paragraphs
            continue
        # isTitle() yields the outline level of the paragraph, if any.
        level = isTitle(paragraph)
        if level == "0":
            firstTitle += 1
            secondTitle = 0
            if text.find("附件") >= 0:
                continue  # skip attachment chapters
            words.append("{}:".format(firstTitle) + text)
        elif level == "1":
            secondTitle += 1
            sanjiTitle = 0
            words.append("{}.{}".format(firstTitle, secondTitle) + text)
        elif level == "2":
            sanjiTitle += 1
            words.append("{}.{}.{}".format(firstTitle, secondTitle, sanjiTitle) + text)
    print(len(words))
    if len(words) == 0:
        raise Exception("no headings recognized in document: {}".format(name))
    text = '\n'.join(words)
    with open("ce1.txt", 'w', encoding="utf-8") as txt_file:
        txt_file.write(text)
    return words
# Scaffolding for comparing the current document's headings against a
# template (the comparison loop below is disabled).
mobanList=[]
dangqianList=[]
errorList =[]
# Read the template headings from a txt file (disabled).
# with open("ce模板.txt", 'r',encoding="utf-8") as txt_file:
#     for i in txt_file:
#         i=re.sub(r'[\t\n]', '', i)
#         mobanList.append(i)
# pprint(mobanList)
# dangqianList=getDocxToText12biaoti("1.docx")
# if len(dangqianList)!=len(mobanList):
#     print("标题数量与模板不一致")
# for num in range(len(mobanList)):
#     moban = mobanList[num]
#     dangqian= dangqianList[num]
#     fenshu=similarity([[dangqian,moban]])
#     pprint(fenshu)
#     if (fenshu[0]["similarity"]<0.85):
#         errorList.append(dangqianList)
# getDocxToText12biaoti("1.docx")
# pprint(errorList)
# Prompt template asking the LLM whether the outline covers a given topic.
prompt = '''{}这是文档大纲,根据大纲分析文档中是否有{}这块内容的描述,若不存在请回答不存在
'''
dagang ="1"
biaozhun="2"
print(prompt.format(dagang, biaozhun))

282
文档图片提取.py

@ -0,0 +1,282 @@
import re
import os
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.parts.image import ImagePart
from qwen_agent.agents import Assistant
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
import shutil
import re
import json_repair
import uuid
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the number from the first `<w:outlineLvl w:val="number"/>` tag.

    Args:
        inputXml: raw paragraph/style XML as a string.
    Returns:
        The outline level as a string of digits (e.g. "0" for a level-one
        heading).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    # Slice out the tag, then pull the first run of digits from it.
    tag = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid-escape-sequence warning "\d" produced.
    return re.search(r"\d+", tag).group()
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for ordinary text.

    "0" means a level-one heading, "1" level-two, "2" level-three.
    """
    # Blank lines can never be headings.
    if not paragraph.text.strip():
        return None
    # Case 1: the outline level is set directly on the paragraph XML.
    xml = paragraph._p.xml
    if '<w:outlineLvl' in xml:
        return getOutlineLevel(xml)
    # Case 2: the level is inherited through the style chain; walk up the
    # base styles until one declares an outline level.
    style = paragraph.style
    while style is not None:
        if '<w:outlineLvl' in style.element.xml:
            return getOutlineLevel(style.element.xml)
        style = style.base_style
    # Neither the paragraph nor any style declares a level.
    return None
# 该行只能有一个图片
def is_image(graph: Paragraph, doc: Document):
    """Return True when the paragraph embeds at least one real image part."""
    # Resolve each pic element's relationship id to its part, then check type.
    for pic in graph._element.xpath('.//pic:pic'):
        for rel_id in pic.xpath('.//a:blip/@r:embed'):
            if isinstance(doc.part.related_parts[rel_id], ImagePart):
                return True
    return False
# 获取图片(该行只能有一个图片)
def get_ImagePart(graph: Paragraph, doc: Document):
    """Return the first embedded ImagePart of the paragraph, or None."""
    # Resolve each pic element's relationship id to its part, then check type.
    for pic in graph._element.xpath('.//pic:pic'):
        for rel_id in pic.xpath('.//a:blip/@r:embed'):
            candidate = doc.part.related_parts[rel_id]
            if isinstance(candidate, ImagePart):
                return candidate
    return None
# Locate the heading of the target chapter.
def findTitleName(docxPath):
    """Generator: list the level-1 headings of `docxPath` and ask the LLM
    which chapter holds the detailed-design content.

    Yields a progress message first, then either the matching heading text
    or a fixed "not found" message that callers compare against literally.
    """
    yield '文档图片信息检查----检查是否存在详细设计方案'
    document = docx.Document(docxPath)
    # Walk the document paragraph by paragraph, numbering headings per level.
    titleWords = []
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        # Heading level of this paragraph (see isTitle()).
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            level = isTitle(paragraph)
            if level == "0":
                firstTitle += 1
                secondTitle = 0
                # Attachment chapters ("附件") are not candidates.
                if (text.find("附件") >= 0):
                    continue
                # NOTE(review): "一级标题:".format(firstTitle) has no placeholder,
                # so the chapter number is silently dropped — confirm intent.
                titleWords.append("一级标题:".format(firstTitle) + text)
            elif level == "1":
                # Level-2 headings are counted but not sent to the LLM.
                secondTitle += 1
                sanjiTitle = 0
            elif level == "2":
                # Level-3 headings are counted but not sent to the LLM.
                sanjiTitle += 1
    findTitleName_llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
                                  name='Assistant',
                                  )
    # Prompt: pick the most plan-related level-1 heading; the model must answer
    # with one of two fixed JSON shapes ({"name":...,"answer":"存在"/"不存在"}).
    prompt = '''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案详细建设方案为最相关的优先选择
类似设计方案服务方案建设方案为次相关次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个{"name":"一级标题名称","answer":"存在"}{"name":"","answer":"不存在"}不做过多的解释,严格按回答格式作答
'''
    messages = [({'role': 'user', 'content': "\n".join(titleWords) + prompt})]
    runList = []
    for rsp in findTitleName_bot.run(messages):
        runList.append(rsp)
    # The last streamed chunk carries the full answer; strip backticks then
    # repair the (possibly malformed) JSON.
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    print(parsed_data)
    if (parsed_data["answer"] == "存在"):
        print("存在", parsed_data["name"])
        yield parsed_data["name"]
    else:
        print("不存在", parsed_data["name"])
        yield "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"
def saveImage(fileName, titleName, imagePath):
    """Save every image found under the level-1 heading `titleName` into `imagePath`.

    Image files are named "<n>级标题-<heading>_<original name>" after the most
    recent heading preceding the image.

    Args:
        fileName: path of the .docx document.
        titleName: text of the level-1 heading whose images are extracted.
        imagePath: existing directory to write the image files into.
    """
    firstName = ""  # text of the current level-1 heading
    # Fix: initialise so an image appearing before any heading cannot raise
    # UnboundLocalError when building img_name below.
    levelText = ""
    doc = docx.Document(fileName)
    for paragraph in doc.paragraphs:
        text = paragraph.text
        if text.strip():  # non-empty: heading or body text
            level = isTitle(paragraph)
            if level == "0":
                firstName = text
                print(text)
            if level:
                levelText = f"{int(level) + 1}级标题-" + text
        else:
            # Empty text: possibly a table or an inline image.
            r = is_image(paragraph, doc)
            if r and firstName == titleName:
                part = get_ImagePart(paragraph, doc)
                img_name = levelText + "_" + os.path.basename(part.partname)
                with open(f'{imagePath}/{img_name}', "wb") as f:
                    f.write(part.blob)
# After images are saved, upload them to the LLM for analysis.
def checkImageText(filename):
    """Generator: check whether images in the design chapter match the document.

    Yields human-readable progress/result messages. Extracted images are
    written to a per-run temporary directory which is removed at the end.
    """
    llm_cfg_vl = {
        'model': "qwen-vl-max",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    # Vision model: extracts the information contained in each image.
    botImage = Assistant(llm=llm_cfg_vl,
                         name='Assistant',
                         )
    llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    # Text model: decides whether the extracted info is covered by the document.
    bot = Assistant(llm=llm_cfg,
                    name='Assistant',
                    )
    for titleName in findTitleName(filename):
        yield titleName
        if (titleName != "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"):
            yield "文档图片信息检查----文档内容解析中"
            imagePath = "Image" + str(uuid.uuid4())
            os.mkdir(imagePath)
            saveImage(filename, titleName, imagePath)
            imagePathList = os.listdir(imagePath)
            count = 0
            resMap = {}
            for image in imagePathList:
                count += 1
                yield f"文档图片信息检查---当前处理进度{count}/{len(imagePathList)}"
                # Fix: join with the generated directory variable, not the
                # string literal "imagePath" (the literal path never exists).
                outpath = os.path.join(imagePath, image)
                print(outpath)
                messagesImage = [{'role': 'user', "content": [{"image": outpath}, {"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
                runListImage = []
                for rsp in botImage.run(messagesImage):
                    runListImage.append(rsp)
                data = runListImage[len(runListImage) - 1][0]["content"]
                print(str(data))
                prompt = '''
依次上述内容是否与文档有关你只能在[无关有关]选项中选择答案,
按照这样的格式回答[{text内容,"answer":"答案"},{text内容,"answer":"答案"}]不做过多的解释,严格按回答格式作答
'''
                messages = [{'role': 'user', 'content': [{'text': str(data) + prompt}, {"file": filename}]}]
                runList = []
                for rsp in bot.run(messages):
                    runList.append(rsp)
                textdata = runList[len(runList) - 1][0]["content"]
                print(textdata)
                parsed_data = json_repair.loads(textdata)
                print(parsed_data)
                for res in parsed_data:
                    if (res["answer"] == "无关"):
                        # Fix: answer objects carry "text" (per the prompt
                        # format), not "name" — avoid a KeyError in the log.
                        print("无关", res.get("text", ""))
                        prev = resMap.get(image)
                        if prev:
                            # Already recorded for this image: append.
                            resMap[image] = prev + "" + res["text"]
                        else:
                            resMap[image] = res["text"]
            out = ''
            if (len(resMap) > 0):
                # Fix: iterate items() — iterating the dict directly yields
                # only keys and the two-name unpacking raised ValueError.
                for key, value in resMap.items():
                    out += f"{key}图片中,{value}以上内容在文档中未出现相关描述<br>"
                yield out
            else:
                yield "文档图片信息检查----图文符合要求"
            shutil.rmtree(imagePath)
# except Exception as e:
# yield f"文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"
# return
# Ad-hoc manual run: stream progress/result messages for a sample document.
for i in checkImageText("1.docx"):
    print(i)
# import docx
# doc = docx.Document('1.docx')
# dict_rel = doc.part._rels # rels其实是个目录
# for rel in dict_rel:
# rel = dict_rel[rel]
# print("rel", rel.target_ref)
# if "image" in rel.target_ref:
# # create_dir(desc_path)
# img_name = re.findall("/(.*)", rel.target_ref)[0] # windos:/
# print("img_name", img_name)
# word_name = os.path.splitext("1.docx")[0]
# print("word_name", word_name)
# #检查文件路径分隔符(os.sep),并根据不同的操作系统(Windows或Unix/Linux)处理文件名。
# if os.sep in word_name:
# new_name = word_name.split('\\')[-1]
# else:
# new_name = word_name.split('/')[-1]
# img_name = f'{new_name}_{img_name}'
# print(img_name)
# desc_path='workspace'
# with open(f'{desc_path}/{img_name}', "wb") as f:
# f.write(rel.target_part.blob)
# #
# # # prompt='''
# # # .根据上述文本判断,是否为非泛化的公司或组织名称,你可以使用工具利用互联网查询,你只能在[非泛化的公司或组织名称,公益组织,统称,泛化名称,政府单位,机关单位,学校,委员单位]选项中选择答案,回答格式[{“placeName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
# # # '''
# llm_cfg_vl = {
# #'model': 'qwen1.5-72b-chat',qwen2-72b-instruct
# 'model':"qwen-vl-max",
# 'model_server': 'DashScope', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
# }
# botvl = Assistant(llm=llm_cfg_vl,
# name='Assistant',
# # system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"1_image4
# )
# messages = [{'role': 'user', "content": [{"image": "workspace/1.png"},{"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
# runList = []
# for rsp in botvl.run(messages):
# runList.append(rsp)
# print(rsp)
# data = runList[len(runList) - 1][0]["content"]
# print(str(data))

133
服务器文件/checkCompanyName.py

@ -0,0 +1,133 @@
# -*- coding:utf-8 -*-
import time
from docx import Document
from paddlenlp import Taskflow
from qwen_agent.agents import Assistant
import re
import json_repair
wordtag = Taskflow("knowledge_mining")
prompt = '''
.根据上述文本判断是否为具体的公司或组织名称你可以使用工具利用互联网查询
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校行业类型其他]选项中选择答案,
回答格式[{companyName名称,"回答":"答案"}{companyName名称,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
'''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"
)
def getDocxToTextAll(name):
    """Dump all non-empty paragraph text of a .docx into checkCompanyName.txt.

    Args:
        name: path of the .docx document.

    Side effects: overwrites checkCompanyName.txt (UTF-8) in the working
    directory; read back later by read_file_in_batches().
    """
    document = Document(name)
    # Collect every non-blank paragraph in document order
    # (removed the unused levelList/addStart/levelText/i locals).
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    print("checkCompanyName", len(words))
    # Join with newlines and persist for the batch reader.
    text = '\n'.join(words)
    with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
def checkCompanyName(filename):
    """Find concrete company/organisation names in a docx document.

    Returns the list of dicts produced by process_batch (keys include
    'companyName', '回答' and 'yuanwen').
    """
    getDocxToTextAll(filename)  # writes checkCompanyName.txt as a side effect
    start_time = time.time()
    error_places = []
    # Process the dumped text in ~5000-character batches.
    for batch in read_file_in_batches('checkCompanyName.txt'):
        res = process_batch(batch)
        if (len(res) > 0):
            error_places.extend(res)
    print(error_places)
    end_time = time.time()
    # Report wall-clock duration of the whole check.
    elapsed_time = end_time - start_time
    print(f"checkCompanyName程序执行时间: {elapsed_time}")
    return error_places
def read_file_in_batches(file_path, batch_size=5000):
    """Yield the file's text in chunks of at least `batch_size` characters.

    :param file_path: path of the UTF-8 text file to read
    :param batch_size: character threshold at which a chunk is emitted
    :return: generator of text chunks (lines are never split across chunks)
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        pending, size = [], 0
        for row in fh:
            pending.append(row)
            size += len(row)
            # Emit as soon as the accumulated text reaches the threshold.
            if size >= batch_size:
                yield ''.join(pending)
                pending, size = [], 0
        # Flush whatever is left after the final line.
        if pending:
            yield ''.join(pending)
def process_batch(batch):
    """Run NER + LLM verification over one text batch.

    Extracts organisation-name candidates with the paddlenlp knowledge-mining
    tagger, asks the LLM to classify them, and returns the entries classified
    as '具体的公司或组织名称', each annotated with a source paragraph in 'yuanwen'.

    Args:
        batch: one chunk of document text.

    Returns:
        List of result dicts (possibly empty).
    """
    res = wordtag(batch)
    placeList = []
    isplace = False
    for zuhe in res[0]['items']:
        # Merge runs of organisation tokens: if the previous token was an
        # organisation and this one is too, append it to the last entry.
        zhi = zuhe.get("wordtag_label")
        if isplace:
            name = placeList[len(placeList) - 1]
            if zhi.find("组织机构类") >= 0:
                isplace = True
                new_text = zuhe['item'].replace("\n", "")
                placeList[len(placeList) - 1] = name + new_text
                continue
        if zhi.find("组织机构类") >= 0:
            isplace = True
            new_text = zuhe['item'].replace("\n", "")
            placeList.append(new_text)
        else:
            isplace = False
    # De-duplicate while preserving first-seen order.
    placeList = list(dict.fromkeys(placeList))
    placeStr = ",".join(placeList)
    messages = [{'role': 'user', 'content': [{'text': placeStr + prompt}]}]
    print("checkCompanyName", placeStr + prompt)
    runList = []
    for rsp in bot.run(messages):
        runList.append(rsp)
    # Last streamed chunk holds the full answer; strip backticks then repair.
    data = runList[len(runList) - 1][0]["content"]
    print("checkCompanyName", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = [place for place in parsed_data if place['回答'] == '具体的公司或组织名称']
    print("checkCompanyName", error_places)
    for t in error_places:
        keyword = t['companyName']
        # Find a paragraph containing the keyword for context.
        paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', batch)
        # Fix: guard against no match (e.g. keyword only on the final line,
        # which has no trailing newline) instead of raising IndexError.
        t["yuanwen"] = paragraphs[0] if paragraphs else keyword
    return error_places

226
服务器文件/checkDocumentError.py

@ -0,0 +1,226 @@
#-*- coding:utf-8 -*-
# from pycorrector import MacBertCorrector
# m = MacBertCorrector("shibing624/macbert4csc-base-chinese")
from qwen_agent.agents import Assistant
from docx import Document
from pprint import pprint
import re
from paddlenlp import Taskflow
import json
import time
import json_repair
print(json_repair.loads('{"name":""aaaa"}'))
start_time = time.time()
corrector = Taskflow("text_correction")
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
)
# prompt='''
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案,
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# '''
prompt='''
请回答以上问题[]选项中选择答案,原文内容标点符号保持不变如果有错请给出解析没有错则不用给解析
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","jianyi","解析"},{"placeName":"序号","回答":"答案","jianyi","解析"}]不做过多的解释,严格按回答格式作答;
'''
def getDocxToTextAll(name):
    """Dump all non-empty paragraph text of `name` (.docx) into checkDocumentError.txt."""
    docxPath = name
    document = Document(docxPath)
    # Read the document paragraph by paragraph.
    levelList = []     # NOTE(review): unused
    words = []
    addStart = False   # NOTE(review): unused
    levelText = ""     # NOTE(review): unused
    i = 0              # NOTE(review): unused
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # keep only non-blank paragraphs
            words.append(text)
    print("checkDocumentError", len(words))
    # Join paragraphs with newlines and persist for the batch reader.
    text = '\n'.join(words)
    with open("checkDocumentError.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
def getDocumentError(filename):
    """Spell-check a docx document and return the confirmed typo records.

    Returns the list accumulated from process_batch over all text batches.
    """
    getDocxToTextAll(filename)  # writes checkDocumentError.txt
    # Fix: measure from here — the module-level start_time is set at import
    # time, so the reported duration wrongly included unrelated startup work.
    # (Matches the local timing used by checkPlaceName/checkCompanyName.)
    start_time = time.time()
    error_places = []
    # Process the dumped text in ~5000-character batches.
    for batch in read_file_in_batches('checkDocumentError.txt'):
        res = process_batch(batch)
        if (len(res) > 0):
            error_places.extend(res)
    pprint(error_places)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"checkDocumentError程序执行时间: {elapsed_time}")
    return error_places
#
# 过滤掉填充的None(如果有的话)
# chunk = [line for line in chunk if line is not None]
# res = m.correct_batch(sentences)
# print("DocumentError",res)
# lines_with_greeting = [place for place in res if len( place['errors'])>0]
# error_places.extend(lines_with_greeting)
# pprint(error_places)
# if len(lines_with_greeting)>0:
# for t in error_places:
# keyword= t['source']
#
# errorWord=t["errors"]
# # 查找包含关键字的段落
# paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
# t["yuanwen"]=paragraphs[0]
# return error_places
# else:
# return error_places
# return lines_with_greeting
def read_file_in_batches(file_path, batch_size=5000):
    """Yield the file's text in chunks of at least `batch_size` characters.

    :param file_path: path of the UTF-8 text file to read
    :param batch_size: character threshold at which a chunk is emitted
    :return: generator of text chunks (lines are never split across chunks)
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        pending, size = [], 0
        for row in fh:
            pending.append(row)
            size += len(row)
            # Emit as soon as the accumulated text reaches the threshold.
            if size >= batch_size:
                yield ''.join(pending)
                pending, size = [], 0
        # Flush whatever is left after the final line.
        if pending:
            yield ''.join(pending)
def process_batch(batch):
    """Spell-check one text batch and return LLM-confirmed typo records."""
    # Split into sentences on Chinese full stop / newline and drop blanks.
    sentences = re.split(r'[。\n]', batch)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    res = corrector(sentences)
    # Keep only sentences for which the corrector reported candidate errors.
    lines_with_greeting = [place for place in res if len(place['errors']) > 0]
    words = ''
    err = []
    if len(lines_with_greeting) > 0:
        num = 0
        wenti = []          # numbered questions for the LLM
        keyword_list = []   # original sentence per question index
        for t in lines_with_greeting:
            temp_errorWords = []
            keyword = t['source']
            keyword_list.append(keyword)
            # Collect the characters the corrector flagged in this sentence.
            for item in t["errors"]:
                for key, value in item['correction'].items():
                    temp_errorWords.append(key)
            wenti.append("{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
            num += 1
        words = "\n".join(wenti)
        messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
        runList = []
        print(words + prompt)
        for rsp in bot.run(messages):
            runList.append(rsp)
        # Last streamed chunk holds the full answer; strip escapes/backticks.
        data = runList[len(runList) - 1][0]["content"]
        pprint(data)
        parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
        # NOTE(review): the filter compares 回答 to the empty string; a
        # commented-out variant elsewhere used '是' — confirm which is intended.
        # NOTE(review): place["解析"] raises KeyError when the model omits the
        # field (the prompt says it may for correct sentences) — verify.
        err = [
            {**place, "placeName": keyword_list[int(place["placeName"])], "jianyi": place["解析"]}
            for place in parsed_data
            if place['回答'] == ''
        ]
        pprint(err)
    return err
# from flask import Flask, request, jsonify
# import os
# # from checkPlaceName import checkPlaceName
# # from checkRepeatText import checkRepeatText
# # from checkCompanyName import checkCompanyName
# # from documentError import getDocumentError
# app = Flask(__name__)
# UPLOAD_FOLDER = 'uploads'
# if not os.path.exists(UPLOAD_FOLDER):
# os.makedirs(UPLOAD_FOLDER)
# @app.route('/upload', methods=['POST'])
# def upload_file():
# if 'file' not in request.files:
# return jsonify({"error": "No file part"}), 400
# file = request.files['file']
# if file.filename == '':
# return jsonify({"error": "No selected file"}), 400
# if file:
# filename = file.filename
# file.save(os.path.join(UPLOAD_FOLDER,filename))
# return jsonify({"message": "File uploaded successfully"}), 200
# # @app.route('/checkPlaceName/<filename>', methods=['GET'])
# # def checkPlaceNameWeb(filename):
# # return checkPlaceName(filename)
# # @app.route('/checkRepeatText/<filename>', methods=['GET'])
# # def checkRepeatTextWeb(filename):
# # return checkRepeatText(filename)
# # @app.route('/checkCompanyName/<filename>', methods=['GET'])
# # def checkCompanyNameWeb(filename):
# # return checkCompanyName(filename)
# # @app.route('/checkDocumentErrorWeb/<filename>', methods=['GET'])
# # def checkDocumentErrorWeb(filename):
# # return getDocumentError(filename)
# if __name__ == '__main__':
# app.run(host='0.0.0.0',port=80)
# from transformers import AutoTokenizer, AutoModel, GenerationConfig,AutoModelForCausalLM
# import os
# os.environ['NPU_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7'
# os.environ['ASCEND_RT_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7'
# import torch
# import torch_npu
# from torch_npu.contrib import transfer_to_npu
# from accelerate import Accelerator
# # device = 'cpu'
# accelerator = Accelerator()
# # torch_device = "npu" # 0~7
# # torch.npu.set_device(torch.device(torch_device))
# devices = []
# for i in range(8):
# devices.append(f"npu:{i}")
# print(devices)
# torch.npu.set_device(devices)
# torch.npu.set_compile_mode(jit_compile=False)
# model_name_or_path = '/mnt/sdc/qwen/Qwen2-72B-Instruct'
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
# # model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, device_map="auto",torch_dtype=torch.float16)
# model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, device_map=accelerator,torch_dtype=torch.float16).npu().eval()

153
服务器文件/checkPlaceName.py

@ -0,0 +1,153 @@
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time
tagTask = Taskflow("ner")
prompt='''
.上述文本判断地名是否正确你可以使用工具利用互联网查询你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{placeName:地名,"回答":"答案"},{placeName:地名,"回答":"答案"}]不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# prompt='''
# .请回答以上问题,
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# 不做过多的解释,严格按回答格式作答;
# '''
llm_cfg = {
#'model': 'qwen1.5-72b-chat',
'model':"qwen2-72b",
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
name='Assistant',
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
)
# Dump the full document text.
def getDocxToTextAll(name):
    """Dump all non-empty paragraph text of `name` (.docx) into checkPlaceName.txt."""
    docxPath = name
    document = Document(docxPath)
    # Read the document paragraph by paragraph.
    levelList = []     # NOTE(review): unused
    words = []
    addStart = False   # NOTE(review): unused
    levelText = ""     # NOTE(review): unused
    i = 0              # NOTE(review): unused
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # keep only non-blank paragraphs
            words.append(text)
    print("placeNameTask", len(words))
    # Join paragraphs with newlines and persist for the batch reader.
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
# Extract the place-name related content from the text.
def placeNameTask(text):
    """Extract place/organisation name candidates from `text` via paddlenlp NER.

    Consecutive tokens tagged 组织机构类 or 世界地区类 are merged into one
    name; the result is de-duplicated preserving first-seen order.
    """
    res = tagTask(text)
    print(res)
    placeList = []
    isplace = False
    for zuhe in res:
        # If the previous token was an entity and this one is too, merge them
        # into the last stored name instead of starting a new one.
        if isplace:
            name = placeList[len(placeList) - 1]
            if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:
                isplace = True
                new_text = zuhe[0].replace("\n", "")
                placeList[len(placeList) - 1] = name + new_text
                continue
        if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:
            isplace = True
            new_text = zuhe[0].replace("\n", "")
            placeList.append(new_text)
        else:
            isplace = False
    # De-duplicate while keeping first-seen order.
    placeList = list(dict.fromkeys(placeList))
    return placeList
# Entry point for the place-name check.
def checkPlaceName(filename):
    """Check all place names in a docx; returns the entries judged 错误 (wrong).

    Writes checkPlaceName.txt as a side effect and processes it in batches.
    """
    getDocxToTextAll(filename)
    start_time = time.time()
    error_places = []
    # Process the dumped text in ~5000-character batches.
    for batch in read_file_in_batches('checkPlaceName.txt'):
        res = process_batch(batch)
        if (len(res) > 0):
            error_places.extend(res)
    pprint(error_places)
    end_time = time.time()
    # Report wall-clock duration of the check.
    elapsed_time = end_time - start_time
    print(f"checkPlaceName程序执行时间: {elapsed_time}")
    return error_places
def read_file_in_batches(file_path, batch_size=5000):
    """Yield the file's text in chunks of at least `batch_size` characters.

    :param file_path: path of the UTF-8 text file to read
    :param batch_size: character threshold at which a chunk is emitted
    :return: generator of text chunks (lines are never split across chunks)
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        pending, size = [], 0
        for row in fh:
            pending.append(row)
            size += len(row)
            # Emit as soon as the accumulated text reaches the threshold.
            if size >= batch_size:
                yield ''.join(pending)
                pending, size = [], 0
        # Flush whatever is left after the final line.
        if pending:
            yield ''.join(pending)
def process_batch(batch):
    """Verify the place names found in one text batch with the LLM.

    Args:
        batch: one chunk of document text.

    Returns:
        The entries the LLM marked 错误 (wrong), each annotated with a
        containing paragraph in 'yuanwen'. Possibly empty.
    """
    propnList = placeNameTask(batch)
    propnStr = ",".join(propnList)
    print("placeNameTask", propnStr)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    for rsp in bot.run(messages):
        runList.append(rsp)
    # Last streamed chunk holds the full answer; strip backticks then repair.
    data = runList[len(runList) - 1][0]["content"]
    print("placeNameTask", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    for item in parsed_data:
        # Fix: tolerate malformed model output (missing keys) instead of
        # raising KeyError while logging.
        print(f"地名: {item.get('placeName')}, 回答: {item.get('回答')}")
    # Keep only the entries the model judged wrong.
    error_places = [place for place in parsed_data if place.get('回答') == '错误']
    print("placeNameTask", error_places)
    for t in error_places:
        keyword = t['placeName']
        # Find a paragraph containing the keyword for context.
        paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', batch)
        # Fix: guard against no match (keyword only on the final, newline-less
        # line) instead of raising IndexError.
        t["yuanwen"] = paragraphs[0] if paragraphs else keyword
    return error_places

160
服务器文件/checkRepeatText.py

@ -0,0 +1,160 @@
import uuid
from langchain_chroma import Chroma
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from paddlenlp import Taskflow
similarity = Taskflow("text_similarity" , truncation=True,max_length=102400)
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
vector_store_path="vector_store"
vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
import re
import time
from docx import Document
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from a '<w:outlineLvl w:val="number"/>' tag.

    Args:
        inputXml: XML string of a paragraph or style element that contains
            a <w:outlineLvl> tag.

    Returns:
        The level number as a string (e.g. "0" for a first-level heading).

    Note: callers must ensure the tag is present; otherwise re.search
    returns None and .group() raises AttributeError.
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    tag = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid escape sequence "\d+" (SyntaxWarning on 3.12+).
    return re.search(r"\d+", tag).group()
def isTitle(paragraph):
    """Return the outline level of a paragraph, or None for plain body text.

    Returns:
        None for empty paragraphs or text without an outline level,
        "0" for level-1 headings, "1" for level-2, "2" for level-3, etc.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None
    # Outline level set directly on the paragraph's own XML.
    xml = paragraph._p.xml
    if '<w:outlineLvl' in xml:
        return getOutlineLevel(xml)
    # Otherwise walk the style chain (style -> base_style -> ...) for one.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style
    # No outline level found anywhere: ordinary body text.
    return None
# Collect every paragraph inside the "detailed design" chapter of the document.
def getDocxToText(docxPath, titleName):
    """Collect the paragraphs under the level-1 heading `titleName`, persist
    them to checkRepeatText.txt, and index them into the Chroma vector store.

    Returns:
        (words, uuids): the numbered paragraph strings and the vector-store
        ids that were added (callers must delete these afterwards).

    Raises:
        Exception: when no matching paragraph text was collected.
    """
    document = Document(docxPath)
    levelList = []
    words = []
    addStart = False  # True while we are inside the target chapter
    levelText = ""
    i = 0
    for paragraph in document.paragraphs:
        # Heading level of this paragraph (see isTitle()).
        text = paragraph.text
        if text.strip():  # non-empty only
            print("非空")
            if titleName:
                level = isTitle(paragraph)
                # Leaving the chapter: the next level-1 heading ends collection.
                if (addStart and level == "0"):
                    addStart = False
                # Entering the chapter: a level-1 heading containing titleName.
                if (level == "0" and text.find(titleName) >= 0):
                    addStart = True
                if level:
                    levelList.append("{}".format(level) + paragraph.text)
                    levelText = text
                else:
                    if addStart:
                        # NOTE(review): the first startswith argument displays
                        # as an empty string — the intended prefix character
                        # appears lost; confirm against the original file.
                        if (text.startswith("") or text.startswith("注:")):
                            continue
                        i = i + 1
                        # Number each collected body paragraph.
                        words.append("{}个段落:".format(i) + text)
            else:
                words.append(text)
    print("checkRepeatText", len(words))
    if len(words) == 0:
        # NOTE(review): placeholder message — nothing was collected.
        raise Exception("I know python!")
    text = '\n'.join(words)
    # NOTE(review): no encoding given (platform default); the TextLoader below
    # must be able to read this back — consider encoding='utf-8'.
    with open("checkRepeatText.txt", 'w', ) as txt_file:
        txt_file.write(text)
    time.sleep(3)
    loader = TextLoader(file_path='checkRepeatText.txt')
    docs = loader.load()
    # Split into overlapping ~50-char chunks on blank lines / newlines.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10, add_start_index=True,
                                                   separators=["\n\n", "\n"])
    splits = text_splitter.split_documents(docs)
    uuids = []
    print(len(splits))
    # One fresh id per chunk so the vectors can be deleted afterwards.
    for i in range(len(splits)):
        uuids.append(str(uuid.uuid4()))
    print(len(uuids))
    vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
    vectorstore.add_documents(documents=splits, ids=uuids)
    # Poll until the store is queryable (indexing is asynchronous).
    while True:
        time.sleep(0.3)
        ress = vectorstore.similarity_search(words[0])
        if (len(ress) > 0):
            break
    return words, uuids
# (disabled) Flask route: @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename, titleName):
    """Find near-duplicate paragraphs inside the `titleName` chapter.

    Each paragraph is searched against the vector store; candidate pairs with
    text_similarity > 0.95 are collected as {"yuanwen1": ..., "yuanwen2": ...}.
    The temporarily indexed vectors are always removed in `finally`.
    """
    words, uuids = getDocxToText(filename, titleName)
    try:
        reslist = []
        count = 0
        for i in words:
            count += 1
            result = vectorstore.similarity_search(i)
            # NOTE(review): the split/find separators below display as empty
            # strings — the original delimiter (likely "、" or ":") appears
            # lost in this copy; an empty separator would raise ValueError.
            textTag = i.split("")[0]
            print(i)
            for content in result:
                text = content.page_content
                tag = text.split("")[0].replace('\n', '')
                # Skip hits that come from the same numbered paragraph.
                if (textTag.find(tag) >= 0):
                    continue
                res = similarity([[i[i.find('') + 1:], text[text.find('') + 1:]]])
                print(res[0]["similarity"])
                if (res[0]["similarity"] > 0.95):
                    # Record the pair unless this paragraph is already stored.
                    if (len(reslist) > 0):
                        isExist = False
                        for neirong in reslist:
                            if i[i.find('') + 1:] in neirong.values():
                                isExist = True
                                break
                        if not isExist:
                            reslist.append({"yuanwen1": i[i.find('') + 1:], "yuanwen2": text[text.find('') + 1:]})
                            print(reslist)
                    else:
                        reslist.append({"yuanwen1": i[i.find('') + 1:], "yuanwen2": text[text.find('') + 1:]})
                    print(i.split("")[1] + "\n" + text.split("")[1])
    except Exception as e:
        print("发生异常:", e)
    finally:
        # Always clean the temporary vectors out of the store.
        vectorstore.delete(ids=uuids)
        print("已删除")
    print(reslist)
    return reslist

712
服务器文件/json_repair.py

@ -0,0 +1,712 @@
"""
This module will parse the JSON file following the BNF definition:
<json> ::= <container>
<primitive> ::= <number> | <string> | <boolean>
; Where:
; <number> is a valid real number expressed in one of a number of given formats
; <string> is a string of valid characters enclosed in quotes
; <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
<container> ::= <object> | <array>
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
If something is wrong (a missing parantheses or quotes for example) it will use a few simple heuristics to fix the JSON string:
- Add the missing parentheses if the parser believes that the array or object should be closed
- Quote strings or add missing single quotes
- Adjust whitespaces and remove line breaks
All supported use cases are in the unit tests
"""
import os
import json
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
class StringFileWrapper:
    # Wraps a text file descriptor so it can be indexed like a string,
    # letting the parser treat files and in-memory strings uniformly.
    def __init__(self, fd: TextIO) -> None:
        # The underlying text file object.
        self.fd = fd
        # Cached file length; 0 means "not computed yet" (see __len__).
        self.length: int = 0

    def __getitem__(self, index: Union[int, slice]) -> str:
        """Read one character (int index) or a range (slice) from the file."""
        if isinstance(index, slice):
            # NOTE(review): assumes a plain slice with non-None start/stop and
            # no step; the position is restored to the slice start afterwards.
            self.fd.seek(index.start)
            value = self.fd.read(index.stop - index.start)
            self.fd.seek(index.start)
            return value
        else:
            self.fd.seek(index)
            return self.fd.read(1)

    def __len__(self) -> int:
        """Total file length in characters, computed once and cached."""
        if self.length < 1:
            # Remember the position, seek to EOF to measure, then restore.
            current_position = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(current_position)
        return self.length
class LoggerConfig:
    """Holds the parser's logging state: entries, context window, and level."""

    def __init__(self, log_level: Optional[str]):
        # Collected log entries (message + context snippets).
        self.log: List[Dict[str, str]] = []
        # Number of context characters captured around each logged position.
        self.window: int = 10
        # "none" disables logging entirely.
        self.log_level: str = log_level or "none"
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
class JSONParser:
    def __init__(
        self,
        json_str: Union[str, StringFileWrapper],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
    ) -> None:
        # The string to parse.
        self.json_str = json_str
        # Alternatively, a file descriptor with the json in it; wrapping it
        # lets the rest of the parser index it exactly like a string.
        if json_fd:
            self.json_str = StringFileWrapper(json_fd)
        # Iterator index: which character we are looking at right now.
        self.index: int = 0
        # Parsing-context stack ("object_key" / "object_value" / "array"),
        # used to handle the special cases of missing quotes in keys/values.
        self.context: list[str] = []
        # Activity log; only populated when logging is active.
        self.logger = LoggerConfig(log_level="info" if logging else None)
    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """Parse the whole input; return the value (plus the log when logging is on)."""
        json = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            # More content follows the first value: collect every remaining
            # top-level element into a list.
            json = [json]
            last_index = self.index
            while self.index < len(self.json_str):
                j = self.parse_json()
                if j != "":
                    json.append(j)
                if self.index == last_index:
                    # Nothing was consumed: advance one character so the
                    # loop cannot spin forever on unparseable input.
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array.
            if len(json) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                json = json[0]
        if self.logger.log_level == "none":
            return json
        else:
            return json, self.logger.log
    def parse_json(
        self,
    ) -> JSONReturnType:
        """Parse and return the next JSON value starting at self.index."""
        while True:
            char = self.get_char_at()
            # Basic elements (string/number) are only parsed when we are
            # inside an array or object; bare ones at top level are ignored.
            is_in_context = len(self.context) > 0
            # False means that we are at the end of the string provided.
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # Edge case: an empty key at the end of an object, like "key": }.
            # Return an empty string so the object closes properly.
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote (or a bare literal when quotes are missing)
            elif is_in_context and (char in ['"', "'", ""] or char.isalpha()):
                return self.parse_string()
            # <number> starts with [0-9], minus, or a dot
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # If everything else fails, we just ignore and move on.
            else:
                self.index += 1
    def parse_object(self) -> Dict[str, Any]:
        # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; a sequence of members
        obj = {}
        # Stop at the closing brace or when the whole input is consumed
        # (get_char_at() returns False at EOF, which the "}" default absorbs).
        while (self.get_char_at() or "}") != "}":
            # Expected next: <member> ::= <string> ': ' <json>
            # Skip filler whitespace.
            self.skip_whitespaces_at()
            # LLM quirk: a ":" appearing before any key is skipped over.
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                    "info",
                )
                self.index += 1
            # Now searching for the string key; the context tells the string
            # parser how to cope with missing quotes.
            self.set_context("object_key")
            self.skip_whitespaces_at()
            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())
                # An empty key is acceptable only if the divider follows.
                if key != "" or (key == "" and self.get_char_at() == ":"):
                    break
            self.skip_whitespaces_at()
            # The object ended right after the key: nothing more to add.
            if (self.get_char_at() or "}") == "}":
                continue
            self.skip_whitespaces_at()
            # An extreme case of missing ":" after a key — log and carry on.
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                    "info",
                )
            self.index += 1
            self.reset_context()
            self.set_context("object_value")
            # The value can be any valid json.
            value = self.parse_json()
            # Reset context since our job is done.
            self.reset_context()
            obj[key] = value
            # Consume the member separator (or a stray quote) if present.
            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1
            # Remove trailing spaces.
            self.skip_whitespaces_at()
        # Step over the closing "}".
        self.index += 1
        return obj
    def parse_array(self) -> List[Any]:
        # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; values separated by commas
        arr = []
        self.set_context("array")
        # Stop at the closing bracket or when the whole input is consumed
        # (get_char_at() returns False at EOF, which the "]" default absorbs).
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()
            # parse_json() may return nothing valid; stop the array then.
            if value == "":
                break
            # LLM quirk: a stray "..." element is logged and dropped.
            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it", "info"
                )
            else:
                arr.append(value)
            # Skip whitespace and commas after a value, before the closing ].
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()
        # Especially at the end of an LLM-generated json the final "]" may be
        # missing; log it and step back so the net advance below is zero.
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back", "info"
            )
            self.index -= 1
        self.index += 1
        self.reset_context()
        return arr
def parse_string(self) -> Union[str, bool, None]:
    """Parse a <string> (or a literal reached from string context).

    <string> is a string of valid characters enclosed in quotes, i.e. { name: "John" }.
    Somehow all weird cases in an invalid JSON happen to be resolved in this
    function, so be careful here: it also repairs missing/doubled/misplaced
    quotes and stray escape sequences.

    Fix: the smart-quote delimiters had been mangled into empty strings
    ('char == ""' can never match a real character, making that branch dead);
    restored the Unicode left/right double quotes so LLM output using
    typographic quotes is handled again.
    """
    # Flag to manage corner cases related to missing starting quote
    missing_quotes = False
    doubled_quotes = False
    lstring_delimiter = rstring_delimiter = '"'
    char = self.get_char_at()
    # A valid string can only start with a valid quote or, in our case, with a literal
    while char and char not in ['"', "'", "“"] and not char.isalnum():
        self.index += 1
        char = self.get_char_at()
    if not char:
        # This is an empty string
        return ""
    # Ensuring we use the right delimiter
    if char == "'":
        lstring_delimiter = rstring_delimiter = "'"
    elif char == "“":
        # Typographic ("smart") quotes open with “ and close with ”
        lstring_delimiter = "“"
        rstring_delimiter = "”"
    elif char.isalnum():
        # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
        # But remember, object keys are only of type string
        if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
            value = self.parse_boolean_or_null()
            if value != "":
                return value
            self.log(
                "While parsing a string, we found a literal instead of a quote",
                "info",
            )
        self.log(
            "While parsing a string, we found no starting quote. Will add the quote back",
            "info",
        )
        missing_quotes = True
    if not missing_quotes:
        self.index += 1
    # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
    if self.get_char_at() == lstring_delimiter:
        # If it's an empty key, this was easy
        if self.get_context() == "object_key" and self.get_char_at(1) == ":":
            self.index += 1
            return ""
        # Find the next delimiter
        i = 1
        next_c = self.get_char_at(i)
        while next_c and next_c != rstring_delimiter:
            i += 1
            next_c = self.get_char_at(i)
        # Now check that the next character is also a delimiter to ensure that we have "".....""
        # In that case we ignore this rstring delimiter
        if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
            self.log(
                "While parsing a string, we found a valid starting doubled quote, ignoring it",
                "info",
            )
            doubled_quotes = True
            self.index += 1
        else:
            # Ok this is not a doubled quote, check if this is an empty string or not
            i = 1
            next_c = self.get_char_at(i)
            while next_c and next_c.isspace():
                i += 1
                next_c = self.get_char_at(i)
            if next_c not in [",", "]", "}"]:
                self.log(
                    "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                    "info",
                )
                self.index += 1
    # Initialize our return value
    string_acc = ""
    # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
    # In that case we need to use the ":|,|}" characters as terminators of the string
    # So this will stop if:
    # * It finds a closing quote
    # * It iterated over the entire sequence
    # * If we are fixing missing quotes in an object, when it finds the special terminators
    char = self.get_char_at()
    while char and char != rstring_delimiter:
        if missing_quotes:
            if self.get_context() == "object_key" and (
                char == ":" or char.isspace()
            ):
                self.log(
                    "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                    "info",
                )
                break
            elif self.get_context() == "object_value" and char in [",", "}"]:
                rstring_delimiter_missing = True
                # check if this is a case in which the closing comma is NOT missing instead
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c != rstring_delimiter:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    i += 1
                    next_c = self.get_char_at(i)
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c and next_c in [",", "}"]:
                        rstring_delimiter_missing = False
                if rstring_delimiter_missing:
                    self.log(
                        "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                        "info",
                    )
                    break
        string_acc += char
        self.index += 1
        char = self.get_char_at()
        if char and len(string_acc) > 0 and string_acc[-1] == "\\":
            # This is a special case, if people use real strings this might happen
            self.log("Found a stray escape sequence, normalizing it", "info")
            string_acc = string_acc[:-1]
            if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                string_acc += escape_seqs.get(char, char) or char
                self.index += 1
                char = self.get_char_at()
        # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
        if char == rstring_delimiter:
            # Special case here, in case of double quotes one after another
            if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a doubled quote, ignoring it",
                    "info",
                )
                self.index += 1
            elif missing_quotes and self.get_context() == "object_value":
                # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    # We found a quote, now let's make sure there's a ":" following
                    i += 1
                    next_c = self.get_char_at(i)
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c and next_c == ":":
                        # Reset the cursor
                        self.index -= 1
                        char = self.get_char_at()
                        self.log(
                            "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                            "info",
                        )
                        break
            else:
                # Check if eventually there is a rstring delimiter, otherwise we bail
                i = 1
                next_c = self.get_char_at(i)
                check_comma_in_object_value = True
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                    # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                    if check_comma_in_object_value and next_c.isalpha():
                        check_comma_in_object_value = False
                    # If we are in an object context, let's check for the right delimiters
                    if (
                        ("object_key" in self.context and next_c in [":", "}"])
                        or ("object_value" in self.context and next_c == "}")
                        or ("array" in self.context and next_c in ["]", ","])
                        or (
                            check_comma_in_object_value
                            and self.get_context() == "object_value"
                            and next_c == ","
                        )
                    ):
                        break
                    i += 1
                    next_c = self.get_char_at(i)
                # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                if next_c == "," and self.get_context() == "object_value":
                    i += 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c != rstring_delimiter:
                        i += 1
                        next_c = self.get_char_at(i)
                    # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                    i += 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c == "}":
                        # OK this is valid then
                        self.log(
                            "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                            "info",
                        )
                        string_acc += str(char)
                        self.index += 1
                        char = self.get_char_at()
                elif next_c == rstring_delimiter:
                    if self.get_context() == "object_value":
                        # But this might not be it! This could be just a missing comma
                        # We found a delimiter and we need to check if this is a key
                        # so find a rstring_delimiter and a colon after
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != rstring_delimiter:
                            i += 1
                            next_c = self.get_char_at(i)
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != ":":
                            if next_c in [
                                lstring_delimiter,
                                rstring_delimiter,
                                ",",
                            ]:
                                break
                            i += 1
                            next_c = self.get_char_at(i)
                        # Only if we fail to find a ':' then we know this is misplaced quote
                        if next_c != ":":
                            self.log(
                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                "info",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()
    if (
        char
        and missing_quotes
        and self.get_context() == "object_key"
        and char.isspace()
    ):
        self.log(
            "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
            "info",
        )
        self.skip_whitespaces_at()
        if self.get_char_at() not in [":", ","]:
            return ""
    # A fallout of the previous special case in the while loop,
    # we need to update the index only if we had a closing quote
    if char != rstring_delimiter:
        self.log(
            "While parsing a string, we missed the closing quote, ignoring",
            "info",
        )
    else:
        self.index += 1
    return string_acc.rstrip()
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
    """Parse a <number>: a real number in one of several accepted formats.

    Commas stop the scan inside arrays (they are separators there) but are
    kept elsewhere so currency-style values like "1,234" survive as strings.
    """
    allowed = set("0123456789-.eE/,")
    in_array = self.get_context() == "array"
    acc = ""
    ch = self.get_char_at()
    while ch and ch in allowed and not (in_array and ch == ","):
        acc += ch
        self.index += 1
        ch = self.get_char_at()
    if len(acc) > 1 and acc[-1] in "-eE/,":
        # The number ends with a non valid character for a number/currency,
        # rolling back one
        acc = acc[:-1]
        self.index -= 1
    try:
        if "," in acc:
            # Comma-grouped values are not valid JSON numbers; keep as text
            return str(acc)
        if "." in acc or "e" in acc or "E" in acc:
            return float(acc)
        if acc == "-":
            # A stray "-" is not a number; throw it away and re-parse
            return self.parse_json()
        return int(acc)
    except ValueError:
        return acc
def parse_boolean_or_null(self) -> Union[bool, str, None]:
    """Parse one of the literal strings 'true', 'false' or 'null' (unquoted).

    Returns the corresponding Python value on a full match, or "" (with the
    cursor reset) when no literal is present.

    Fix: `value` was only annotated, never assigned, so any character other
    than t/f/n made `if value:` raise UnboundLocalError instead of falling
    through to the "" return. Initialize it to None.
    """
    starting_index = self.index
    char = (self.get_char_at() or "").lower()
    value: Optional[Tuple[str, Optional[bool]]] = None
    if char == "t":
        value = ("true", True)
    elif char == "f":
        value = ("false", False)
    elif char == "n":
        value = ("null", None)
    if value:
        i = 0
        # Advance as long as the input keeps matching the expected literal
        while char and i < len(value[0]) and char == value[0][i]:
            i += 1
            self.index += 1
            char = (self.get_char_at() or "").lower()
        if i == len(value[0]):
            return value[1]
    # If nothing works reset the index before returning
    self.index = starting_index
    return ""
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
    """Return the character at offset `count` from the cursor, or False past the end."""
    # EAFP on purpose: in CPython a try/except around the indexing is cheaper
    # than an explicit bounds check because the in-bounds case dominates.
    pos = self.index + count
    try:
        return self.json_str[pos]
    except IndexError:
        return False
def skip_whitespaces_at(self) -> None:
    """Advance the cursor past any run of whitespace characters.

    Syntactic sugar used throughout the parser; stops silently at end of input.
    """
    while True:
        try:
            if not self.json_str[self.index].isspace():
                return
        except IndexError:
            # Ran off the end of the input; nothing left to skip
            return
        self.index += 1
def set_context(self, value: str) -> None:
    """Push a parsing context onto the stack; empty values are ignored."""
    if not value:
        return
    self.context.append(value)
def reset_context(self) -> None:
    """Discard the parsing context most recently pushed by set_context()."""
    del self.context[-1]
def get_context(self) -> str:
    """Return the parsing context currently on top of the stack."""
    return self.context[-1]
def log(self, text: str, level: str) -> None:
    """Append a repair note plus a window of surrounding input to the logger.

    Entries are recorded only when `level` matches the logger's configured
    log level; each entry carries the message and a slice of json_str
    centered on the current cursor position.
    """
    if level != self.logger.log_level:
        return
    window = self.logger.window
    start = max(self.index - window, 0)
    end = min(self.index + window, len(self.json_str))
    self.logger.log.append(
        {"text": text, "context": self.json_str[start:end]}
    )
def repair_json(
    json_str: str = "",
    return_objects: bool = False,
    skip_json_loads: bool = False,
    logging: bool = False,
    json_fd: Optional[TextIO] = None,
    ensure_ascii: bool = True,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
    It will return the fixed string by default.
    When `return_objects=True` is passed, it will return the decoded data structure instead.
    When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
    When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
    """
    parser = JSONParser(json_str, json_fd, logging)
    if skip_json_loads:
        parsed = parser.parse()
    else:
        # Fast path: try the stdlib decoder first; only repair on failure
        try:
            parsed = json.load(json_fd) if json_fd else json.loads(json_str)
        except json.JSONDecodeError:
            parsed = parser.parse()
    # Returning the decoded object makes this library usable as a drop-in
    # replacement for the standard json module
    if return_objects or logging:
        return parsed
    return json.dumps(parsed, ensure_ascii=ensure_ascii)
def loads(
    json_str: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Drop-in analogue of `json.loads()` that repairs broken JSON on the way.

    Thin convenience wrapper: delegates to `repair_json()` with
    `return_objects=True` so the decoded structure is returned.
    """
    return repair_json(
        json_str=json_str,
        return_objects=True,
        skip_json_loads=skip_json_loads,
        logging=logging,
    )
def load(
    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Drop-in analogue of `json.load()` that repairs broken JSON on the way.

    Thin convenience wrapper: delegates to `repair_json()` with `json_fd=fd`
    and `return_objects=True` so the decoded structure is returned.
    """
    return repair_json(
        json_fd=fd,
        return_objects=True,
        skip_json_loads=skip_json_loads,
        logging=logging,
    )
def from_file(
    filename: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    This function is a wrapper around `load()` so you can pass the filename as string.

    Fix: the file handle is now managed by a context manager, so it is closed
    even when load() raises (the previous open/close pair leaked the handle
    on any exception).
    """
    with open(filename) as fd:
        return load(fd, skip_json_loads, logging)

45
服务器文件/main.py

@ -0,0 +1,45 @@
from flask import Flask, request, jsonify
import os
from checkPlaceName import checkPlaceName
# from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName
from checkDocumentError import getDocumentError
app = Flask(__name__)

# Directory where uploaded documents are stored, relative to the working directory.
UPLOAD_FOLDER = 'uploads'
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
# (another process could create the directory between the two calls).
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a single multipart file upload ('file' field) and store it in UPLOAD_FOLDER.

    Returns 400 when the field is missing or no file was selected, 200 on success.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if file:
        # Fix: basename() strips client-supplied directory components, so a
        # filename like "../../etc/passwd" cannot escape UPLOAD_FOLDER.
        # NOTE(review): on POSIX this does not strip Windows-style "..\\"
        # prefixes — consider werkzeug.utils.secure_filename for full coverage.
        filename = os.path.basename(file.filename)
        file.save(os.path.join(UPLOAD_FOLDER, filename))
        return jsonify({"message": "File uploaded successfully"}), 200
@app.route('/getDocumentError', methods=['GET'])
def getDocumentErrorWeb():
    """Run the document-error check on the file named by the ?filename= query parameter."""
    target = request.args.get('filename')
    return getDocumentError(target)
@app.route('/checkPlaceName', methods=['GET'])
def checkPlaceNameWeb():
    """Run the place-name check on the file named by the ?filename= query parameter."""
    target = request.args.get('filename')
    return checkPlaceName(target)
@app.route('/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
    """Run the repeated-text check on ?filename= within section ?sectionName=.

    Fix: the module-level `from checkRepeatText import checkRepeatText` is
    commented out at the top of this file, so this route raised NameError at
    request time. Import locally so the route works without affecting module
    import for the other routes.
    """
    from checkRepeatText import checkRepeatText
    filename = request.args.get('filename')
    sectionName = request.args.get('sectionName')
    return checkRepeatText(filename, sectionName)
@app.route('/checkCompanyName', methods=['GET'])
def checkCompanyNameWeb():
    """Run the company-name check on the file named by the ?filename= query parameter."""
    target = request.args.get('filename')
    return checkCompanyName(target)
@app.route('/test/<filename>', methods=['GET'])
def test(filename):
    """Echo the path parameter back; a minimal liveness/routing check."""
    return filename
if __name__ == '__main__':
    # Flask development server, reachable from any interface; port 80 needs
    # elevated privileges on most systems.
    # NOTE(review): 0.0.0.0 exposes the dev server to the whole network —
    # confirm a production WSGI server is used for real deployments.
    app.run(host="0.0.0.0",port=80)
Loading…
Cancel
Save