@ -0,0 +1,8 @@ |
|||||
|
# 默认忽略的文件 |
||||
|
/shelf/ |
||||
|
/workspace.xml |
||||
|
# 基于编辑器的 HTTP 客户端请求 |
||||
|
/httpRequests/ |
||||
|
# Datasource local storage ignored files |
||||
|
/dataSources/ |
||||
|
/dataSources.local.xml |
@ -0,0 +1,6 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project version="4"> |
||||
|
<component name="Encoding"> |
||||
|
<file url="file://$PROJECT_DIR$/ce.txt" charset="GBK" /> |
||||
|
</component> |
||||
|
</project> |
@ -0,0 +1,6 @@ |
|||||
|
<component name="InspectionProjectProfileManager"> |
||||
|
<settings> |
||||
|
<option name="USE_PROJECT_PROFILE" value="false" /> |
||||
|
<version value="1.0" /> |
||||
|
</settings> |
||||
|
</component> |
@ -0,0 +1,7 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project version="4"> |
||||
|
<component name="Black"> |
||||
|
<option name="sdkName" value="Python 3.9 (venv) (2)" /> |
||||
|
</component> |
||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (venv) (2)" project-jdk-type="Python SDK" /> |
||||
|
</project> |
@ -0,0 +1,8 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<project version="4"> |
||||
|
<component name="ProjectModuleManager"> |
||||
|
<modules> |
||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/python项目39.iml" filepath="$PROJECT_DIR$/.idea/python项目39.iml" /> |
||||
|
</modules> |
||||
|
</component> |
||||
|
</project> |
@ -0,0 +1,10 @@ |
|||||
|
<?xml version="1.0" encoding="UTF-8"?> |
||||
|
<module type="PYTHON_MODULE" version="4"> |
||||
|
<component name="NewModuleRootManager"> |
||||
|
<content url="file://$MODULE_DIR$"> |
||||
|
<excludeFolder url="file://$MODULE_DIR$/venv" /> |
||||
|
</content> |
||||
|
<orderEntry type="inheritedJdk" /> |
||||
|
<orderEntry type="sourceFolder" forTests="false" /> |
||||
|
</component> |
||||
|
</module> |
@ -0,0 +1,258 @@ |
|||||
|
#!/usr/bin/env python |
||||
|
# -*- coding: utf-8 -*- |
||||
|
# Created by Charles on 2018/10/10 |
||||
|
# Function: |
||||
|
|
||||
|
import sys |
||||
|
import requests |
||||
|
from bs4 import BeautifulSoup |
||||
|
|
||||
|
|
||||
|
ABSTRACT_MAX_LENGTH = 300 # abstract max length |
||||
|
|
||||
|
user_agents = [ |
||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', |
||||
|
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', |
||||
|
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)' |
||||
|
' Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36', |
||||
|
'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 ' |
||||
|
'(KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net', |
||||
|
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) ' |
||||
|
'Chrome/41.0.2228.0 Safari/537.36', |
||||
|
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, ' |
||||
|
'like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2', |
||||
|
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) ' |
||||
|
'Gecko/20070404 K-Ninja/2.1.3', |
||||
|
'Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; ' |
||||
|
'en-US) iNet Browser 4.7', |
||||
|
'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201', |
||||
|
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) ' |
||||
|
'Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866' |
||||
|
] |
||||
|
|
||||
|
# 请求头信息 |
||||
|
HEADERS = { |
||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", |
||||
|
"Content-Type": "application/x-www-form-urlencoded", |
||||
|
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', |
||||
|
"Referer": "https://www.baidu.com/", |
||||
|
"Accept-Encoding": "gzip, deflate", |
||||
|
"Accept-Language": "zh-CN,zh;q=0.9" |
||||
|
} |
||||
|
|
||||
|
baidu_host_url = "https://www.baidu.com" |
||||
|
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&wd=" |
||||
|
|
||||
|
session = requests.Session() |
||||
|
session.headers = HEADERS |
||||
|
|
||||
|
|
||||
|
def search(keyword, num_results=10, debug=0):
    """
    Search Baidu for *keyword* and collect up to *num_results* results.

    Pages through the result list by following the "next page" link that
    parse_html() returns, accumulating result dicts until enough results
    have been gathered or there are no more pages.

    :param keyword: query string; falsy input returns None immediately
    :param num_results: maximum number of result dicts to return
    :param debug: when truthy, print progress information per page
    :return: list of {"title", "abstract", "url", "rank"} dicts, or None
             when *keyword* is empty
    """
    if not keyword:
        return None

    list_result = []
    page = 1

    # Starting URL for the first result page.
    next_url = baidu_search_url + keyword

    # Walk result pages until we have enough results or run out of pages.
    while len(list_result) < num_results:
        # BUG FIX: the original never forwarded *debug*, so per-page
        # exception diagnostics inside parse_html were silently lost.
        data, next_url = parse_html(next_url, rank_start=len(list_result), debug=debug)
        if data:
            list_result += data
            if debug:
                print("---searching[{}], finish parsing page {}, results number={}: ".format(keyword, page, len(data)))
                for d in data:
                    print(str(d))

        if not next_url:
            if debug:
                print(u"already search the last page。")
            break
        page += 1

    if debug:
        print("\n---search [{}] finished. total results number={}!".format(keyword, len(list_result)))
    # Slicing never raises even when the list is shorter than num_results,
    # so the original length check was redundant.
    return list_result[:num_results]
||||
|
|
||||
|
|
||||
|
def parse_html(url, rank_start=0, debug=0):
    """
    Fetch one Baidu result page and extract its search results.

    :param url: result-page URL to fetch
    :param rank_start: rank offset of the first result on this page
                       (number of results already collected by the caller)
    :param debug: when truthy, print exception diagnostics
    :return: (list of result dicts, next-page URL) — next URL is None on
             the last page; (None, None) when fetching/parsing fails
    """
    try:
        res = session.get(url=url)
        res.encoding = "utf-8"
        root = BeautifulSoup(res.text, "lxml")

        list_data = []
        # All organic results live inside the div#content_left container.
        div_contents = root.find("div", id="content_left")
        for div in div_contents.contents:
            # Skip NavigableString children; only Tag nodes are results.
            if type(div) != type(div_contents):
                continue

            class_list = div.get("class", [])
            if not class_list:
                continue

            # Only "c-container" divs are actual result entries.
            if "c-container" not in class_list:
                continue

            title = ''
            url = ''
            abstract = ''
            try:
                # Extract title/url/abstract; the markup differs per result
                # template, hence the branching below.
                if "xpath-log" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        # No h3: first text line is the title.
                        title = div.text.strip().split("\n", 1)[0]
                        if div.a:
                            url = div.a['href'].strip()

                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        # Fallback: everything after the first line.
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                elif "result-op" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        # abstract = div.text.strip()
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                else:
                    # Template-driven results ("tpl" attribute).
                    if div.get("tpl", "") != "se_com_default":
                        if div.get("tpl", "") == "se_st_com_abstract":
                            if len(div.contents) >= 1:
                                title = div.h3.text.strip()
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                        else:
                            if len(div.contents) >= 2:
                                if div.h3:
                                    title = div.h3.text.strip()
                                    url = div.h3.a['href'].strip()
                                else:
                                    # NOTE(review): div.h3 is falsy here yet
                                    # div.h3.a is dereferenced — this path
                                    # raises and is swallowed by the except
                                    # below; confirm intended.
                                    title = div.contents[0].text.strip()
                                    url = div.h3.a['href'].strip()
                                # abstract = div.contents[-1].text
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                    else:
                        # Default result template.
                        if div.h3:
                            title = div.h3.text.strip()
                            url = div.h3.a['href'].strip()
                        else:
                            title = div.contents[0].text.strip()
                            url = div.h3.a['href'].strip()
                        if div.find("div", class_="c-abstract"):
                            abstract = div.find("div", class_="c-abstract").text.strip()
                        elif div.div:
                            abstract = div.div.text.strip()
                        else:
                            abstract = div.text.strip()
            except Exception as e:
                # Malformed entries are skipped; the page keeps parsing.
                if debug:
                    print("catch exception duration parsing page html, e={}".format(e))
                continue

            # Clamp overly long abstracts.
            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]

            rank_start+=1
            list_data.append({"title": title, "abstract": abstract, "url": url, "rank": rank_start})

        # Locate the pager link(s).
        next_btn = root.find_all("a", class_="n")

        # On the last page the only pager link is "previous page"
        # ("上一页"): return results without a next URL.
        if len(next_btn) <= 0 or u"上一页" in next_btn[-1].text:
            return list_data, None

        next_url = baidu_host_url + next_btn[-1]["href"]
        return list_data, next_url
    except Exception as e:
        if debug:
            print(u"catch exception duration parsing page html, e:{}".format(e))
        return None, None
||||
|
|
||||
|
|
||||
|
def run():
    """
    CLI entry point: read the keyword (and optional result count / debug
    flag) from sys.argv, or prompt interactively, then run the search and
    print a formatted result list.

    argv layout: run.py KEYWORD [NUM_RESULTS [DEBUG]]

    :return: None (prints to stdout)
    """
    default_keyword = u"长风破浪小武哥"
    num_results = 10
    debug = 0

    prompt = """
        baidusearch: not enough arguments
        [0]keyword: keyword what you want to search
        [1]num_results: number of results
        [2]debug: debug switch, 0-close, 1-open, default-0
        eg: baidusearch NBA
            baidusearch NBA 6
            baidusearch NBA 8 1
        """
    if len(sys.argv) > 3:
        keyword = sys.argv[1]
        try:
            num_results = int(sys.argv[2])
            debug = int(sys.argv[3])
        # BUG FIX: a bare "except:" also swallowed SystemExit and
        # KeyboardInterrupt; only malformed integers need handling here,
        # falling back to the defaults.
        except ValueError:
            pass
    elif len(sys.argv) > 1:
        keyword = sys.argv[1]
    else:
        # No arguments: show usage and ask interactively.
        print(prompt)
        keyword = input("please input keyword: ")
        # sys.exit(1)

    if not keyword:
        keyword = default_keyword

    print("---start search: [{}], expected number of results:[{}].".format(keyword, num_results))
    results = search(keyword, num_results=num_results, debug=debug)

    if isinstance(results, list):
        print("search results:(total[{}]items.)".format(len(results)))
        for res in results:
            print("{}. {}\n   {}\n   {}".format(res['rank'], res["title"], res["abstract"], res["url"]))
    else:
        print("start search: [{}] failed.".format(keyword))
||||
|
|
||||
|
|
||||
|
# Script entry point: run the CLI/interactive Baidu search.
if __name__ == '__main__':
    run()
@ -0,0 +1,64 @@ |
|||||
|
# Ad-hoc experiment script: query a locally served Qwen model through
# qwen_agent's Assistant, then load a text file with langchain.
from qwen_agent.agents import Assistant
# from qwen_agent.agents.doc_qa import ParallelDocQA

# LLM endpoint configuration for a locally hosted OpenAI-compatible server.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
    # NOTE(review): the commented-out line below contains what looks like a
    # real API key committed to source control — rotate/remove it.
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
prompt='''
请找是描述项目建设的章节名称
'''
# NOTE(review): the 'file' entry is empty — presumably a document path was
# meant to be supplied here; confirm before running.
messages = [{'role': 'user', 'content': [{'text': prompt}, {'file': ''}]}]
for rsp in bot.run(messages):
    print(rsp)
# messages = [{'role': 'user', 'content': [{'text':prompt}]}]
# runList=[]
# for rsp in bot.run(messages):
#     print(rsp)
import re
# from docx import Document
#
# document = Document('747991ddb29a49da903210959076bb9f.docx')
# # Read the docx content paragraph by paragraph.
# levelList = []
# words = []
# addStart = False
# levelText = ""
# i = 0
# for paragraph in document.paragraphs:
#     # Determine the heading level of this paragraph
#     # (isTitle() is a placeholder; see method described elsewhere).
#     text = paragraph.text
#     if text.strip():  # skip blank paragraphs
#         # print("non-empty")
#         words.append(text)
#         # level = isTitle(paragraph)
#         # if(addStart and level=="0"):
#         #     addStart=False
#         # if(level=="0" and text.find("详细设计方案")>=0):
#         #     addStart=True
#         # if level:
#         #     levelList.append("{}:".format(level)+paragraph.text)
#         #     levelText=text
#         # else:
#         #     if addStart:
#         #         if(text.startswith("图") or text.startswith("注:")):
#         #             continue
#         #         i=i+1
#         #         words.append("第{}个段落:".format(i)+text)
#
# # Join all paragraph text into one newline-separated string.
# print(len(words))
# text = '\n'.join(words)
# paragraphs = re.findall(r'.*?' + re.escape('宁波市') + r'.*?\n', text)
# print(paragraphs)
from langchain_community.document_loaders import TextLoader

# Load a previously extracted text file via langchain's TextLoader.
loader = TextLoader('checkRepeatText.txt')
docs = loader.load()
@ -0,0 +1,205 @@ |
|||||
|
# -*- coding:utf-8 -*- |
||||
|
import time |
||||
|
from docx import Document |
||||
|
from paddlenlp import Taskflow |
||||
|
from qwen_agent.agents import Assistant |
||||
|
import re |
||||
|
import json_repair |
||||
|
import math |
||||
|
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
||||
|
from docx.opc.oxml import parse_xml |
||||
|
|
||||
|
|
||||
|
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Build a |_SerializedRelationships| collection from *rels_item_xml*.

    Relationships whose target is the literal 'NULL' placeholder are
    skipped (these occur in malformed .docx packages and would otherwise
    break python-docx). Returns an empty collection when *rels_item_xml*
    is |None|.
    """
    collection = _SerializedRelationships()
    if rels_item_xml is None:
        return collection

    rels_root = parse_xml(rels_item_xml)
    # Keep only relationships with a usable target reference, in order.
    valid = [
        _SerializedRelationship(baseURI, rel)
        for rel in rels_root.Relationship_lst
        if rel.target_ref not in ('../NULL', 'NULL')
    ]
    collection._srels.extend(valid)
    return collection
||||
|
|
||||
|
|
||||
|
_SerializedRelationships.load_from_xml = load_from_xml_v2 |
||||
|
|
||||
|
|
||||
|
import logging |
||||
|
import logging.config |
||||
|
|
||||
|
log_config = { |
||||
|
'version': 1, |
||||
|
'disable_existing_loggers': False, |
||||
|
'formatters': { |
||||
|
'standard': { |
||||
|
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', |
||||
|
}, |
||||
|
}, |
||||
|
'handlers': { |
||||
|
'console': { |
||||
|
'class': 'logging.StreamHandler', |
||||
|
'formatter': 'standard', |
||||
|
'level': logging.INFO, |
||||
|
}, |
||||
|
'file': { |
||||
|
'class': 'logging.FileHandler', |
||||
|
'filename': 'Logger.log', |
||||
|
'formatter': 'standard', |
||||
|
'level': logging.INFO, |
||||
|
}, |
||||
|
}, |
||||
|
'loggers': { |
||||
|
'': { |
||||
|
'handlers': ['console', 'file'], |
||||
|
'level': logging.INFO, |
||||
|
'propagate': True, |
||||
|
}, |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logging.config.dictConfig(log_config) |
||||
|
|
||||
|
logger = logging.getLogger("checkCompanyName") |
||||
|
prompt = ''' |
||||
|
.根据上述文本判断,是否为具体的公司或组织名称,你可以使用工具利用互联网查询, |
||||
|
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校,行业类型,其他]选项中选择答案, |
||||
|
回答格式[{“companyName”:“名称”,"回答":"答案"},{“companyName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
||||
|
''' |
||||
|
llm_cfg = { |
||||
|
#'model': 'qwen1.5-72b-chat', |
||||
|
'model':"qwen2-72b", |
||||
|
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
||||
|
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
} |
||||
|
bot = Assistant(llm=llm_cfg, |
||||
|
name='Assistant', |
||||
|
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具" |
||||
|
) |
||||
|
|
||||
|
def getDocxToTextAll(name):
    """
    Extract all non-blank paragraph text from the .docx file *name* and
    write it, newline-separated, to ``checkCompanyName.txt`` (UTF-8).

    :param name: path of the .docx document to read
    :raises: whatever python-docx raises for an unreadable document
    """
    document = Document(name)
    # Collect every non-blank paragraph in document order.
    # (Removed dead locals from the original: levelList, addStart,
    # levelText, i — they were never used.)
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    # Join with newlines and persist for the downstream analysis steps.
    text = '\n'.join(words)
    with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
||||
|
def companyNameTask(text):
    """
    Generator: mine organization names from *text* with PaddleNLP
    knowledge mining, yielding progress strings and finally the
    de-duplicated list of names.

    :param text: full document text (newline-separated paragraphs)
    :yield: progress strings, then (last) a list of organization names
    """
    yield "文档公司或组织名称检查---启动中...."
    wordtag = Taskflow("knowledge_mining",device_id=0)
    batchNum=20  # sentences per Taskflow call
    sentences = re.split(r'[。\n]', text)
    # Drop blank fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Total number of sentences (variable name kept from original).
    total_chars = len(sentences)

    # Number of batches to process.
    num_chunks = math.ceil(total_chars / batchNum)

    # Split sentences into batches of batchNum.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"

        wenBen=".".join(chunk)
        try:
            res = wordtag(wenBen)
        except Exception as e:
            logging.warning(chunk)
            # NOTE(review): *e* is passed as a %-format argument with no
            # placeholder in the message — it will not be rendered;
            # should be logging.warning("...: %s", e).
            logging.warning("文档公司或组织名称检查---词类分析出错",e)
            continue
        isplace = False
        for zuhe in res[0]['items']:
            # If the previous token was an organization and this one is
            # too, merge it into the previous entry (multi-token names).
            zhi = zuhe.get("wordtag_label")
            if isplace:
                name = placeList[len(placeList) - 1]
                if zhi.find("组织机构类") >= 0:  # or zuhe[1] == "ns"
                    isplace = True
                    new_text = zuhe['item'].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
            if zhi.find("组织机构类") >= 0:
                isplace = True
                new_text = zuhe['item'].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    yield "文档公司或组织名称检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList=list(dict.fromkeys(placeList))
    yield placeList
||||
|
def checkCompanyName(filename):
    """
    Generator pipeline: extract text from *filename* (.docx), mine
    organization names, ask the LLM to classify each, and yield progress
    strings followed by a final HTML-ish markdown report.

    :param filename: path of the .docx document to check
    :yield: progress/status strings; the last yield is the report
    """
    yield f"文档公司或组织名称检查---开始处理文档..."
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        # Use the module logger (consistent with the sibling check
        # modules) instead of the root logger.
        logger.warning(e)
        yield "文档公司或组织名称检查---文档无法打开,请检查文档内容"
        return
    with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档公司或组织名称检查---开始解析文档..."
    # Robustness: initialize so a task that yields no list cannot leave
    # final_list undefined.
    final_list = []
    for item in companyNameTask(gettext):
        if isinstance(item, str):
            yield item
        else:
            final_list = item  # final mined-name list
    propnStr = ",".join(final_list)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档公司或组织名称检查---结果生成中..."
    cishu = 0  # animated-dots counter for progress messages
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
        yield "文档公司或组织名称检查---结果生成中" + '.' * cishu
        cishu += 1
    data = runList[len(runList) - 1][0]["content"]
    # The model may wrap JSON in backticks; strip before repair-parsing.
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = []
    for place in parsed_data:
        try:
            if place['回答'] == '非泛化的公司或组织名称':
                error_places.append(place)
        except Exception as e:
            logger.warning(place)
            # BUG FIX: the original passed *e* as a %-format argument
            # with no placeholder, which breaks log formatting.
            logger.warning("文档公司或组织名称检查---组织提出出错: %s", e)
            continue
    logger.info(error_places)
    returnInfo = "发现异常公司或组织名称<br>"
    if len(error_places) > 0:
        for t in error_places:
            keyword = t['companyName'].replace("\n", "")
            # Find the first paragraph containing the keyword.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            # BUG FIX: the original indexed paragraphs[0] unconditionally
            # and raised IndexError when the (LLM-normalised) name does
            # not occur verbatim in the text; skip such entries instead.
            if not paragraphs:
                continue
            t["yuanwen"] = paragraphs[0]
            yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
            returnInfo += "原文:" + yuanwen + "<br>异常公司或组织名称:**" + keyword + "**!请注意" + "<br>"
        logger.info(returnInfo)
        yield returnInfo
    else:
        yield "**未发现异常公司或组织名称**<br>"
@ -0,0 +1,220 @@ |
|||||
|
# -*- coding:utf-8 -*- |
||||
|
# from pycorrector import MacBertCorrector |
||||
|
# m = MacBertCorrector("shibing624/macbert4csc-base-chinese") |
||||
|
from qwen_agent.agents import Assistant |
||||
|
from docx import Document |
||||
|
from pprint import pprint |
||||
|
import re |
||||
|
from paddlenlp import Taskflow |
||||
|
import json |
||||
|
import time |
||||
|
import json_repair |
||||
|
import math |
||||
|
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
||||
|
from docx.opc.oxml import parse_xml |
||||
|
|
||||
|
import asyncio |
||||
|
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return a |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*; relationships that point
    at the 'NULL' placeholder target (seen in malformed .docx packages)
    are skipped. Returns an empty collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip placeholder targets that would crash python-docx.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
||||
|
|
||||
|
|
||||
|
_SerializedRelationships.load_from_xml = load_from_xml_v2 |
||||
|
import logging |
||||
|
import logging.config |
||||
|
|
||||
|
log_config = { |
||||
|
'version': 1, |
||||
|
'disable_existing_loggers': False, |
||||
|
'formatters': { |
||||
|
'standard': { |
||||
|
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', |
||||
|
}, |
||||
|
}, |
||||
|
'handlers': { |
||||
|
'console': { |
||||
|
'class': 'logging.StreamHandler', |
||||
|
'formatter': 'standard', |
||||
|
'level': logging.INFO, |
||||
|
}, |
||||
|
'file': { |
||||
|
'class': 'logging.FileHandler', |
||||
|
'filename': 'Logger.log', |
||||
|
'formatter': 'standard', |
||||
|
'level': logging.INFO, |
||||
|
}, |
||||
|
}, |
||||
|
'loggers': { |
||||
|
'': { |
||||
|
'handlers': ['console', 'file'], |
||||
|
'level': logging.INFO, |
||||
|
'propagate': True, |
||||
|
}, |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logging.config.dictConfig(log_config) |
||||
|
|
||||
|
logger = logging.getLogger("checkDocumentError") |
||||
|
llm_cfg = { |
||||
|
# 'model': 'qwen1.5-72b-chat', |
||||
|
'model': "qwen2-72b", |
||||
|
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
||||
|
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
} |
||||
|
bot = Assistant(llm=llm_cfg, |
||||
|
name='Assistant', |
||||
|
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' |
||||
|
|
||||
|
) |
||||
|
# prompt=''' |
||||
|
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案, |
||||
|
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答; |
||||
|
# ''' |
||||
|
prompt = ''' |
||||
|
请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出解析,没有错则不用给解析 |
||||
|
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}],不做过多的解释,严格按回答格式作答; |
||||
|
''' |
||||
|
|
||||
|
|
||||
|
def getDocxToTextAll(name):
    """
    Extract all non-blank paragraph text from the .docx file *name* and
    write it, newline-separated, to ``checkDocumentError.txt`` (UTF-8).

    :param name: path of the .docx document to read
    :raises: whatever python-docx raises for an unreadable document
    """
    document = Document(name)
    # Collect every non-blank paragraph in document order.
    # (Removed dead locals from the original: levelList, addStart,
    # levelText, i — they were never used.)
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    # Join with newlines and persist for the correction pass.
    text = '\n'.join(words)
    with open("checkDocumentError.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
||||
|
|
||||
|
|
||||
|
def getDocumentError(filename):
    """
    Generator pipeline: extract text from *filename* (.docx), run the
    typo-detection task, and yield progress strings followed by a final
    report of suspected typos.

    :param filename: path of the .docx document to check
    :yield: progress/status strings; the last yield is the report
    """
    yield f"文档纠错---开始处理文档..."
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        logger.warning(e)
        yield "文档无法打开,请检查文档内容"
        return
    with open("checkDocumentError.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档纠错---开始解析文档..."
    final_list = []
    for item in documentErrorTask(gettext):
        # documentErrorTask yields progress strings and, last, the list
        # of confirmed typo records.
        if isinstance(item, str):
            yield item
        else:
            final_list = item  # final typo list
    resInfo = "发现错别字<br>"
    if (len(final_list) > 0):
        # Each record carries the source sentence ("placeName") and the
        # model's explanation ("jianyi"), set by documentErrorTask.
        for i in final_list:
            yuanwen = i["placeName"].replace("\n", "")
            jianyi = i["jianyi"].replace("\n", "")
            resInfo += "原文:" + yuanwen + "<br>建议:**" + jianyi + "**<br>"
        yield resInfo
        logger.info(resInfo)
    else:
        yield "**未发现错别字**"
||||
|
|
||||
|
|
||||
|
def documentErrorTask(text):
    """
    Generator: run PaddleNLP text correction over *text* in batches,
    confirm each candidate typo with the LLM, and yield progress strings
    followed by the list of confirmed typo records.

    :param text: full document text (newline-separated paragraphs)
    :yield: progress strings, then (last) a list of typo dicts with
            keys "placeName" (source sentence) and "jianyi" (explanation)
    """
    yield "文档纠错---启动中...."
    corrector = Taskflow("text_correction", device_id=1)
    batchNum = 20  # sentences per correction batch
    sentences = re.split(r'[。\n]', text)
    # Drop blank fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Total number of sentences (variable name kept from original).
    total_chars = len(sentences)

    # Number of batches to process.
    num_chunks = math.ceil(total_chars / batchNum)

    # Split sentences into batches of batchNum.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    err = []  # accumulated confirmed typo records
    for i, chunk in enumerate(chunks):
        yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
        try:
            res = corrector(chunk)
        except Exception as e:
            logger.warning(chunk)
            # NOTE(review): *e* is passed as a %-format argument with no
            # placeholder — it will not be rendered; should be
            # logger.warning("...: %s", e).
            logger.warning("文档纠错--错别字识别出错\n", e)
            continue
        # Keep only sentences where the corrector flagged something.
        lines_with_greeting = [place for place in res if len(place['errors']) > 0]
        if len(lines_with_greeting) > 0:
            num = 0
            wenti = []  # numbered questions for the LLM
            keyword_list = []  # source sentences, indexed by question number
            for t in lines_with_greeting:
                temp_errorWords = []
                keyword = t['source']
                keyword_list.append(keyword)
                for item in t["errors"]:
                    for key, value in item['correction'].items():
                        temp_errorWords.append(key)
                wenti.append(
                    "{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
                num += 1
            words = "\n".join(wenti)
            messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
            runList = []
            yield f"文档纠错---内容解析中..."
            cishu = 0  # animated-dots counter for progress messages
            for rsp in bot.run(messages):
                runList.append(rsp)
                if cishu > 3:
                    cishu = 0
                yield "文档纠错---内容解析中" + '.' * cishu
                cishu += 1
            data = runList[len(runList) - 1][0]["content"]
            # Strip escapes/backticks before repair-parsing the JSON.
            parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
            resListerr = []
            for place in parsed_data:
                try:
                    if place['回答'] == '是':
                        # Map the question number back to its sentence.
                        place["placeName"] = keyword_list[int(place["placeName"])]
                        place["jianyi"] = place["解析"]
                        resListerr.append(place)
                except Exception as e:
                    logger.warning(parsed_data)
                    logger.warning(place)
                    # NOTE(review): same %-format misuse as above.
                    logger.warning("文档纠错--错别字提取出错\n", e)
                    continue
            if (len(resListerr) > 0):
                err.extend(resListerr)
    # NOTE(review): this message says "地名检查" (place-name check) — it
    # looks copy-pasted from checkPlaceName; confirm intended wording.
    yield "文档地名检查---文档解析完成"
    yield err
@ -0,0 +1,212 @@ |
|||||
|
from docx import Document |
||||
|
from paddlenlp import Taskflow |
||||
|
from pprint import pprint |
||||
|
from qwen_agent.agents import Assistant |
||||
|
import re |
||||
|
import json_repair |
||||
|
import time |
||||
|
import math |
||||
|
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
||||
|
from docx.opc.oxml import parse_xml |
||||
|
|
||||
|
|
||||
|
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Build a |_SerializedRelationships| collection from *rels_item_xml*,
    dropping relationships that target the 'NULL' placeholder (these
    appear in malformed .docx packages). Returns an empty collection
    when *rels_item_xml* is |None|.
    """
    result = _SerializedRelationships()
    if rels_item_xml is None:
        return result

    parsed = parse_xml(rels_item_xml)
    for relationship in parsed.Relationship_lst:
        target = relationship.target_ref
        # Placeholder targets would crash python-docx downstream.
        if target == '../NULL' or target == 'NULL':
            continue
        result._srels.append(_SerializedRelationship(baseURI, relationship))
    return result
||||
|
|
||||
|
|
||||
|
_SerializedRelationships.load_from_xml = load_from_xml_v2 |
||||
|
|
||||
|
|
||||
|
import logging |
||||
|
import logging.config |
||||
|
|
||||
|
log_config = { |
||||
|
'version': 1, |
||||
|
'disable_existing_loggers': False, |
||||
|
'formatters': { |
||||
|
'standard': { |
||||
|
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', |
||||
|
}, |
||||
|
}, |
||||
|
'handlers': { |
||||
|
'console': { |
||||
|
'class': 'logging.StreamHandler', |
||||
|
'formatter': 'standard', |
||||
|
'level': logging.INFO, |
||||
|
}, |
||||
|
'file': { |
||||
|
'class': 'logging.FileHandler', |
||||
|
'filename': 'Logger.log', |
||||
|
'formatter': 'standard', |
||||
|
'level': logging.INFO, |
||||
|
}, |
||||
|
}, |
||||
|
'loggers': { |
||||
|
'': { |
||||
|
'handlers': ['console', 'file'], |
||||
|
'level': logging.INFO, |
||||
|
'propagate': True, |
||||
|
}, |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
logging.config.dictConfig(log_config) |
||||
|
|
||||
|
logger = logging.getLogger("checkPlaceName") |
||||
|
|
||||
|
prompt=''' |
||||
|
.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
||||
|
不做过多的解释,严格按回答格式作答; |
||||
|
''' |
||||
|
# prompt=''' |
||||
|
# .请回答以上问题, |
||||
|
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答; |
||||
|
# 不做过多的解释,严格按回答格式作答; |
||||
|
# ''' |
||||
|
llm_cfg = { |
||||
|
#'model': 'qwen1.5-72b-chat', |
||||
|
'model':"qwen2-72b", |
||||
|
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
||||
|
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
} |
||||
|
bot = Assistant(llm=llm_cfg, |
||||
|
name='Assistant', |
||||
|
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' |
||||
|
) |
||||
|
#获取全文内容 |
||||
|
def getDocxToTextAll(docxPath):
    """
    Extract all non-blank paragraph text from the .docx file *docxPath*
    and write it, newline-separated, to ``checkPlaceName.txt`` (UTF-8).

    :param docxPath: path of the .docx document to read
    :raises: whatever python-docx raises for an unreadable document
    """
    document = Document(docxPath)
    # Collect every non-blank paragraph in document order.
    # (Removed dead locals from the original: levelList, addStart,
    # levelText, i — they were never used.)
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    # Join with newlines and persist for the place-name analysis pass.
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
||||
|
|
||||
|
# Extract place-name-like entities from the full text, batch by batch.
def placeNameTask(text):
    """Generator: yields progress strings, then finally the list of names.

    Splits *text* into sentences, runs PaddleNLP NER over ``batchNum``-sized
    sentence chunks, and merges consecutive place/organisation entities into
    single names.  Every yield except the last is a human-readable progress
    message; the last yield is the de-duplicated list of names.
    """
    yield "文档地名检查---启动中...."
    # PaddleNLP NER pipeline (runs on GPU device 2).
    tagTask = Taskflow("ner",device_id=2)
    batchNum=20  # sentences per NER call
    # Split on Chinese full stops and newlines.
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Total number of sentences (variable name says "chars" but this counts sentences).
    total_chars = len(sentences)

    # Number of chunks to process (used only for progress reporting).
    num_chunks = math.ceil(total_chars / batchNum)

    # Slice the sentence list into batchNum-sized chunks.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"

        wenBen=".".join(chunk)
        try:
            res = tagTask(wenBen)
        except Exception as e:
            logger.warning(chunk)
            # NOTE(review): logging misuse — the message has no %s placeholder,
            # so *e* is never rendered; prefer logger.warning("...: %s", e).
            logger.warning("文档地名检查---解析地名出错",e)
            continue
        isplace = False
        for zuhe in res:
            # If the previous token was a place and this token is too, append
            # its text to the previous entry instead of adding a new one.
            if isplace:
                name = placeList[len(placeList) - 1]
                if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:  # or zuhe[1] == "ns"
                    isplace = True
                    new_text = zuhe[0].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
            if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:
                isplace = True
                new_text = zuhe[0].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    # Announce completion before yielding the final result.
    yield "文档地名检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList=list(dict.fromkeys(placeList))
    yield placeList
||||
|
# Entry point: check all place names found in a .docx document.
def checkPlaceName(filename):
    """Generator: yields progress strings, then an HTML-ish result report.

    Pipeline: extract document text -> NER place names (placeNameTask) ->
    ask the LLM to judge each name -> report every name judged “错误”
    together with the sentence it appears in.
    """
    yield f"文档地名检查---开始处理文档..."  # progress message
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        logger.warning(e)
        yield "文档地名检查---文档无法打开,请检查文档内容"
        return
    # Re-read the plain text dumped by getDocxToTextAll.
    with open("checkPlaceName.txt", "r",encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档地名检查---开始解析文档..."  # progress message
    # propnList=placeNameTask(gettext)
    # placeNameTask yields progress strings and finally the name list.
    for item in placeNameTask(gettext):
        if isinstance(item, str):
            yield item
        else:
            final_list = item  # final value: the de-duplicated name list
    propnStr = ",".join(final_list)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档地名检查---结果生成中..."  # progress message
    cishu=0  # heartbeat counter driving the trailing progress dots
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu>3:
            cishu=0
        yield "文档地名检查---结果生成中"+'.'*cishu
        cishu+=1
    # The last streamed response holds the full LLM answer.
    data = runList[len(runList) - 1][0]["content"]
    # json_repair tolerates the slightly malformed JSON LLMs tend to emit.
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places=[]
    # Keep only the entries the model judged “错误”.
    for place in parsed_data:
        try:
            if place['回答'] == '错误':
                error_places.append(place)
        except Exception as e:
            logger.warning(place)
            # NOTE(review): no %s placeholder, so *e* is never rendered.
            logger.warning("文档地名检查---组织提出出错",e)
            continue
    logger.info(error_places)
    returnInfo = "发现异常地名<br>"
    if len(error_places)>0:
        for t in error_places:
            keyword= t['placeName'].replace("\n","")
            # Find the paragraph(s) containing the keyword.
            # NOTE(review): if the keyword only occurs on the final line with no
            # trailing newline, this regex matches nothing and paragraphs[0]
            # raises IndexError — confirm intended behaviour.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","")
            returnInfo+="原文:" + yuanwen + "<br>出现异常地名:**" + keyword + "**!请注意" + "<br>"
        yield returnInfo
        logger.info(returnInfo)
    else:
        yield "**未发现发现异常地名**"
@ -0,0 +1,292 @@ |
|||||
|
import uuid |
||||
|
from langchain_chroma import Chroma |
||||
|
from langchain_community.embeddings import DashScopeEmbeddings |
||||
|
from langchain_community.document_loaders import TextLoader |
||||
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
||||
|
from qwen_agent.agents import Assistant |
||||
|
import json_repair |
||||
|
from paddlenlp import Taskflow |
||||
|
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13") |
||||
|
device_id=0 |
||||
|
import re |
||||
|
import time |
||||
|
from docx import Document |
||||
|
import shutil |
||||
|
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
||||
|
from docx.opc.oxml import parse_xml |
||||
|
import logging |
||||
|
import logging.config |
||||
|
|
||||
|
# Logging configuration: INFO-level output to both the console and Logger.log.
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        # Root logger: send INFO records to both handlers.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
}

logging.config.dictConfig(log_config)

# Module-level logger for the text-similarity checking workflow.
logger = logging.getLogger("checkRepeatText")
||||
|
|
||||
|
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for relationship in parse_xml(rels_item_xml).Relationship_lst:
        # Skip the bogus NULL targets some document generators leave behind.
        if relationship.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, relationship))
    return serialized


# Monkey-patch python-docx so documents containing NULL relationships still open.
_SerializedRelationships.load_from_xml = load_from_xml_v2
||||
|
def getOutlineLevel(inputXml):
    """
    Extract the number from the first ``<w:outlineLvl w:val="number"/>``
    element found in *inputXml*.

    Parameters
    ----------
    inputXml : str
        Raw paragraph or style XML.

    Returns
    -------
    str
        The outline-level digits, as a string.

    Raises
    ------
    AttributeError
        If no digits are present in the element (re.search returns None).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape of the original (a
    # SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
||||
|
|
||||
|
|
||||
|
def isTitle(paragraph):
    """
    Determine whether *paragraph* carries an outline (heading) level.

    Returns None for blank lines and ordinary body text; otherwise the
    outline level as a string: "0" = level-1 heading, "1" = level-2,
    "2" = level-3, and so on.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None

    # Case 1: outline level set directly on the paragraph's own XML.
    paragraph_xml = paragraph._p.xml
    if '<w:outlineLvl' in paragraph_xml:
        return getOutlineLevel(paragraph_xml)

    # Case 2: outline level inherited through the style chain — walk the
    # style and its base styles until one declares a level.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style

    # No outline level found anywhere: plain body text.
    return None
||||
|
|
||||
|
# Locate the level-1 heading holding the "detailed design" chapter.
def findTitleName(docxPath):
    """Generator: yields a progress string, then either the matched level-1
    heading name or a fixed “not found” sentinel message.

    Collects every level-1 heading of *docxPath* (skipping 附件/appendix
    headings) and asks the LLM which heading is the detailed-design chapter.
    """
    yield '文档相似性检查----检查是否存在详细设计方案'
    document = Document(docxPath)
    # Walk the document paragraph by paragraph, counting heading levels.
    titleWords=[]
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        # Determine the heading level of this paragraph (see isTitle).
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            level = isTitle(paragraph)
            if level=="0":
                firstTitle+=1
                secondTitle = 0
                # Appendix headings are excluded from the outline.
                if(text.find("附件")>=0):
                    continue
                # NOTE(review): "一级标题:".format(firstTitle) has no {} placeholder,
                # so the chapter counter is never rendered — likely meant an f-string.
                titleWords.append("一级标题:".format(firstTitle)+text)
            elif level=="1":
                secondTitle+=1
                sanjiTitle=0
                # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
                # titleWords.append("第{}章的二级标题:".format(firstTitle,firstTitle,secondTitle)+text)
            elif level=="2":
                sanjiTitle += 1
                # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
                # titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text)
    # Local LLM (OpenAI-compatible endpoint) used only for this lookup.
    findTitleName_llm_cfg = {
        #'model': 'qwen1.5-72b-chat',
        'model':"qwen2-72b",
        'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
        # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
                                  name='Assistant',
                                  # system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题'
                                  )
    # Prompt: given the outline, pick the heading most related to a detailed
    # design/solution chapter and answer in a strict JSON form.
    prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择
类似设计方案,服务方案,建设方案为次相关,次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答
'''
    # print("\n".join(titleWords)+prompt)
    messages = [({'role': 'user', 'content': "\n".join(titleWords)+prompt})]
    runList=[]
    for rsp in findTitleName_bot.run(messages):
        runList.append(rsp)
    # The last streamed response holds the full LLM answer.
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    logger.info(parsed_data)
    # Yield the heading name when found, else the fixed sentinel string the
    # caller (checkRepeatText) compares against.
    if(parsed_data["answer"]=="存在"):
        yield parsed_data["name"]
    else:
        yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
||||
|
# Collect all content under the "detailed design" chapter and index it.
def getDocxToText(docxPath,titleName,vector_store_path):
    """Extract the *titleName* chapter of *docxPath*, split it, and load it
    into a Chroma vector store.

    Returns
    -------
    tuple
        ``(words, uuids, vectorstore)`` — the extracted "heading:sentence"
        strings, the ids of the inserted chunks, and the populated store.

    Raises
    ------
    Exception
        If no content was collected for the chapter.
    """
    document = Document(docxPath)
    # Walk the document paragraph by paragraph.
    levelList=[]
    words=[]
    addStart = False  # True while inside the target chapter
    levelText=""      # label of the current heading, e.g. "2级标题-xxx"
    i = 0
    for paragraph in document.paragraphs:
        # Determine the heading level of this paragraph (see isTitle).
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            if titleName:
                level = isTitle(paragraph)
                # Leaving the chapter: the next level-1 heading stops collection.
                if(addStart and level=="0"):
                    addStart=False
                # Entering the chapter: a level-1 heading matching titleName.
                if(level=="0" and (titleName.find(text)>=0 or text.find(titleName)>=0)):
                    addStart=True
                if level:
                    levelList.append("{}:".format(level)+paragraph.text)
                    levelText=f"{int(level)+1}级标题-"+text
                else:
                    if addStart:
                        # Skip figure captions and notes.
                        if(text.startswith("图") or text.startswith("注:")):
                            continue
                        # Keep only substantial sentences (> 30 chars).
                        if(len(text)>30):
                            i=i+1
                            words.append("{}:".format(levelText)+text)
    # Fail loudly if the chapter produced no content.
    if len(words)==0:
        raise Exception("checkRepeatText,获取长度为0")
    text = '\n'.join(words)

    # Dump the chapter text to a work file for the text splitter.
    # NOTE(review): opened without encoding=..., so the platform default
    # encoding is used — confirm this matches how TextLoader reads it back,
    # especially on Windows.
    with open("checkRepeatText.txt", 'w', ) as txt_file:
        txt_file.write(text)
    time.sleep(3)
    loader = TextLoader(file_path='checkRepeatText.txt')
    docs = loader.load()
    # print(docs)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10, add_start_index=True,
                                                   separators=["\n\n", "\n"])

    splits = text_splitter.split_documents(docs)
    # One fresh UUID per chunk, used as the Chroma document ids.
    uuids = []
    for i in range(len(splits)):
        uuids.append(str(uuid.uuid4()))
    logging.info(f"checkRepeatTextuuidLen{len(uuids)}")

    vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
    vectorstore.add_documents(documents=splits, ids=uuids)
    # Poll until the store answers queries (inserts may lag behind).
    while True:
        time.sleep(0.3)
        ress = vectorstore.similarity_search(words[0])
        if (len(ress) > 0):
            break
    return words,uuids,vectorstore
||||
|
|
||||
|
|
||||
|
# @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename):
    """Generator: similarity check over the detailed-design chapter.

    Yields progress strings while indexing the chapter, comparing every
    sentence against its nearest vector-store neighbours with a PaddleNLP
    text-similarity model, and finally yields a report of all sentence
    pairs whose similarity exceeds 0.90.
    """
    yield "文档相似性检查---启动中...."
    # One throw-away Chroma directory per run.
    vector_store_path="vector_store"+str(uuid.uuid4())
    for titleName in findTitleName(filename):
        yield titleName
        # findTitleName yields this sentinel string when no chapter matched.
        if(titleName!="文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"):
            try:
                yield "文档相似性检查----文档内容解析中"
                words,uuids,vectorstore=getDocxToText(filename,titleName,vector_store_path)
            except Exception as e:
                yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败"
                return
            global device_id
            # PaddleNLP similarity model (GPU device 3).
            similarity = Taskflow("text_similarity",device_id=3)
            # device_id+=1
            # if(device_id>1):
            #     device_id=0
            reslist = []
            count = 0
            for i in words:
                count += 1
                yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}"
                result = vectorstore.similarity_search(i)
                textTag = i.split(":")[0]  # the "heading" part of "heading:sentence"
                for content in result:
                    text = content.page_content
                    tag = text.split(":")[0].replace('\n', '')
                    # Skip neighbours from the same heading.
                    if (textTag.find(tag) >= 0):
                        continue
                    try:
                        # Compare only the sentence parts (text after the first ":").
                        res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]])
                    except Exception as e:
                        # NOTE(review): no %s placeholder, so *e* is not rendered;
                        # also on failure *res* below is stale from the previous
                        # iteration (or undefined on the first) — confirm intent.
                        logger.warning("文档相似性检查--发生异常:",e)
                        logger.warning(i)
                        logger.warning(text)
                    if (res[0]["similarity"] > 0.90):
                        # Avoid recording the same source sentence twice.
                        if (len(reslist) > 0):
                            isExist = False
                            for neirong in reslist:
                                if i in neirong.values():
                                    isExist = True
                                    break
                            if not isExist:
                                # reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:],"similarity":res[0]["similarity"]})
                                reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
                        else:
                            reslist.append({"yuanwen1":i.replace("\n",""),"yuanwen2":text.replace("\n",""),"similarity":res[0]["similarity"]})
                        # print(i.split(":")[1] + "\n" + text.split(":")[1])
            # vectorstore.delete(ids=uuids)
            # Drop the per-run vector store directory.
            shutil.rmtree(vector_store_path)
            logger.info("已删除")
            logger.info(reslist)
            resInfo=f"对{titleName}章节,发现相似内容:<br>"
            if(len(reslist)>0):
                for res in reslist:
                    resInfo+="【在**"+res["yuanwen1"][:res["yuanwen1"].find(':')]+"**下包含:"+res["yuanwen1"][res["yuanwen1"].find(':') + 1:]+"<br>在**"+res["yuanwen2"][:res["yuanwen2"].find(':')]+"**下包含:"+res["yuanwen2"][res["yuanwen2"].find(':') + 1:]+"<br>以上两段内容***相似度***:"+'{:.2f}'.format(res['similarity'])+"】<br>"
                yield resInfo
                logger.info(resInfo)
            else:
                yield "未发现相似内容"
@ -0,0 +1,173 @@ |
|||||
|
from docx import Document |
||||
|
from pprint import pprint |
||||
|
from qwen_agent.agents import Assistant |
||||
|
import re |
||||
|
import json_repair |
||||
|
import math |
||||
|
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
||||
|
from docx.opc.oxml import parse_xml |
||||
|
|
||||
|
|
||||
|
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for relationship in parse_xml(rels_item_xml).Relationship_lst:
        # Skip the bogus NULL targets some document generators leave behind.
        if relationship.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, relationship))
    return serialized


# Monkey-patch python-docx so documents containing NULL relationships still open.
_SerializedRelationships.load_from_xml = load_from_xml_v2
||||
|
import logging |
||||
|
import logging.config |
||||
|
|
||||
|
# Logging configuration: INFO-level output to both the console and Logger.log.
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        # Root logger: send INFO records to both handlers.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
}

logging.config.dictConfig(log_config)

# Module-level logger for this checking workflow.
logger = logging.getLogger("checkCompanyName")
# DashScope-hosted qwen2-72b-instruct used for the structure matching.
# NOTE(review): hard-coded API key committed to source — rotate it and move
# it to an environment variable.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope',  # base_url, also known as api_base
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                )
||||
|
|
||||
|
|
||||
|
def getOutlineLevel(inputXml):
    """
    Extract the number from the first ``<w:outlineLvl w:val="number"/>``
    element found in *inputXml*.

    Parameters
    ----------
    inputXml : str
        Raw paragraph or style XML.

    Returns
    -------
    str
        The outline-level digits, as a string.

    Raises
    ------
    AttributeError
        If no digits are present in the element (re.search returns None).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape of the original (a
    # SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
||||
|
|
||||
|
|
||||
|
def isTitle(paragraph):
    """
    Determine whether *paragraph* carries an outline (heading) level.

    Returns None for blank lines and ordinary body text; otherwise the
    outline level as a string: "0" = level-1 heading, "1" = level-2,
    "2" = level-3, and so on.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None

    # Case 1: outline level set directly on the paragraph's own XML.
    paragraph_xml = paragraph._p.xml
    if '<w:outlineLvl' in paragraph_xml:
        return getOutlineLevel(paragraph_xml)

    # Case 2: outline level inherited through the style chain — walk the
    # style and its base styles until one declares a level.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style

    # No outline level found anywhere: plain body text.
    return None
||||
|
|
||||
|
# Collect every level-1 heading of a .docx document.
def getDocxToTitleName(docxPath):
    """Return the text of every level-1 heading (outline level "0") in
    *docxPath*, in document order.

    Parameters
    ----------
    docxPath : str
        Path of the .docx file to read.

    Returns
    -------
    list[str]
        Level-1 heading texts.
    """
    document = Document(docxPath)
    # Keep only non-empty paragraphs whose outline level is "0".
    # (The original kept several unused locals — levelList, addStart,
    # levelText, i — which are dropped here.)
    return [
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.text.strip() and isTitle(paragraph) == "0"
    ]
||||
|
|
||||
|
def checkTitleName(filename):
    """Generator: compare the document's level-1 headings with the template.

    Reads the expected headings from ``ce模板.txt``, asks the LLM whether
    each expected heading can be matched against the document's actual
    level-1 headings, and finally yields a report of all unmatched ones.
    """

    yield '文档结构检查----启动中'
    # Expected headings, one per line.
    with open("ce模板.txt", "r",encoding='utf-8') as f:
        gettext = f.readlines()
    count=0
    reserr = []  # template headings with no match in the document
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
        print(e)
        yield "文档无法打开,请检查文档内容"
        return
    for text in gettext:
        count+=1
        prompt = f'''
\n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
'''
        xushang="回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        strword = "\n".join(word)+prompt+xushang
        # print(strword)
        messages = [{'role': 'user', 'content': [{'text':strword}]}]
        runList = []
        cishu = 0
        for rsp in bot.run(messages):
            runList.append(rsp)
            # print(rsp)
        # The last streamed response holds the full LLM answer.
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        print(parsed_data)
        if(parsed_data["answer"]=="不存在"):
            reserr.append(text)
    resInfo="文档结构存在异常:<br>"
    if(len(reserr)>0):
        for i in reserr:
            resInfo+="**"+i.replace('\n','')+"**<br>"
        logger.info(resInfo)
        yield resInfo
    else:
        yield "文档结构未发现异常"
@ -0,0 +1,176 @@ |
|||||
|
from docx import Document |
||||
|
from pprint import pprint |
||||
|
from qwen_agent.agents import Assistant |
||||
|
import re |
||||
|
import json_repair |
||||
|
import math |
||||
|
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
||||
|
from docx.opc.oxml import parse_xml |
||||
|
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    serialized = _SerializedRelationships()
    if rels_item_xml is None:
        return serialized
    for relationship in parse_xml(rels_item_xml).Relationship_lst:
        # Skip the bogus NULL targets some document generators leave behind.
        if relationship.target_ref in ('../NULL', 'NULL'):
            continue
        serialized._srels.append(_SerializedRelationship(baseURI, relationship))
    return serialized


# Monkey-patch python-docx so documents containing NULL relationships still open.
_SerializedRelationships.load_from_xml = load_from_xml_v2
||||
|
# DashScope-hosted qwen2-72b-instruct used for the structure matching.
# NOTE(review): hard-coded API key committed to source — rotate it and move
# it to an environment variable.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope',  # base_url, also known as api_base
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                )
||||
|
|
||||
|
|
||||
|
def getOutlineLevel(inputXml):
    """
    Extract the number from the first ``<w:outlineLvl w:val="number"/>``
    element found in *inputXml*.

    Parameters
    ----------
    inputXml : str
        Raw paragraph or style XML.

    Returns
    -------
    str
        The outline-level digits, as a string.

    Raises
    ------
    AttributeError
        If no digits are present in the element (re.search returns None).
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid "\d" escape of the original (a
    # SyntaxWarning on Python 3.12+).
    number = re.search(r"\d+", number).group()
    return number
||||
|
|
||||
|
|
||||
|
def isTitle(paragraph):
    """
    Determine whether *paragraph* carries an outline (heading) level.

    Returns None for blank lines and ordinary body text; otherwise the
    outline level as a string: "0" = level-1 heading, "1" = level-2,
    "2" = level-3, and so on.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None

    # Case 1: outline level set directly on the paragraph's own XML.
    paragraph_xml = paragraph._p.xml
    if '<w:outlineLvl' in paragraph_xml:
        return getOutlineLevel(paragraph_xml)

    # Case 2: outline level inherited through the style chain — walk the
    # style and its base styles until one declares a level.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style

    # No outline level found anywhere: plain body text.
    return None
||||
|
|
||||
|
# Collect every level-1 heading of a .docx document.
def getDocxToTitleName(docxPath):
    """Return the text of every level-1 heading (outline level "0") in
    *docxPath*, in document order.

    Parameters
    ----------
    docxPath : str
        Path of the .docx file to read.

    Returns
    -------
    list[str]
        Level-1 heading texts.
    """
    document = Document(docxPath)
    # Keep only non-empty paragraphs whose outline level is "0".
    # (The original kept several unused locals — levelList, addStart,
    # levelText, i — which are dropped here.)
    return [
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.text.strip() and isTitle(paragraph) == "0"
    ]
||||
|
|
||||
|
def checkTitleName(filename):
    """Generator: compare the document's level-1 headings with the template.

    Reads the expected headings from ``ce模板.txt``, asks the LLM whether
    each expected heading can be matched against the document's actual
    level-1 headings, and finally yields a report of all unmatched ones.

    Fix: the previous body was a non-functional stub that referenced
    undefined names (``text``, ``count``, ``gettext``, ``word``) and raised
    NameError on first use; this restores the working implementation that
    was left commented out below it.
    """
    yield '文档结构检查----启动中'
    # Expected headings, one per line.
    with open("ce模板.txt", "r", encoding='utf-8') as f:
        gettext = f.readlines()
    count = 0
    reserr = []  # template headings with no match in the document
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
        print(e)
        yield "文档无法打开,请检查文档内容"
        return
    for text in gettext:
        count += 1
        prompt = f'''
\n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
'''
        xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        strword = "\n".join(word) + prompt + xushang
        messages = [{'role': 'user', 'content': [{'text': strword}]}]
        runList = []
        for rsp in bot.run(messages):
            runList.append(rsp)
        # The last streamed response holds the full LLM answer.
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        if (parsed_data["answer"] == "不存在"):
            reserr.append(text)
    resInfo = "文档结构存在异常:<br>"
    if (len(reserr) > 0):
        for i in reserr:
            resInfo += f"**{i}**<br>"
        yield resInfo
    else:
        yield "文档结构未发现异常"
||||
|
|
||||
|
|
||||
|
import logging |
||||
|
|
||||
|
# Create a standalone demo logger.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.DEBUG)

# Console handler at DEBUG level.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Attach a formatter to the handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

# Register the handler on the logger.
logger.addHandler(ch)
try:
    # Emit one message at every level (smoke test of the logger setup).
    logger.debug('这是一个调试消息')
    logger.info('这是一个信息消息')
    logger.warning('这是一个警告消息')
    logger.error('这是一个错误消息')
    logger.critical('这是一个致命错误消息')
except Exception as e:
    logger.warning(e)
@ -0,0 +1,712 @@ |
|||||
|
""" |
||||
|
This module will parse the JSON file following the BNF definition: |
||||
|
|
||||
|
<json> ::= <container> |
||||
|
|
||||
|
<primitive> ::= <number> | <string> | <boolean> |
||||
|
; Where: |
||||
|
; <number> is a valid real number expressed in one of a number of given formats |
||||
|
; <string> is a string of valid characters enclosed in quotes |
||||
|
; <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted) |
||||
|
|
||||
|
<container> ::= <object> | <array> |
||||
|
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas |
||||
|
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members' |
||||
|
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value |
||||
|
|
||||
|
If something is wrong (a missing parantheses or quotes for example) it will use a few simple heuristics to fix the JSON string: |
||||
|
- Add the missing parentheses if the parser believes that the array or object should be closed |
||||
|
- Quote strings or add missing single quotes |
||||
|
- Adjust whitespaces and remove line breaks |
||||
|
|
||||
|
All supported use cases are in the unit tests |
||||
|
""" |
||||
|
|
||||
|
import os |
||||
|
import json |
||||
|
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal |
||||
|
|
||||
|
|
||||
|
class StringFileWrapper:
    # Adapter that lets string-indexing code operate on a file descriptor.
    def __init__(self, fd: TextIO) -> None:
        self.fd = fd
        self.length: int = 0  # cached file length, computed lazily

    def __getitem__(self, index: Union[int, slice]) -> str:
        # Slices read the requested span and restore the position to the
        # slice start; single indices read one character at that offset.
        if not isinstance(index, slice):
            self.fd.seek(index)
            return self.fd.read(1)
        self.fd.seek(index.start)
        chunk = self.fd.read(index.stop - index.start)
        self.fd.seek(index.start)
        return chunk

    def __len__(self) -> int:
        # Compute the length once by seeking to EOF, then restore the position.
        if self.length < 1:
            pos = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(pos)
        return self.length
||||
|
|
||||
|
|
||||
|
class LoggerConfig:
    # Plain data holder for the parser's logging state.
    def __init__(self, log_level: Optional[str]):
        self.log: List[Dict[str, str]] = []   # accumulated log entries
        self.window: int = 10                 # context window size for log excerpts
        self.log_level: str = log_level or "none"
||||
|
|
||||
|
|
||||
|
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None] |
||||
|
|
||||
|
|
||||
|
class JSONParser: |
||||
|
    def __init__(
        self,
        json_str: Union[str, StringFileWrapper],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
    ) -> None:
        """Set up parser state over a string or an open file descriptor."""
        # The string to parse.
        self.json_str = json_str
        # Alternatively, a file descriptor containing the JSON document.
        if json_fd:
            # Wrap the file so the rest of the parser can index it like a string.
            self.json_str = StringFileWrapper(json_fd)
        # Index of the character currently being examined.
        self.index: int = 0
        # Parsing-context stack (e.g. "object_key"/"object_value"), used to
        # manage the special cases of missing quotes in keys or values.
        self.context: list[str] = []
        # Record parser activity only when logging is requested.
        self.logger = LoggerConfig(log_level="info" if logging else None)
||||
|
|
||||
|
    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """Parse the whole input.

        Returns the parsed value; when logging is enabled, returns a
        ``(value, log)`` tuple instead.  If extra top-level elements follow
        the first value, all of them are collected into a list.
        """
        json = self.parse_json()
        if self.index < len(self.json_str):
            # Leftover input: keep parsing and gather every additional
            # top-level element into a list.
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            json = [json]
            last_index = self.index
            while self.index < len(self.json_str):
                j = self.parse_json()
                if j != "":
                    json.append(j)
                if self.index == last_index:
                    # No progress was made; skip one character to avoid an
                    # infinite loop.
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array.
            if len(json) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                json = json[0]
        if self.logger.log_level == "none":
            return json
        else:
            return json, self.logger.log
||||
|
|
||||
|
    def parse_json(
        self,
    ) -> JSONReturnType:
        """Dispatch on the next significant character and parse one JSON value."""
        while True:
            char = self.get_char_at()
            # This parser ignores any basic element (string or number) that is
            # not inside an array or object.
            is_in_context = len(self.context) > 0
            # False means we reached the end of the provided input.
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # Edge case: an empty key at the end of an object, like `"key": }`.
            # Return an empty string so the object closes properly.
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote — or any letter, to tolerate
            # missing quotes.
            elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
                return self.parse_string()
            # <number> starts with a digit, minus, or dot.
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # Anything else is skipped.
            else:
                self.index += 1
||||
|
|
||||
|
def parse_object(self) -> Dict[str, Any]:
    """Parse a (possibly malformed) JSON object starting just after '{'.

    Tolerates stray ':' before keys, missing ':' after keys, missing values
    at the end, and stray quotes/commas between members.
    """
    # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
    obj = {}
    # Stop when you either find the closing parentheses or you have iterated over the entire string
    while (self.get_char_at() or "}") != "}":
        # This is what we expect to find:
        # <member> ::= <string> ': ' <json>

        # Skip filler whitespaces
        self.skip_whitespaces_at()

        # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
        if (self.get_char_at() or "") == ":":
            self.log(
                "While parsing an object we found a : before a key, ignoring",
                "info",
            )
            self.index += 1

        # We are now searching for they string key
        # Context is used in the string parser to manage the lack of quotes
        self.set_context("object_key")

        self.skip_whitespaces_at()

        # <member> starts with a <string>
        key = ""
        while self.get_char_at():
            key = str(self.parse_string())

            if key != "" or (key == "" and self.get_char_at() == ":"):
                # If the string is empty but there is a object divider, we are done here
                break

        self.skip_whitespaces_at()

        # We reached the end here: a key with no value closes the object
        if (self.get_char_at() or "}") == "}":
            continue

        self.skip_whitespaces_at()

        # An extreme case of missing ":" after a key
        if (self.get_char_at() or "") != ":":
            self.log(
                "While parsing an object we missed a : after a key",
                "info",
            )

        # Consume the ':' (or whatever character sat in its place)
        self.index += 1
        self.reset_context()
        self.set_context("object_value")
        # The value can be any valid json
        value = self.parse_json()

        # Reset context since our job is done
        self.reset_context()
        obj[key] = value

        # Swallow a separating comma or a stray trailing quote
        if (self.get_char_at() or "") in [",", "'", '"']:
            self.index += 1

        # Remove trailing spaces
        self.skip_whitespaces_at()

    # Consume the closing '}'
    self.index += 1
    return obj
||||
|
|
||||
|
def parse_array(self) -> List[Any]:
    """Parse a (possibly malformed) JSON array starting just after '['.

    Tolerates stray '...' ellipsis elements and a missing closing ']'.
    """
    # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
    arr = []
    self.set_context("array")
    # Stop when you either find the closing parentheses or you have iterated over the entire string
    while (self.get_char_at() or "]") != "]":
        self.skip_whitespaces_at()
        value = self.parse_json()

        # It is possible that parse_json() returns nothing valid, so we stop
        if value == "":
            break

        # LLMs sometimes emit a literal '...' placeholder element; drop it
        if value == "..." and self.get_char_at(-1) == ".":
            self.log(
                "While parsing an array, found a stray '...'; ignoring it", "info"
            )
        else:
            arr.append(value)

        # skip over whitespace after a value but before closing ]
        char = self.get_char_at()
        while char and (char.isspace() or char == ","):
            self.index += 1
            char = self.get_char_at()

    # Especially at the end of an LLM generated json you might miss the last "]"
    char = self.get_char_at()
    if char and char != "]":
        self.log(
            "While parsing an array we missed the closing ], adding it back", "info"
        )
        # Step back so the final `self.index += 1` below leaves the cursor in place,
        # i.e. we pretend the ']' was there.
        self.index -= 1

    # Consume the (real or synthetic) closing ']'
    self.index += 1
    self.reset_context()
    return arr
||||
|
|
||||
|
def parse_string(self) -> Union[str, bool, None]:
    """Parse a string (or bare literal) at the cursor, repairing broken quoting.

    Handles: missing opening/closing quotes, doubled quotes, smart quotes
    (“ ”), stray escape sequences, and quotes that appear inside unquoted
    values. May return a bool/None when the "string" turns out to be a
    true/false/null literal. The repair heuristics below are extremely
    order-sensitive; modify with care.
    """
    # <string> is a string of valid characters enclosed in quotes
    # i.e. { name: "John" }
    # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here

    # Flag to manage corner cases related to missing starting quote
    missing_quotes = False
    doubled_quotes = False
    lstring_delimiter = rstring_delimiter = '"'

    char = self.get_char_at()
    # A valid string can only start with a valid quote or, in our case, with a literal
    while char and char not in ['"', "'", "“"] and not char.isalnum():
        self.index += 1
        char = self.get_char_at()

    if not char:
        # This is an empty string
        return ""

    # Ensuring we use the right delimiter
    if char == "'":
        lstring_delimiter = rstring_delimiter = "'"
    elif char == "“":
        # Smart quotes: opening and closing delimiters differ
        lstring_delimiter = "“"
        rstring_delimiter = "”"
    elif char.isalnum():
        # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
        # But remember, object keys are only of type string
        if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
            value = self.parse_boolean_or_null()
            if value != "":
                return value
        self.log(
            "While parsing a string, we found a literal instead of a quote",
            "info",
        )
        self.log(
            "While parsing a string, we found no starting quote. Will add the quote back",
            "info",
        )
        missing_quotes = True

    if not missing_quotes:
        # Consume the opening quote
        self.index += 1

    # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
    if self.get_char_at() == lstring_delimiter:
        # If it's an empty key, this was easy
        if self.get_context() == "object_key" and self.get_char_at(1) == ":":
            self.index += 1
            return ""
        # Find the next delimiter
        i = 1
        next_c = self.get_char_at(i)
        while next_c and next_c != rstring_delimiter:
            i += 1
            next_c = self.get_char_at(i)
        # Now check that the next character is also a delimiter to ensure that we have ""....""
        # In that case we ignore this rstring delimiter
        if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
            self.log(
                "While parsing a string, we found a valid starting doubled quote, ignoring it",
                "info",
            )
            doubled_quotes = True
            self.index += 1
        else:
            # Ok this is not a doubled quote, check if this is an empty string or not
            i = 1
            next_c = self.get_char_at(i)
            while next_c and next_c.isspace():
                i += 1
                next_c = self.get_char_at(i)
            if next_c not in [",", "]", "}"]:
                self.log(
                    "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                    "info",
                )
                self.index += 1

    # Initialize our return value
    string_acc = ""

    # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
    # In that case we need to use the ":|,|}" characters as terminators of the string
    # So this will stop if:
    # * It finds a closing quote
    # * It iterated over the entire sequence
    # * If we are fixing missing quotes in an object, when it finds the special terminators
    char = self.get_char_at()
    while char and char != rstring_delimiter:
        if missing_quotes:
            if self.get_context() == "object_key" and (
                char == ":" or char.isspace()
            ):
                self.log(
                    "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                    "info",
                )
                break
            elif self.get_context() == "object_value" and char in [",", "}"]:
                rstring_delimiter_missing = True
                # check if this is a case in which the closing comma is NOT missing instead
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c != rstring_delimiter:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    i += 1
                    next_c = self.get_char_at(i)
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c and next_c in [",", "}"]:
                        rstring_delimiter_missing = False
                if rstring_delimiter_missing:
                    self.log(
                        "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                        "info",
                    )
                    break
        string_acc += char
        self.index += 1
        char = self.get_char_at()
        if char and len(string_acc) > 0 and string_acc[-1] == "\\":
            # This is a special case, if people use real strings this might happen
            self.log("Found a stray escape sequence, normalizing it", "info")
            string_acc = string_acc[:-1]
            if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                # Translate recognized escape characters into their real values
                escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                string_acc += escape_seqs.get(char, char) or char
                self.index += 1
                char = self.get_char_at()
        # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
        if char == rstring_delimiter:
            # Special case here, in case of double quotes one after another
            if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a doubled quote, ignoring it",
                    "info",
                )
                self.index += 1
            elif missing_quotes and self.get_context() == "object_value":
                # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    # We found a quote, now let's make sure there's a ":" following
                    i += 1
                    next_c = self.get_char_at(i)
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c and next_c == ":":
                        # Reset the cursor
                        self.index -= 1
                        char = self.get_char_at()
                        self.log(
                            "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                            "info",
                        )
                        break
            else:
                # Check if eventually there is a rstring delimiter, otherwise we bail
                i = 1
                next_c = self.get_char_at(i)
                check_comma_in_object_value = True
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                    # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                    if check_comma_in_object_value and next_c.isalpha():
                        check_comma_in_object_value = False
                    # If we are in an object context, let's check for the right delimiters
                    if (
                        ("object_key" in self.context and next_c in [":", "}"])
                        or ("object_value" in self.context and next_c == "}")
                        or ("array" in self.context and next_c in ["]", ","])
                        or (
                            check_comma_in_object_value
                            and self.get_context() == "object_value"
                            and next_c == ","
                        )
                    ):
                        break
                    i += 1
                    next_c = self.get_char_at(i)
                # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                if next_c == "," and self.get_context() == "object_value":
                    i += 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c != rstring_delimiter:
                        i += 1
                        next_c = self.get_char_at(i)
                    # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                    i += 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c == "}":
                        # OK this is valid then
                        self.log(
                            "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                            "info",
                        )
                        string_acc += str(char)
                        self.index += 1
                        char = self.get_char_at()
                elif next_c == rstring_delimiter:
                    if self.get_context() == "object_value":
                        # But this might not be it! This could be just a missing comma
                        # We found a delimiter and we need to check if this is a key
                        # so find a rstring_delimiter and a colon after
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != rstring_delimiter:
                            i += 1
                            next_c = self.get_char_at(i)
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != ":":
                            if next_c in [
                                lstring_delimiter,
                                rstring_delimiter,
                                ",",
                            ]:
                                break
                            i += 1
                            next_c = self.get_char_at(i)
                        # Only if we fail to find a ':' then we know this is misplaced quote
                        if next_c != ":":
                            self.log(
                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                "info",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()

    if (
        char
        and missing_quotes
        and self.get_context() == "object_key"
        and char.isspace()
    ):
        self.log(
            "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
            "info",
        )
        self.skip_whitespaces_at()
        if self.get_char_at() not in [":", ","]:
            return ""

    # A fallout of the previous special case in the while loop,
    # we need to update the index only if we had a closing quote
    if char != rstring_delimiter:
        self.log(
            "While parsing a string, we missed the closing quote, ignoring",
            "info",
        )
    else:
        self.index += 1

    return string_acc.rstrip()
||||
|
|
||||
|
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
    """Parse a number-like token at the cursor.

    Accepts digits, sign, decimal point, exponent markers, and '/' or ','
    (currency/fraction-ish text). Returns int/float when the token parses,
    the raw text otherwise, and re-dispatches on a stray '-'.
    """
    # <number> is a valid real number expressed in one of a number of given formats
    allowed = set("0123456789-.eE/,")
    in_array = self.get_context() == "array"
    pieces = []
    c = self.get_char_at()
    # Inside an array a comma separates elements, so it terminates the token there.
    while c and c in allowed and not (in_array and c == ","):
        pieces.append(c)
        self.index += 1
        c = self.get_char_at()
    raw = "".join(pieces)
    if len(raw) > 1 and raw[-1] in "-eE/,":
        # The number ends with a non valid character for a number/currency, rolling back one
        raw = raw[:-1]
        self.index -= 1
    try:
        if "," in raw:
            # Thousands-separator style ("1,000"): keep as text
            return str(raw)
        if any(marker in raw for marker in (".", "e", "E")):
            return float(raw)
        if raw == "-":
            # A stray "-" is not a number: discard it and continue parsing
            return self.parse_json()
        return int(raw)
    except ValueError:
        return raw
||||
|
|
||||
|
def parse_boolean_or_null(self) -> Union[bool, str, None]:
    """Parse one of the unquoted literals 'true', 'false', or 'null' at the cursor.

    Matches the literal case-insensitively, character by character. On a full
    match the corresponding Python value (True/False/None) is returned; on any
    partial or failed match the cursor is restored and "" is returned.
    """
    # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
    starting_index = self.index
    char = (self.get_char_at() or "").lower()
    # Bug fix: `value` was annotated but never assigned a default, so reaching
    # `if value:` with any character other than t/f/n raised UnboundLocalError.
    value: Optional[Tuple[str, Optional[bool]]] = None
    if char == "t":
        value = ("true", True)
    elif char == "f":
        value = ("false", False)
    elif char == "n":
        value = ("null", None)

    if value:
        # Walk the literal while the input keeps matching it
        i = 0
        while char and i < len(value[0]) and char == value[0][i]:
            i += 1
            self.index += 1
            char = (self.get_char_at() or "").lower()
        if i == len(value[0]):
            # Full literal consumed: return the Python value
            return value[1]

    # If nothing works reset the index before returning
    self.index = starting_index
    return ""
||||
|
|
||||
|
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
    """Return the character `count` positions away from the cursor, or False past the end.

    Note: a negative effective position wraps from the end of the string
    (ordinary Python indexing) — parse_array relies on get_char_at(-1).
    """
    # EAFP: indexing and catching IndexError is cheaper than an explicit
    # bounds check on the common in-range path.
    pos = self.index + count
    try:
        return self.json_str[pos]
    except IndexError:
        return False
||||
|
|
||||
|
def skip_whitespaces_at(self) -> None:
    """Advance the cursor past any run of whitespace characters.

    Leaves the cursor on the first non-whitespace character, or just past
    the end of the input when only whitespace remains.
    """
    text = self.json_str
    limit = len(text)
    pos = self.index
    while pos < limit and text[pos].isspace():
        pos += 1
    self.index = pos
||||
|
|
||||
|
def set_context(self, value: str) -> None:
    """Push `value` onto the parsing-context stack; empty/falsy values are ignored."""
    if not value:
        return
    self.context.append(value)
||||
|
|
||||
|
def reset_context(self) -> None:
    # Pop the most recently pushed parsing context (the inverse of set_context).
    self.context.pop()
||||
|
|
||||
|
def get_context(self) -> str:
    # Return the innermost parsing context, e.g. "object_key", "object_value", "array".
    # NOTE(review): raises IndexError on an empty stack — callers in this file
    # guard with `len(self.context) > 0`; verify any new call site does too.
    return self.context[-1]
||||
|
|
||||
|
def log(self, text: str, level: str) -> None:
    """Append `text` plus a window of surrounding input to the logger, if `level` matches."""
    if level != self.logger.log_level:
        return
    # Capture a snippet of the input around the cursor for context.
    window = self.logger.window
    lo = max(self.index - window, 0)
    hi = min(self.index + window, len(self.json_str))
    self.logger.log.append(
        {
            "text": text,
            "context": self.json_str[lo:hi],
        }
    )
||||
|
|
||||
|
|
||||
|
def repair_json(
    json_str: str = "",
    return_objects: bool = False,
    skip_json_loads: bool = False,
    logging: bool = False,
    json_fd: Optional[TextIO] = None,
    ensure_ascii: bool = True,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Decode `json_str` (or `json_fd`), repairing the JSON when standard decoding fails.

    Returns the repaired JSON string by default. With `return_objects=True` the
    decoded data structure is returned instead. With `skip_json_loads=True` the
    built-in `json` decoder is bypassed entirely. With `logging=True` the parser
    produces a tuple of (result, list of repair actions).

    NOTE(review): when `logging=True` but the built-in decode succeeds, the
    plain object is returned without a log tuple — confirm callers expect that.
    """
    parser = JSONParser(json_str, json_fd, logging)
    if not skip_json_loads:
        # Fast path: try the standard decoder first, repair only on failure.
        try:
            parsed_json = json.load(json_fd) if json_fd else json.loads(json_str)
        except json.JSONDecodeError:
            parsed_json = parser.parse()
    else:
        parsed_json = parser.parse()
    # Returning the object itself lets this library act as a drop-in
    # replacement for the `json` module.
    if return_objects or logging:
        return parsed_json
    return json.dumps(parsed_json, ensure_ascii=ensure_ascii)
||||
|
|
||||
|
|
||||
|
def loads(
    json_str: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Drop-in replacement for `json.loads()` that repairs invalid JSON while decoding.

    Thin wrapper over `repair_json()` with `return_objects=True`.
    """
    return repair_json(
        json_str=json_str,
        skip_json_loads=skip_json_loads,
        logging=logging,
        return_objects=True,
    )
||||
|
|
||||
|
|
||||
|
def load(
    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Drop-in replacement for `json.load()` that repairs invalid JSON while decoding.

    Thin wrapper over `repair_json()` with `json_fd=fd` and `return_objects=True`.
    """
    return repair_json(
        skip_json_loads=skip_json_loads,
        logging=logging,
        json_fd=fd,
        return_objects=True,
    )
||||
|
|
||||
|
|
||||
|
def from_file(
    filename: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Wrapper around `load()` that accepts a filename instead of an open file.

    Fix: the file handle was previously opened and closed manually, leaking
    the descriptor if `load()` raised; a context manager guarantees closure.
    """
    with open(filename) as fd:
        return load(fd, skip_json_loads, logging)
@ -0,0 +1,161 @@ |
|||||
|
from flask import Flask, request, jsonify,Response |
||||
|
import os |
||||
|
from checkPlaceName import checkPlaceName |
||||
|
from checkRepeatText import checkRepeatText |
||||
|
from checkCompanyName import checkCompanyName |
||||
|
from checkDocumentError import getDocumentError |
||||
|
from checkTitleName import checkTitleName |
||||
|
from flask_cors import CORS |
||||
|
import qwen_agenttext |
||||
|
app = Flask(__name__)
# Enable CORS on all routes so the browser front-end can call this API.
cros = CORS(app)
# Directory where uploaded documents are stored.
UPLOAD_FOLDER = 'uploads'
# NOTE(review): not referenced anywhere in this file — presumably feature
# flags read by another module; verify before removing.
usableTag=[0,0,0,0,0,0,0,0]
# Create the upload directory at startup if it does not already exist.
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
||||
|
@app.route('/upload', methods=['POST'])
def upload_file():
    """Receive a document upload (multipart field 'file') and save it in UPLOAD_FOLDER.

    Returns 400 when the file part is missing or has no name, 200 on success.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if file:
        # Security fix: keep only the base name of the client-supplied filename
        # so a crafted name like "../../etc/passwd" cannot escape UPLOAD_FOLDER.
        filename = os.path.basename(file.filename)
        file.save(os.path.join(UPLOAD_FOLDER, filename))
        return jsonify({"message": "File uploaded successfully"}), 200
||||
|
@app.route('/stream' ,methods=["GET", "POST"])
def stream_numbers():
    """SSE endpoint: stream the agent's reply for the `context` query parameter.

    Delegates generation to `qwen_agenttext.getxinx`, which yields SSE-formatted
    chunks; this handler only supplies the streaming/CORS headers.
    """
    context = request.args.get('context')

    sse_headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        # Disable proxy buffering so events reach the client immediately.
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(qwen_agenttext.getxinx(context), headers=sse_headers)
||||
|
@app.route('/sse/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
    """SSE endpoint streaming repeated-text check results for an uploaded file."""
    filename = request.args.get('filename')

    def generate_checkRepeatText(filename):
        # Fix: the counter was never incremented (and shadowed the builtin `id`),
        # so every SSE message carried "id: 1"; ids now increase monotonically.
        event_id = 0
        try:
            for i in checkRepeatText(filename):
                event_id += 1
                yield f"id: {event_id}\n"
                yield f"event: checkRepeatText\n"
                yield f"data: {i}\n\n"
        except Exception:
            # Report a generic failure to the client instead of silently dropping the stream.
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkRepeatText\n"
            yield f"data: **程序出现异常**\n\n"

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkRepeatText(filename), headers=headers)
||||
|
|
||||
|
|
||||
|
@app.route('/sse/checkPlaceName', methods=['GET'])
def checkPlaceNameWebSse():
    """SSE endpoint streaming place-name check results for an uploaded file."""
    filename = request.args.get('filename')

    def generate_checkPlaceName(filename):
        # Fix: the counter was never incremented (and shadowed the builtin `id`),
        # so every SSE message carried "id: 1"; ids now increase monotonically.
        event_id = 0
        for i in checkPlaceName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkPlaceName\n"
            yield f"data: {i}\n\n"

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkPlaceName(filename), headers=headers)
||||
|
@app.route('/sse/checkCompanyName', methods=['GET'])
def checkCompanyNameWebSse():
    """SSE endpoint streaming company-name check results for an uploaded file."""
    filename = request.args.get('filename')

    def generate_checkCompanyName(filename):
        # Fix: the counter was never incremented (and shadowed the builtin `id`),
        # so every SSE message carried "id: 1"; ids now increase monotonically.
        event_id = 0
        for i in checkCompanyName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkCompanyName\n"
            yield f"data: {i}\n\n"

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkCompanyName(filename), headers=headers)
||||
|
|
||||
|
@app.route('/sse/checkDocumentErrorWeb', methods=['GET'])
def checkDocumentErrorWebSse():
    """SSE endpoint streaming document-error check results for an uploaded file."""
    filename = request.args.get('filename')

    def generate_checkDocumentError(filename):
        # Fix: the counter was never incremented (and shadowed the builtin `id`),
        # so every SSE message carried "id: 1"; ids now increase monotonically.
        event_id = 0
        for i in getDocumentError(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: getDocumentError\n"
            yield f"data: {i}\n\n"

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkDocumentError(filename), headers=headers)
||||
|
@app.route('/sse/checkTitleName', methods=['GET'])
def checkTitleNameWebSse():
    """SSE endpoint streaming title-name check results for an uploaded file."""
    filename = request.args.get('filename')

    def generate_checkTitleName(filename):
        # Fix: the counter was never incremented (and shadowed the builtin `id`),
        # so every SSE message carried "id: 1"; ids now increase monotonically.
        event_id = 0
        for i in checkTitleName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkTitleName\n"
            yield f"data: {i}\n\n"

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkTitleName(filename), headers=headers)
||||
|
if __name__ == '__main__':
    # Listen on all interfaces; binding port 80 typically requires elevated privileges.
    app.run(host="0.0.0.0",port=80)
@ -0,0 +1,132 @@ |
|||||
|
import pprint |
||||
|
import urllib.parse |
||||
|
import json5 |
||||
|
from qwen_agent.agents import Assistant |
||||
|
from qwen_agent.tools.base import BaseTool, register_tool |
||||
|
import requests |
||||
|
import baidusearch |
||||
|
import tqdm |
||||
|
|
||||
|
# 使用示例 |
||||
|
|
||||
|
|
||||
|
|
||||
|
# Step 1 (Optional): Add a custom tool named `my_image_gen`. |
||||
|
@register_tool('my_image_gen')
class MyImageGen(BaseTool):
    """Qwen-Agent tool: text-to-image generation via the pollinations.ai URL API."""
    # The `description` tells the agent the functionality of this tool.
    description = 'AI painting (image generation) service, input text description, and return the image URL drawn based on text information.'
    # The `parameters` tell the agent what input parameters the tool has.
    parameters = [{
        'name': 'prompt',
        'type': 'string',
        'description': 'Detailed description of the desired image content, in English',
        'required': True
    }]

    def call(self, params: str, **kwargs) -> str:
        # `params` are the arguments generated by the LLM agent.
        prompt = json5.loads(params)['prompt']
        # URL-encode the prompt so it is safe to embed in the image URL path.
        prompt = urllib.parse.quote(prompt)
        # Return a JSON payload containing the generated image's URL.
        return json5.dumps(
            {'image_url': f'https://image.pollinations.ai/prompt/{prompt}'},
            ensure_ascii=False)
||||
|
|
||||
|
|
||||
|
@register_tool('chaxun')
class ChaxunSearchTool(BaseTool):
    """Qwen-Agent tool 'chaxun': web-search fallback backed by Baidu search.

    Fix: this class was also named `MyImageGen`, silently shadowing the image
    tool above at module level. Renamed so both classes remain reachable; the
    agent looks tools up by the registered name 'chaxun', which is unchanged.
    """
    # The `description` tells the agent when to use this tool.
    description = '如果你不会,请使用此工具进行联网查询'
    # The `parameters` tell the agent what input parameters the tool has.
    parameters = [{
        'name': 'prompt',
        'type': 'string',
        'description': '请你描述需要提问的信息,以此帮助你了解更多的信息',
        'required': True
    }]

    def call(self, params: str, **kwargs) -> str:
        # `params` are the arguments generated by the LLM agent.
        prompt = json5.loads(params)['prompt']
        # URL-encode the query before handing it to the search backend.
        prompt = urllib.parse.quote(prompt)
        search_tool = baidusearch.search(prompt, num_results=20)
        print(search_tool)
        # NOTE(review): baidusearch.search returns a list, not a str — the agent
        # framework appears to tolerate it, but the annotation is aspirational.
        return search_tool
||||
|
# Step 2: Configure the LLM you are using.
# Model configuration: model name, hosting service (model_server), and/or an API key.
llm_cfg = {
    # Use the model service provided by DashScope:
    # model: the model name
    # model_server: the service hosting the model
    # api_key: the API key; may be set explicitly or taken from the environment

    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope',  # base_url, also known as api_base
    # SECURITY NOTE(review): a live-looking API key is committed in source —
    # rotate it and load it from the DASHSCOPE_API_KEY environment variable.
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    # 'api_key': 'YOUR_DASHSCOPE_API_KEY',
    # It will use the `DASHSCOPE_API_KEY' environment variable if 'api_key' is not set here.

    # Use a model service compatible with the OpenAI API, such as vLLM or Ollama:
    # 'model': 'Qwen1.5-7B-Chat',
    # 'model_server': 'http://localhost:8000/v1',  # base_url, also known as api_base
    # 'api_key': 'EMPTY',

    # (Optional) LLM hyperparameters for generation:
    'generate_cfg': {
        'top_p': 0.8
    }
}
||||
|
|
||||
|
# Step 3: Create an agent. Here we use the `Assistant` agent as an example, which is capable of using tools and reading files.

# System prompt for the agent (runtime text sent to the model, kept verbatim):
# "You are a helpful assistant. On receiving a request, think about whether a
# tool is needed; if you cannot answer, use the [chaxun] tool."
system_instruction = '''
你是一个乐于助人的助手。
收到用户的请求后,您应:
你应该进行思考,判断是否使用工具,
如果遇到你不会回答,请使用工具[chaxun]
'''

# Tools the Assistant may invoke; 'chaxun' is the custom web-search tool registered above.
tools = ["chaxun"]  # `code_interpreter` is a built-in tool for executing code.
# Optional files the assistant could read:
# files = ['./examples/resource/doc.pdf']  # Give the bot a PDF file to read.

# Initialize the Assistant with the model config, system prompt, and tool list.
bot = Assistant(llm=llm_cfg,
                system_message=system_instruction,
                function_list=tools,
                # files=files
                )
||||
|
|
||||
|
# Step 4: Run the agent as a chatbot.
messages = []  # This stores the chat history.

def getxinx(context):
    """Stream the agent's handling of the user's `context` as SSE-friendly chunks.

    NOTE(review): this currently yields only the placeholder "请稍等.." once per
    incremental agent response; the accumulated `response` list is never sent
    to the client — the commented-out streaming code below suggests this is
    unfinished. Confirm intended behavior with the /stream caller.
    """
    # Build a single-turn message list from the user's input.
    # (This local `messages` shadows the module-level chat history above.)
    messages=[({'role': 'user', 'content': context})]
    print(messages)
    response = []
    event_id = 0
    # bot.run() yields progressively longer partial responses.
    for rsp in bot.run(messages=messages):
        response.append(rsp)
        yield "请稍等.."
@ -0,0 +1,109 @@ |
|||||
|
import time |
||||
|
import json |
||||
|
import math |
||||
|
from flask import Flask,Response,request |
||||
|
from flask_sse import sse |
||||
|
from flask_cors import CORS |
||||
|
import re |
||||
|
import qwen_agenttext |
||||
|
app = Flask(__name__) |
||||
|
cros = CORS(app) |
||||
|
# SSE 推送函数 |
||||
|
import paddle; |
||||
|
paddle.device.get_available_device() |
||||
|
|
||||
|
|
||||
|
# SSE 推送路由 |
||||
|
|
||||
|
|
||||
|
# @app.route('/register', methods=["GET"]) |
||||
|
# def register(): |
||||
|
# 获取客户端标识符 |
||||
|
# client_id = str(uuid.uuid4()) |
||||
|
# |
||||
|
# # 返回 SSE 响应 |
||||
|
# return jsonify({"client_id": client_id}) |
||||
|
|
||||
|
|
||||
|
# SSE 推送路由 |
||||
|
|
||||
|
|
||||
|
# @app.route('/sse', methods=['POST']) |
||||
|
# def stream(): |
||||
|
# # 获取客户端标识符 |
||||
|
# client_id = 1 |
||||
|
# print("client_id", client_id) |
||||
|
# |
||||
|
# def aa(): |
||||
|
# # 循环发送 SSE 数据 |
||||
|
# for i in range(10): |
||||
|
# data = 'Hello, %s!' % client_id + str(i) |
||||
|
# print(data) |
||||
|
# sse.publish(data, channel=client_id, type='message') |
||||
|
# time.sleep(1) |
||||
|
# sse.publish("end", channel=client_id, type='message') |
||||
|
# |
||||
|
# # 返回 SSE 响应 |
||||
|
# response = Response(aa(), mimetype='text/event-stream') |
||||
|
# response.headers.add('Cache-Control', 'no-cache') |
||||
|
# response.headers.add('Connection', 'keep-alive') |
||||
|
# response.headers.add('X-Accel-Buffering', 'no') |
||||
|
# return response |
||||
|
# |
||||
|
# |
||||
|
# |
||||
|
# @app.route('/stream' ,methods=["GET", "POST"]) |
||||
|
# def stream_numbers(): |
||||
|
# context= request.args.get('context') |
||||
|
# |
||||
|
# |
||||
|
# headers = { |
||||
|
# "Content-Type": "text/event-stream", |
||||
|
# "Cache-Control": "no-cache", |
||||
|
# "X-Accel-Buffering": "no", |
||||
|
# "Access-Control-Allow-Origin": "*", |
||||
|
# "Access-Control-Allow-Methods": "GET,POST", |
||||
|
# "Access-Control-Allow-Headers": "x-requested-with,content-type", |
||||
|
# } |
||||
|
# return Response(generate_numbers(),headers=headers) |
||||
|
# def generate_numbers(): |
||||
|
# event_id=0 |
||||
|
# # for number in range(1, 10): |
||||
|
# # json_data = json.dumps({"number": number}) |
||||
|
# # print(json_data) |
||||
|
# # event_id += 1 |
||||
|
# # yield f"id: {event_id}\n" |
||||
|
# # yield f"event: time-update\n" |
||||
|
# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送 |
||||
|
# json_data = json.dumps({"number": "done"}) |
||||
|
# yield f"id: {1}\n" |
||||
|
# yield f"event: time-update\n" |
||||
|
# yield f"data: 34568\n\n" # 发送完成信号 |
||||
|
# if __name__ == '__main__': |
||||
|
# |
||||
|
# |
||||
|
# # 读取文件内容 |
||||
|
# with open("checkPlaceName.txt", "r", encoding='utf-8') as f: |
||||
|
# gettext = f.read() |
||||
|
# batchNum=20 |
||||
|
# sentences = re.split(r'[。\n]', gettext) |
||||
|
# # 去掉空字符 |
||||
|
# sentences = [sentence.strip() for sentence in sentences if sentence.strip()] |
||||
|
# # 计算总字符数 |
||||
|
# total_chars = len(sentences) |
||||
|
# |
||||
|
# # 计算有多少份 |
||||
|
# num_chunks = math.ceil(total_chars / batchNum) |
||||
|
# |
||||
|
# # 按batchNum字为一份进行处理 |
||||
|
# chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)] |
||||
|
# |
||||
|
# # 打印每一份的内容 |
||||
|
# for i, chunk in enumerate(chunks): |
||||
|
# print(f"Chunk {i + 1}:") |
||||
|
# print(chunk) |
||||
|
# print("-" * 40) |
||||
|
# |
||||
|
# # 打印总份数 |
||||
|
# print(f"Total chunks: {num_chunks}") |
||||
|
# app.run(debug=True,port=80) |
After Width: | Height: | Size: 420 KiB |
After Width: | Height: | Size: 245 KiB |
After Width: | Height: | Size: 117 KiB |
After Width: | Height: | Size: 17 KiB |
After Width: | Height: | Size: 62 KiB |
After Width: | Height: | Size: 41 KiB |
After Width: | Height: | Size: 34 KiB |
After Width: | Height: | Size: 24 KiB |
After Width: | Height: | Size: 211 KiB |
After Width: | Height: | Size: 916 KiB |
After Width: | Height: | Size: 217 KiB |
After Width: | Height: | Size: 252 KiB |
After Width: | Height: | Size: 904 KiB |
@ -0,0 +1,12 @@ |
|||||
|
{ |
||||
|
"shell_port": 3199, |
||||
|
"iopub_port": 3205, |
||||
|
"stdin_port": 3200, |
||||
|
"control_port": 3201, |
||||
|
"hb_port": 3209, |
||||
|
"ip": "127.0.0.1", |
||||
|
"key": "41711130-ba4287db5e2a6e7b98444c31", |
||||
|
"transport": "tcp", |
||||
|
"signature_scheme": "hmac-sha256", |
||||
|
"kernel_name": "" |
||||
|
} |
@ -0,0 +1,12 @@ |
|||||
|
{ |
||||
|
"shell_port": 36295, |
||||
|
"iopub_port": 36301, |
||||
|
"stdin_port": 36296, |
||||
|
"control_port": 36297, |
||||
|
"hb_port": 36305, |
||||
|
"ip": "127.0.0.1", |
||||
|
"key": "0faec31a-0f91a316abd70cf50f57dbad", |
||||
|
"transport": "tcp", |
||||
|
"signature_scheme": "hmac-sha256", |
||||
|
"kernel_name": "" |
||||
|
} |
@ -0,0 +1,12 @@ |
|||||
|
{ |
||||
|
"shell_port": 5355, |
||||
|
"iopub_port": 5362, |
||||
|
"stdin_port": 5356, |
||||
|
"control_port": 5358, |
||||
|
"hb_port": 5366, |
||||
|
"ip": "127.0.0.1", |
||||
|
"key": "de89d28a-7beb5da33100363d2c20fd6b", |
||||
|
"transport": "tcp", |
||||
|
"signature_scheme": "hmac-sha256", |
||||
|
"kernel_name": "" |
||||
|
} |
@ -0,0 +1,12 @@ |
|||||
|
{ |
||||
|
"shell_port": 3079, |
||||
|
"iopub_port": 3085, |
||||
|
"stdin_port": 3080, |
||||
|
"control_port": 3081, |
||||
|
"hb_port": 3089, |
||||
|
"ip": "127.0.0.1", |
||||
|
"key": "1825b8a3-a33137bc69e3375f26f384a3", |
||||
|
"transport": "tcp", |
||||
|
"signature_scheme": "hmac-sha256", |
||||
|
"kernel_name": "" |
||||
|
} |
@ -0,0 +1,12 @@ |
|||||
|
{ |
||||
|
"shell_port": 36740, |
||||
|
"iopub_port": 36746, |
||||
|
"stdin_port": 36741, |
||||
|
"control_port": 36742, |
||||
|
"hb_port": 36750, |
||||
|
"ip": "127.0.0.1", |
||||
|
"key": "ac6de478-4a3be71d79c2c63da7065148", |
||||
|
"transport": "tcp", |
||||
|
"signature_scheme": "hmac-sha256", |
||||
|
"kernel_name": "" |
||||
|
} |
@ -0,0 +1,12 @@ |
|||||
|
{ |
||||
|
"shell_port": 2563, |
||||
|
"iopub_port": 2569, |
||||
|
"stdin_port": 2564, |
||||
|
"control_port": 2565, |
||||
|
"hb_port": 2573, |
||||
|
"ip": "127.0.0.1", |
||||
|
"key": "7e020774-be96933cbe5aaad90c1c9bfc", |
||||
|
"transport": "tcp", |
||||
|
"signature_scheme": "hmac-sha256", |
||||
|
"kernel_name": "" |
||||
|
} |
@ -0,0 +1,12 @@ |
|||||
|
{ |
||||
|
"shell_port": 5840, |
||||
|
"iopub_port": 5846, |
||||
|
"stdin_port": 5841, |
||||
|
"control_port": 5842, |
||||
|
"hb_port": 5850, |
||||
|
"ip": "127.0.0.1", |
||||
|
"key": "e4c27d68-1c3a9dfa16551f35481b05b8", |
||||
|
"transport": "tcp", |
||||
|
"signature_scheme": "hmac-sha256", |
||||
|
"kernel_name": "" |
||||
|
} |
@ -0,0 +1,3 @@ |
|||||
|
|
||||
|
from ipykernel import kernelapp as app |
||||
|
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||||
|
|
||||
|
from ipykernel import kernelapp as app |
||||
|
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||||
|
|
||||
|
from ipykernel import kernelapp as app |
||||
|
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||||
|
|
||||
|
from ipykernel import kernelapp as app |
||||
|
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||||
|
|
||||
|
from ipykernel import kernelapp as app |
||||
|
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||||
|
|
||||
|
from ipykernel import kernelapp as app |
||||
|
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||||
|
|
||||
|
from ipykernel import kernelapp as app |
||||
|
app.launch_new_instance() |
After Width: | Height: | Size: 2.8 MiB |
@ -0,0 +1,140 @@ |
|||||
|
from docx import Document |
||||
|
from paddlenlp import Taskflow |
||||
|
from pprint import pprint |
||||
|
from qwen_agent.agents import Assistant |
||||
|
import re |
||||
|
import json_repair |
||||
|
import time |
||||
|
import math |
||||
|
tagTask = Taskflow("ner") |
||||
|
prompt=''' |
||||
|
.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
||||
|
不做过多的解释,严格按回答格式作答; |
||||
|
''' |
||||
|
# prompt=''' |
||||
|
# .请回答以上问题, |
||||
|
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答; |
||||
|
# 不做过多的解释,严格按回答格式作答; |
||||
|
# ''' |
||||
|
llm_cfg = { |
||||
|
#'model': 'qwen1.5-72b-chat', |
||||
|
'model':"qwen2-72b", |
||||
|
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
||||
|
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
} |
||||
|
bot = Assistant(llm=llm_cfg, |
||||
|
name='Assistant', |
||||
|
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' |
||||
|
) |
||||
|
#获取全文内容 |
||||
|
def getDocxToTextAll(name): |
||||
|
docxPath=name |
||||
|
document = Document(docxPath) |
||||
|
# 逐段读取docx文档的内容 |
||||
|
levelList=[] |
||||
|
words=[] |
||||
|
addStart = False |
||||
|
levelText="" |
||||
|
i = 0 |
||||
|
for paragraph in document.paragraphs: |
||||
|
# 判断该段落的标题级别 |
||||
|
# 这里用isTitle()临时代表,具体见下文介绍的方法 |
||||
|
text = paragraph.text |
||||
|
if text.strip():#非空判断 |
||||
|
# print("非空") |
||||
|
words.append(text) |
||||
|
# 将所有段落文本拼接成一个字符串,并用换行符分隔 |
||||
|
print("placeNameTask",len(words)) |
||||
|
text = '\n'.join(words) |
||||
|
|
||||
|
# 将文本写入txt文件 |
||||
|
with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file: |
||||
|
txt_file.write(text) |
||||
|
|
||||
|
#得到全文和地名有关的内容 |
||||
|
def placeNameTask(text): |
||||
|
batchNum=20 |
||||
|
sentences = re.split(r'[。\n]', text) |
||||
|
# 去掉空字符 |
||||
|
sentences = [sentence.strip() for sentence in sentences if sentence.strip()] |
||||
|
# 计算总字符数 |
||||
|
total_chars = len(sentences) |
||||
|
|
||||
|
# 计算有多少份 |
||||
|
num_chunks = math.ceil(total_chars / batchNum) |
||||
|
|
||||
|
# 按batchNum字为一份进行处理 |
||||
|
chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)] |
||||
|
placeList = [] |
||||
|
# 打印每一份的内容 |
||||
|
for i, chunk in enumerate(chunks): |
||||
|
yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}" |
||||
|
|
||||
|
wenBen=".".join(chunk) |
||||
|
print(chunk) |
||||
|
res = tagTask(wenBen) |
||||
|
isplace = False |
||||
|
for zuhe in res: |
||||
|
# 上一个的地名,这一个还是地名,就和上一个相加代替这个 |
||||
|
if isplace: |
||||
|
name = placeList[len(placeList) - 1] |
||||
|
if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0: # or zuhe[1] == "ns" |
||||
|
isplace = True |
||||
|
new_text = zuhe[0].replace("\n", "") |
||||
|
placeList[len(placeList) - 1] = name + new_text |
||||
|
continue |
||||
|
if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0: |
||||
|
isplace = True |
||||
|
new_text = zuhe[0].replace("\n", "") |
||||
|
placeList.append(new_text) |
||||
|
else: |
||||
|
isplace = False |
||||
|
print("-" * 40) |
||||
|
# 打印总份数 |
||||
|
yield "文档地名检查---文档解析完成" |
||||
|
placeList=list(dict.fromkeys(placeList)) |
||||
|
yield placeList |
||||
|
#主方法 |
||||
|
def checkPlaceName(filename): |
||||
|
yield f"文档地名检查---开始处理文档..." # 每次生成一个数字就发送 |
||||
|
getDocxToTextAll(filename) |
||||
|
with open("checkPlaceName.txt", "r",encoding='utf-8') as f: |
||||
|
gettext = f.read() |
||||
|
yield f"文档地名检查---开始解析文档..." # 每次生成一个数字就发送 |
||||
|
# propnList=placeNameTask(gettext) |
||||
|
for item in placeNameTask(gettext): |
||||
|
if isinstance(item, str): |
||||
|
yield item |
||||
|
else: |
||||
|
final_list = item # 获取最终结果 |
||||
|
propnStr = ",".join(final_list) |
||||
|
print("placeNameTask",propnStr) |
||||
|
messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}] |
||||
|
runList = [] |
||||
|
yield f"文档地名检查---结果生成中..." # 每次生成一个数字就发送 |
||||
|
cishu=0 |
||||
|
for rsp in bot.run(messages): |
||||
|
runList.append(rsp) |
||||
|
if cishu>3: |
||||
|
cishu=0 |
||||
|
yield "文档地名检查---结果生成中"+'.'*cishu |
||||
|
cishu+=1 |
||||
|
data = runList[len(runList) - 1][0]["content"] |
||||
|
print("placeNameTask",data) |
||||
|
parsed_data = json_repair.loads(data.replace('`', '')) |
||||
|
|
||||
|
# 如果需要进一步操作,例如只关注“正确”的回答 |
||||
|
error_places = [place for place in parsed_data if place['回答'] == '错误'] |
||||
|
print("placeNameTask",error_places) |
||||
|
returnInfo = "发现异常地名<br />"; |
||||
|
if len(error_places)>0: |
||||
|
for t in error_places: |
||||
|
keyword= t['placeName'] |
||||
|
# 查找包含关键字的段落 |
||||
|
paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext) |
||||
|
yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","") |
||||
|
returnInfo+="原文:" + yuanwen + "<br />出现异常地名:**" + keyword + "**!请注意" + "<br />"; |
||||
|
yield returnInfo |
||||
|
print(returnInfo) |
||||
|
else: |
||||
|
yield "**未发现发现异常地名**" |
@ -0,0 +1,118 @@ |
|||||
|
import re |
||||
|
import time |
||||
|
from docx import Document |
||||
|
from pprint import pprint |
||||
|
# from paddlenlp import Taskflow |
||||
|
# |
||||
|
# similarity = Taskflow("text_similarity", truncation=True, max_length=102400) |
||||
|
|
||||
|
|
||||
|
def getOutlineLevel(inputXml): |
||||
|
""" |
||||
|
功能 从xml字段中提取出<w:outlineLvl w:val="number"/>中的数字number |
||||
|
参数 inputXml |
||||
|
返回 number |
||||
|
""" |
||||
|
start_index = inputXml.find('<w:outlineLvl') |
||||
|
end_index = inputXml.find('>', start_index) |
||||
|
number = inputXml[start_index:end_index + 1] |
||||
|
number = re.search("\d+", number).group() |
||||
|
return number |
||||
|
|
||||
|
|
||||
|
def isTitle(paragraph): |
||||
|
""" |
||||
|
功能 判断该段落是否设置了大纲等级 |
||||
|
参数 paragraph:段落 |
||||
|
返回 None:普通正文,没有大纲级别 0:一级标题 1:二级标题 2:三级标题 |
||||
|
""" |
||||
|
# 如果是空行,直接返回None |
||||
|
if paragraph.text.strip() == '': |
||||
|
return None |
||||
|
|
||||
|
# 如果该段落是直接在段落里设置大纲级别的,根据xml判断大纲级别 |
||||
|
paragraphXml = paragraph._p.xml |
||||
|
if paragraphXml.find('<w:outlineLvl') >= 0: |
||||
|
return getOutlineLevel(paragraphXml) |
||||
|
# 如果该段落是通过样式设置大纲级别的,逐级检索样式及其父样式,判断大纲级别 |
||||
|
targetStyle = paragraph.style |
||||
|
while targetStyle is not None: |
||||
|
# 如果在该级style中找到了大纲级别,返回 |
||||
|
if targetStyle.element.xml.find('<w:outlineLvl') >= 0: |
||||
|
return getOutlineLevel(targetStyle.element.xml) |
||||
|
else: |
||||
|
targetStyle = targetStyle.base_style |
||||
|
# 如果在段落、样式里都没有找到大纲级别,返回None |
||||
|
return None |
||||
|
|
||||
|
def getDocxToText12biaoti(name): |
||||
|
document = Document(name) |
||||
|
# 逐段读取docx文档的内容 |
||||
|
levelList=[] |
||||
|
words=[] |
||||
|
levelText="" |
||||
|
i = 0 |
||||
|
firstTitle = 0 |
||||
|
secondTitle = 0 |
||||
|
sanjiTitle = 0 |
||||
|
for paragraph in document.paragraphs: |
||||
|
# 判断该段落的标题级别 |
||||
|
# 这里用isTitle()临时代表,具体见下文介绍的方法 |
||||
|
text = paragraph.text |
||||
|
|
||||
|
if text.strip():#非空判断 |
||||
|
# print("非空") |
||||
|
# words.append(text) |
||||
|
level = isTitle(paragraph) |
||||
|
if level=="0": |
||||
|
firstTitle+=1 |
||||
|
secondTitle = 0 |
||||
|
if(text.find("附件")>=0): |
||||
|
continue |
||||
|
words.append("{}:".format(firstTitle)+text) |
||||
|
elif level=="1": |
||||
|
secondTitle+=1 |
||||
|
sanjiTitle=0 |
||||
|
# words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text) |
||||
|
words.append("{}.{}".format(firstTitle,secondTitle)+text) |
||||
|
elif level=="2": |
||||
|
sanjiTitle += 1 |
||||
|
# words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text) |
||||
|
words.append("{}.{}.{}".format(firstTitle, secondTitle,sanjiTitle) + text) |
||||
|
|
||||
|
|
||||
|
# 将所有段落文本拼接成一个字符串,并用换行符分隔 |
||||
|
print(len(words)) |
||||
|
if len(words)==0: |
||||
|
raise Exception("I know python!") |
||||
|
text = '\n'.join(words) |
||||
|
with open("ce1.txt", 'w',encoding="utf-8") as txt_file: |
||||
|
txt_file.write(text) |
||||
|
return words |
||||
|
mobanList=[] |
||||
|
dangqianList=[] |
||||
|
errorList =[] |
||||
|
# 将文本写入txt文件 |
||||
|
# with open("ce模板.txt", 'r',encoding="utf-8") as txt_file: |
||||
|
# for i in txt_file: |
||||
|
# i=re.sub(r'[\t\n]', '', i) |
||||
|
# mobanList.append(i) |
||||
|
# pprint(mobanList) |
||||
|
# dangqianList=getDocxToText12biaoti("1.docx") |
||||
|
# if len(dangqianList)!=len(mobanList): |
||||
|
# print("标题数量与模板不一致") |
||||
|
# for num in range(len(mobanList)): |
||||
|
# moban = mobanList[num] |
||||
|
# dangqian= dangqianList[num] |
||||
|
# fenshu=similarity([[dangqian,moban]]) |
||||
|
# pprint(fenshu) |
||||
|
# if (fenshu[0]["similarity"]<0.85): |
||||
|
# errorList.append(dangqianList) |
||||
|
# getDocxToText12biaoti("1.docx") |
||||
|
# pprint(errorList) |
||||
|
|
||||
|
prompt = '''{}这是文档大纲,根据大纲分析文档中是否有{}这块内容的描述,若不存在请回答不存在 |
||||
|
''' |
||||
|
dagang ="1" |
||||
|
biaozhun="2" |
||||
|
print(prompt.format(dagang, biaozhun)) |
@ -0,0 +1,282 @@ |
|||||
|
import re |
||||
|
import os |
||||
|
import docx |
||||
|
from docx.document import Document |
||||
|
from docx.text.paragraph import Paragraph |
||||
|
from docx.parts.image import ImagePart |
||||
|
from qwen_agent.agents import Assistant |
||||
|
|
||||
|
from docx.oxml.table import CT_Tbl |
||||
|
from docx.oxml.text.paragraph import CT_P |
||||
|
|
||||
|
import shutil |
||||
|
import re |
||||
|
import json_repair |
||||
|
import uuid |
||||
|
|
||||
|
# 记录程序开始的时间戳 |
||||
|
def getOutlineLevel(inputXml): |
||||
|
""" |
||||
|
功能 从xml字段中提取出<w:outlineLvl w:val="number"/>中的数字number |
||||
|
参数 inputXml |
||||
|
返回 number |
||||
|
""" |
||||
|
start_index = inputXml.find('<w:outlineLvl') |
||||
|
end_index = inputXml.find('>', start_index) |
||||
|
number = inputXml[start_index:end_index + 1] |
||||
|
number = re.search("\d+", number).group() |
||||
|
return number |
||||
|
|
||||
|
|
||||
|
def isTitle(paragraph): |
||||
|
""" |
||||
|
功能 判断该段落是否设置了大纲等级 |
||||
|
参数 paragraph:段落 |
||||
|
返回 None:普通正文,没有大纲级别 0:一级标题 1:二级标题 2:三级标题 |
||||
|
""" |
||||
|
# 如果是空行,直接返回None |
||||
|
if paragraph.text.strip() == '': |
||||
|
return None |
||||
|
|
||||
|
# 如果该段落是直接在段落里设置大纲级别的,根据xml判断大纲级别 |
||||
|
paragraphXml = paragraph._p.xml |
||||
|
if paragraphXml.find('<w:outlineLvl') >= 0: |
||||
|
return getOutlineLevel(paragraphXml) |
||||
|
# 如果该段落是通过样式设置大纲级别的,逐级检索样式及其父样式,判断大纲级别 |
||||
|
targetStyle = paragraph.style |
||||
|
while targetStyle is not None: |
||||
|
# 如果在该级style中找到了大纲级别,返回 |
||||
|
if targetStyle.element.xml.find('<w:outlineLvl') >= 0: |
||||
|
return getOutlineLevel(targetStyle.element.xml) |
||||
|
else: |
||||
|
targetStyle = targetStyle.base_style |
||||
|
# 如果在段落、样式里都没有找到大纲级别,返回None |
||||
|
return None |
||||
|
|
||||
|
|
||||
|
# 该行只能有一个图片 |
||||
|
def is_image(graph: Paragraph, doc: Document): |
||||
|
images = graph._element.xpath('.//pic:pic') # 获取所有图片 |
||||
|
for image in images: |
||||
|
for img_id in image.xpath('.//a:blip/@r:embed'): # 获取图片id |
||||
|
part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片 |
||||
|
if isinstance(part, ImagePart): |
||||
|
return True |
||||
|
return False |
||||
|
|
||||
|
|
||||
|
# 获取图片(该行只能有一个图片) |
||||
|
def get_ImagePart(graph: Paragraph, doc: Document): |
||||
|
images = graph._element.xpath('.//pic:pic') # 获取所有图片 |
||||
|
for image in images: |
||||
|
for img_id in image.xpath('.//a:blip/@r:embed'): # 获取图片id |
||||
|
part = doc.part.related_parts[img_id] # 根据图片id获取对应的图片 |
||||
|
if isinstance(part, ImagePart): |
||||
|
return part |
||||
|
return None |
||||
|
#寻找标题名称 |
||||
|
def findTitleName(docxPath): |
||||
|
yield '文档图片信息检查----检查是否存在详细设计方案' |
||||
|
document = docx.Document(docxPath) |
||||
|
# 逐段读取docx文档的内容 |
||||
|
titleWords=[] |
||||
|
firstTitle = 0 |
||||
|
secondTitle = 0 |
||||
|
sanjiTitle = 0 |
||||
|
for paragraph in document.paragraphs: |
||||
|
# 判断该段落的标题级别 |
||||
|
# 这里用isTitle()临时代表,具体见下文介绍的方法 |
||||
|
text = paragraph.text |
||||
|
if text.strip():#非空判断 |
||||
|
level = isTitle(paragraph) |
||||
|
if level=="0": |
||||
|
firstTitle+=1 |
||||
|
secondTitle = 0 |
||||
|
if(text.find("附件")>=0): |
||||
|
continue |
||||
|
titleWords.append("一级标题:".format(firstTitle)+text) |
||||
|
elif level=="1": |
||||
|
secondTitle+=1 |
||||
|
sanjiTitle=0 |
||||
|
# words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text) |
||||
|
# titleWords.append("第{}章的二级标题:".format(firstTitle,firstTitle,secondTitle)+text) |
||||
|
elif level=="2": |
||||
|
sanjiTitle += 1 |
||||
|
# words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text) |
||||
|
# titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text) |
||||
|
findTitleName_llm_cfg = { |
||||
|
# 'model':"qwen2-72b", |
||||
|
# 'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
||||
|
'model': "qwen2-72b-instruct", |
||||
|
'model_server': 'DashScope', # base_url, also known as api_base |
||||
|
'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
} |
||||
|
findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, |
||||
|
name='Assistant', |
||||
|
# system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题' |
||||
|
) |
||||
|
prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容 |
||||
|
类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择 |
||||
|
类似设计方案,服务方案,建设方案为次相关,次级选择 |
||||
|
类似方案是最后选择 |
||||
|
按照这样的顺序选择最合适的 |
||||
|
你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答 |
||||
|
''' |
||||
|
# print("\n".join(titleWords)+prompt) |
||||
|
messages = [({'role': 'user', 'content': "\n".join(titleWords)+prompt})] |
||||
|
runList=[] |
||||
|
for rsp in findTitleName_bot.run(messages): |
||||
|
runList.append(rsp) |
||||
|
data = runList[len(runList) - 1][0]["content"] |
||||
|
parsed_data = json_repair.loads(data.replace('`', '')) |
||||
|
print(parsed_data) |
||||
|
if(parsed_data["answer"]=="存在"): |
||||
|
print("存在",parsed_data["name"]) |
||||
|
yield parsed_data["name"] |
||||
|
else: |
||||
|
print("不存在",parsed_data["name"]) |
||||
|
yield "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查" |
||||
|
def saveImage(fileName,titleName,imagePath): |
||||
|
fristName="" |
||||
|
doc = docx.Document(fileName) |
||||
|
for paragraph in doc.paragraphs: |
||||
|
# 判断该段落的标题级别 |
||||
|
# 这里用isTitle()临时代表,具体见下文介绍的方法 |
||||
|
text = paragraph.text |
||||
|
if text.strip(): # 非空判断 |
||||
|
level = isTitle(paragraph) |
||||
|
if level == "0": |
||||
|
fristName = text |
||||
|
print(text) |
||||
|
if level: |
||||
|
levelText = f"{int(level) + 1}级标题-" + text |
||||
|
else: |
||||
|
# 空说明是表格或者图片 |
||||
|
r = is_image(paragraph, doc) |
||||
|
if r and fristName == titleName: |
||||
|
part = get_ImagePart(paragraph, doc) |
||||
|
img_name = levelText+"_"+ os.path.basename(part.partname) |
||||
|
with open(f'{imagePath}/{img_name}', "wb") as f: |
||||
|
f.write(part.blob) |
||||
|
#保存完成后,上传大模型进行分析 |
||||
|
def checkImageText(filename): |
||||
|
llm_cfg_vl = { |
||||
|
#'model': 'qwen1.5-72b-chat',qwen2-72b-instruct |
||||
|
'model':"qwen-vl-max", |
||||
|
'model_server': 'DashScope', # base_url, also known as api_base |
||||
|
'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
} |
||||
|
botImage = Assistant(llm=llm_cfg_vl, |
||||
|
name='Assistant', |
||||
|
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"1_image4 |
||||
|
) |
||||
|
llm_cfg = { |
||||
|
#'model': 'qwen1.5-72b-chat', |
||||
|
'model':"qwen2-72b-instruct", |
||||
|
'model_server': 'DashScope', # base_url, also known as api_base |
||||
|
'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
} |
||||
|
bot = Assistant(llm=llm_cfg, |
||||
|
name='Assistant', |
||||
|
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' |
||||
|
|
||||
|
) |
||||
|
for titleName in findTitleName(filename): |
||||
|
yield titleName |
||||
|
if (titleName != "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"): |
||||
|
yield "文档图片信息检查----文档内容解析中" |
||||
|
imagePath = "Image" + str(uuid.uuid4()) |
||||
|
os.mkdir(imagePath) |
||||
|
saveImage(filename,titleName,imagePath) |
||||
|
imagePathList = os.listdir(imagePath) |
||||
|
count = 0 |
||||
|
resMap={} |
||||
|
for image in imagePathList: |
||||
|
count+=1 |
||||
|
yield f"文档图片信息检查---当前处理进度{count}/{len(imagePathList)}" |
||||
|
outpath=os.path.join("imagePath", image) |
||||
|
print(outpath) |
||||
|
messagesImage = [{'role': 'user', "content": [{"image": outpath}, {"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}] |
||||
|
runListImage = [] |
||||
|
for rsp in botImage.run(messagesImage): |
||||
|
runListImage.append(rsp) |
||||
|
data = runListImage[len(runListImage) - 1][0]["content"] |
||||
|
print(str(data)) |
||||
|
prompt=''' |
||||
|
依次上述内容是否与文档有关,你只能在[无关,有关]选项中选择答案, |
||||
|
按照这样的格式回答[{“text”:“内容”,"answer":"答案"},{“text”:“内容”,"answer":"答案"}]不做过多的解释,严格按回答格式作答 |
||||
|
''' |
||||
|
messages = [{'role': 'user', 'content': [{'text':str(data)+prompt},{"file":filename}]}] |
||||
|
runList = [] |
||||
|
for rsp in bot.run(messages): |
||||
|
runList.append(rsp) |
||||
|
textdata = runList[len(runList) - 1][0]["content"] |
||||
|
print(textdata) |
||||
|
parsed_data = json_repair.loads(textdata) |
||||
|
print(parsed_data) |
||||
|
for res in parsed_data: |
||||
|
if (res["answer"] == "无关"): |
||||
|
print("无关", res["name"]) |
||||
|
map = resMap.get(image) |
||||
|
if map: |
||||
|
#存在map说明之前已经保存过了 |
||||
|
resMap[image]=map+","+res["text"] |
||||
|
else: |
||||
|
resMap[image]=res["text"] |
||||
|
out='' |
||||
|
if(len(resMap)>0): |
||||
|
for key,value in resMap: |
||||
|
out+=f"在{key}图片中,{value}以上内容在文档中未出现相关描述<br>" |
||||
|
yield out |
||||
|
else: |
||||
|
yield "文档图片信息检查----图文符合要求" |
||||
|
shutil.rmtree(imagePath) |
||||
|
# except Exception as e: |
||||
|
# yield f"文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查" |
||||
|
# return |
||||
|
for i in checkImageText("1.docx"): |
||||
|
print(i) |
||||
|
# import docx |
||||
|
# doc = docx.Document('1.docx') |
||||
|
# dict_rel = doc.part._rels # rels其实是个目录 |
||||
|
# for rel in dict_rel: |
||||
|
# rel = dict_rel[rel] |
||||
|
# print("rel", rel.target_ref) |
||||
|
# if "image" in rel.target_ref: |
||||
|
# # create_dir(desc_path) |
||||
|
# img_name = re.findall("/(.*)", rel.target_ref)[0] # windos:/ |
||||
|
# print("img_name", img_name) |
||||
|
# word_name = os.path.splitext("1.docx")[0] |
||||
|
# print("word_name", word_name) |
||||
|
# #检查文件路径分隔符(os.sep),并根据不同的操作系统(Windows或Unix/Linux)处理文件名。 |
||||
|
# if os.sep in word_name: |
||||
|
# new_name = word_name.split('\\')[-1] |
||||
|
# else: |
||||
|
# new_name = word_name.split('/')[-1] |
||||
|
# img_name = f'{new_name}_{img_name}' |
||||
|
# print(img_name) |
||||
|
# desc_path='workspace' |
||||
|
# with open(f'{desc_path}/{img_name}', "wb") as f: |
||||
|
# f.write(rel.target_part.blob) |
||||
|
# # |
||||
|
# # # prompt=''' |
||||
|
# # # .根据上述文本判断,是否为非泛化的公司或组织名称,你可以使用工具利用互联网查询,你只能在[非泛化的公司或组织名称,公益组织,统称,泛化名称,政府单位,机关单位,学校,委员单位]选项中选择答案,回答格式[{“placeName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
||||
|
# # # ''' |
||||
|
# llm_cfg_vl = { |
||||
|
# #'model': 'qwen1.5-72b-chat',qwen2-72b-instruct |
||||
|
# 'model':"qwen-vl-max", |
||||
|
# 'model_server': 'DashScope', # base_url, also known as api_base |
||||
|
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
# } |
||||
|
# botvl = Assistant(llm=llm_cfg_vl, |
||||
|
# name='Assistant', |
||||
|
# # system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"1_image4 |
||||
|
# ) |
||||
|
# messages = [{'role': 'user', "content": [{"image": "workspace/1.png"},{"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}] |
||||
|
# runList = [] |
||||
|
# for rsp in botvl.run(messages): |
||||
|
# runList.append(rsp) |
||||
|
# print(rsp) |
||||
|
# data = runList[len(runList) - 1][0]["content"] |
||||
|
# print(str(data)) |
||||
|
|
@ -0,0 +1,133 @@ |
|||||
|
# -*- coding:utf-8 -*- |
||||
|
import time |
||||
|
from docx import Document |
||||
|
from paddlenlp import Taskflow |
||||
|
from qwen_agent.agents import Assistant |
||||
|
import re |
||||
|
import json_repair |
||||
|
wordtag = Taskflow("knowledge_mining") |
||||
|
|
||||
|
prompt = ''' |
||||
|
.根据上述文本判断,是否为具体的公司或组织名称,你可以使用工具利用互联网查询, |
||||
|
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校,行业类型,其他]选项中选择答案, |
||||
|
回答格式[{“companyName”:“名称”,"回答":"答案"},{“companyName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
||||
|
''' |
||||
|
llm_cfg = { |
||||
|
#'model': 'qwen1.5-72b-chat', |
||||
|
'model':"qwen2-72b", |
||||
|
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
||||
|
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
||||
|
} |
||||
|
bot = Assistant(llm=llm_cfg, |
||||
|
name='Assistant', |
||||
|
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具" |
||||
|
) |
||||
|
|
||||
|
def getDocxToTextAll(name): |
||||
|
docxPath=name |
||||
|
document = Document(docxPath) |
||||
|
# 逐段读取docx文档的内容 |
||||
|
levelList=[] |
||||
|
words=[] |
||||
|
addStart = False |
||||
|
levelText="" |
||||
|
i = 0 |
||||
|
for paragraph in document.paragraphs: |
||||
|
# 判断该段落的标题级别 |
||||
|
# 这里用isTitle()临时代表,具体见下文介绍的方法 |
||||
|
text = paragraph.text |
||||
|
if text.strip():#非空判断 |
||||
|
# print("非空") |
||||
|
words.append(text) |
||||
|
# 将所有段落文本拼接成一个字符串,并用换行符分隔 |
||||
|
print("checkCompanyName",len(words)) |
||||
|
text = '\n'.join(words) |
||||
|
|
||||
|
# 将文本写入txt文件 |
||||
|
with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file: |
||||
|
txt_file.write(text) |
||||
|
def checkCompanyName(filename): |
||||
|
getDocxToTextAll(filename) |
||||
|
start_time=time.time() |
||||
|
error_places = [] |
||||
|
for batch in read_file_in_batches('checkCompanyName.txt'): |
||||
|
res=process_batch(batch) |
||||
|
if(len(res)>0): |
||||
|
error_places.extend(res) |
||||
|
|
||||
|
print(error_places) |
||||
|
end_time = time.time() |
||||
|
# 计算执行时间 |
||||
|
elapsed_time = end_time - start_time |
||||
|
print(f"checkCompanyName程序执行时间: {elapsed_time} 秒") |
||||
|
return error_places |
||||
|
|
||||
|
def read_file_in_batches(file_path, batch_size=5000): |
||||
|
""" |
||||
|
分批读取文本文件 |
||||
|
:param file_path: 文件路径 |
||||
|
:param batch_size: 每批处理的字符数 |
||||
|
:return: 生成器,每次返回一批文本 |
||||
|
""" |
||||
|
with open(file_path, 'r', encoding='utf-8') as file: |
||||
|
batch = [] |
||||
|
char_count = 0 |
||||
|
for line in file: |
||||
|
batch.append(line) |
||||
|
char_count += len(line) |
||||
|
if char_count >= batch_size: |
||||
|
yield ''.join(batch) |
||||
|
batch = [] |
||||
|
char_count = 0 |
||||
|
if batch: |
||||
|
yield ''.join(batch) |
||||
|
|
||||
|
def process_batch(batch): |
||||
|
""" |
||||
|
处理一批文本 |
||||
|
:param batch: 一批文本 |
||||
|
""" |
||||
|
# 在这里添加你的处理逻辑 |
||||
|
|
||||
|
# sentences = re.split(r'[。\n]', batch) |
||||
|
# sentences = [sentence.strip() for sentence in sentences if sentence.strip()] |
||||
|
res=wordtag(batch) |
||||
|
placeList = [] |
||||
|
isplace = False |
||||
|
for zuhe in res[0]['items']: |
||||
|
# 上一个的地名,这一个还是地名,就和上一个相加代替这个 |
||||
|
zhi = zuhe.get("wordtag_label") |
||||
|
if isplace: |
||||
|
name = placeList[len(placeList) - 1] |
||||
|
if zhi.find("组织机构类")>=0 : # or zuhe[1] == "ns" |
||||
|
isplace = True |
||||
|
new_text = zuhe['item'].replace("\n", "") |
||||
|
placeList[len(placeList) - 1] = name + new_text |
||||
|
continue |
||||
|
if zhi.find("组织机构类")>=0 : |
||||
|
isplace = True |
||||
|
new_text = zuhe['item'].replace("\n", "") |
||||
|
placeList.append(new_text) |
||||
|
else: |
||||
|
isplace = False |
||||
|
placeList=list(dict.fromkeys(placeList)) |
||||
|
placeStr = ",".join(placeList) |
||||
|
messages = [{'role': 'user', 'content': [{'text': placeStr+prompt}]}] |
||||
|
print("checkCompanyName",placeStr+prompt) |
||||
|
runList = [] |
||||
|
for rsp in bot.run(messages): |
||||
|
runList.append(rsp) |
||||
|
data = runList[len(runList) - 1][0]["content"] |
||||
|
print("checkCompanyName",data) |
||||
|
parsed_data = json_repair.loads(data.replace('`', '')) |
||||
|
error_places = [place for place in parsed_data if place['回答'] == '具体的公司或组织名称'] |
||||
|
print("checkCompanyName",error_places) |
||||
|
if len(error_places)>0: |
||||
|
for t in error_places: |
||||
|
keyword= t['companyName'] |
||||
|
# 查找包含关键字的段落 |
||||
|
paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', batch) |
||||
|
t["yuanwen"]=paragraphs[0] |
||||
|
return error_places |
||||
|
else: |
||||
|
return error_places |
@ -0,0 +1,226 @@ |
|||||
|
# -*- coding: utf-8 -*-
# Module-level setup for the document typo-checking pipeline:
# a PaddleNLP text-correction task proposes candidate typos and a Qwen
# agent is used to confirm them.
from qwen_agent.agents import Assistant
from docx import Document
from pprint import pprint
import re
from paddlenlp import Taskflow
import json
import time
import json_repair

# NOTE: start_time is set at import; getDocumentError times its own runs.
start_time = time.time()
# Text-correction task used to propose candidate typos.
corrector = Taskflow("text_correction")
llm_cfg = {
    # 'model': 'qwen1.5-72b-chat',
    'model': "qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                # description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
# Prompt appended after the numbered questions built in process_batch; the
# LLM's reply must match the JSON shape parsed there.
prompt='''
请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出解析,没有错则不用给解析
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","jianyi","解析"},{"placeName":"序号","回答":"答案","jianyi","解析"}],不做过多的解释,严格按回答格式作答;
'''
||||
|
def getDocxToTextAll(name):
    """Extract every non-empty paragraph of a .docx file and write them,
    newline-separated, to checkDocumentError.txt.

    :param name: path of the .docx file to read
    """
    document = Document(name)
    # Keep every paragraph that is not blank (the original also created
    # levelList/addStart/levelText/i, all unused).
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    print("checkDocumentError", len(words))
    text = '\n'.join(words)
    with open("checkDocumentError.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
||||
|
def getDocumentError(filename):
    """Run the typo check over a whole .docx file.

    :param filename: path of the .docx file
    :return: list of confirmed typo records
    """
    getDocxToTextAll(filename)
    # Time this call, not the module import (the original reused the
    # module-level start_time, so repeated calls reported growing times).
    start = time.time()
    error_places = []
    for batch in read_file_in_batches('checkDocumentError.txt'):
        res = process_batch(batch)
        if len(res) > 0:
            error_places.extend(res)
    pprint(error_places)
    elapsed_time = time.time() - start
    print(f"checkDocumentError程序执行时间: {elapsed_time} 秒")
    return error_places
||||
|
# |
||||
|
# 过滤掉填充的None(如果有的话) |
||||
|
# chunk = [line for line in chunk if line is not None] |
||||
|
# res = m.correct_batch(sentences) |
||||
|
# print("DocumentError",res) |
||||
|
# lines_with_greeting = [place for place in res if len( place['errors'])>0] |
||||
|
# error_places.extend(lines_with_greeting) |
||||
|
# pprint(error_places) |
||||
|
# if len(lines_with_greeting)>0: |
||||
|
# for t in error_places: |
||||
|
# keyword= t['source'] |
||||
|
# |
||||
|
# errorWord=t["errors"] |
||||
|
# # 查找包含关键字的段落 |
||||
|
# paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext) |
||||
|
# t["yuanwen"]=paragraphs[0] |
||||
|
# return error_places |
||||
|
# else: |
||||
|
# return error_places |
||||
|
# return lines_with_greeting |
||||
|
def read_file_in_batches(file_path, batch_size=5000):
    """
    Lazily read a text file in chunks of roughly batch_size characters.

    :param file_path: path of the file to read
    :param batch_size: minimum number of characters per yielded chunk
    :return: generator yielding one chunk of text at a time
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        pending = []
        pending_len = 0
        for row in handle:
            pending.append(row)
            pending_len += len(row)
            if pending_len >= batch_size:
                # Threshold reached: flush the accumulated lines.
                yield ''.join(pending)
                pending, pending_len = [], 0
        if pending:
            # Flush whatever is left after the last full chunk.
            yield ''.join(pending)
||||
|
|
||||
|
def process_batch(batch):
    """Check one batch of text for typos.

    The Paddle text-corrector proposes candidate typos; the LLM is then
    asked to confirm each candidate sentence by sentence.

    :param batch: one batch of text (str)
    :return: list of confirmed typo records (may be empty)
    """
    # Split into sentences and drop blanks.
    sentences = re.split(r'[。\n]', batch)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    res = corrector(sentences)
    lines_with_greeting = [place for place in res if len(place['errors']) > 0]
    if len(lines_with_greeting) == 0:
        # Nothing suspicious: skip the expensive LLM round-trip entirely
        # (the original still called the bot with an empty question list).
        return []
    # Build one numbered question per suspicious sentence; placeName in the
    # LLM reply is the index back into keyword_list.
    wenti = []
    keyword_list = []
    num = 0
    for t in lines_with_greeting:
        temp_errorWords = []
        keyword = t['source']
        keyword_list.append(keyword)
        for item in t["errors"]:
            for key, value in item['correction'].items():
                temp_errorWords.append(key)
        wenti.append("{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
        num += 1
    words = "\n".join(wenti)
    messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
    print(words + prompt)
    runList = []
    for rsp in bot.run(messages):
        runList.append(rsp)
    data = runList[-1][0]["content"]
    pprint(data)
    parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
    # Keep confirmed typos, mapping the index back to the source sentence.
    err = [
        {**place, "placeName": keyword_list[int(place["placeName"])], "jianyi": place["解析"]}
        for place in parsed_data
        if place['回答'] == '是'
    ]
    pprint(err)
    return err
||||
|
|
||||
|
# from flask import Flask, request, jsonify |
||||
|
# import os |
||||
|
# # from checkPlaceName import checkPlaceName |
||||
|
# # from checkRepeatText import checkRepeatText |
||||
|
# # from checkCompanyName import checkCompanyName |
||||
|
# # from documentError import getDocumentError |
||||
|
# app = Flask(__name__) |
||||
|
# UPLOAD_FOLDER = 'uploads' |
||||
|
# if not os.path.exists(UPLOAD_FOLDER): |
||||
|
# os.makedirs(UPLOAD_FOLDER) |
||||
|
# @app.route('/upload', methods=['POST']) |
||||
|
# def upload_file(): |
||||
|
# if 'file' not in request.files: |
||||
|
# return jsonify({"error": "No file part"}), 400 |
||||
|
# file = request.files['file'] |
||||
|
# if file.filename == '': |
||||
|
# return jsonify({"error": "No selected file"}), 400 |
||||
|
# if file: |
||||
|
# filename = file.filename |
||||
|
# file.save(os.path.join(UPLOAD_FOLDER,filename)) |
||||
|
# return jsonify({"message": "File uploaded successfully"}), 200 |
||||
|
# # @app.route('/checkPlaceName/<filename>', methods=['GET']) |
||||
|
# # def checkPlaceNameWeb(filename): |
||||
|
# # return checkPlaceName(filename) |
||||
|
# # @app.route('/checkRepeatText/<filename>', methods=['GET']) |
||||
|
# # def checkRepeatTextWeb(filename): |
||||
|
# # return checkRepeatText(filename) |
||||
|
# # @app.route('/checkCompanyName/<filename>', methods=['GET']) |
||||
|
# # def checkCompanyNameWeb(filename): |
||||
|
# # return checkCompanyName(filename) |
||||
|
# # @app.route('/checkDocumentErrorWeb/<filename>', methods=['GET']) |
||||
|
# # def checkDocumentErrorWeb(filename): |
||||
|
# # return getDocumentError(filename) |
||||
|
# if __name__ == '__main__': |
||||
|
# app.run(host='0.0.0.0',port=80) |
||||
|
# from transformers import AutoTokenizer, AutoModel, GenerationConfig,AutoModelForCausalLM |
||||
|
# import os |
||||
|
# os.environ['NPU_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7' |
||||
|
# os.environ['ASCEND_RT_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7' |
||||
|
# import torch |
||||
|
# import torch_npu |
||||
|
# from torch_npu.contrib import transfer_to_npu |
||||
|
|
||||
|
# from accelerate import Accelerator |
||||
|
|
||||
|
# # device = 'cpu' |
||||
|
# accelerator = Accelerator() |
||||
|
# # torch_device = "npu" # 0~7 |
||||
|
# # torch.npu.set_device(torch.device(torch_device)) |
||||
|
# devices = [] |
||||
|
# for i in range(8): |
||||
|
# devices.append(f"npu:{i}") |
||||
|
# print(devices) |
||||
|
# torch.npu.set_device(devices) |
||||
|
# torch.npu.set_compile_mode(jit_compile=False) |
||||
|
# model_name_or_path = '/mnt/sdc/qwen/Qwen2-72B-Instruct' |
||||
|
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) |
||||
|
# # model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, device_map="auto",torch_dtype=torch.float16) |
||||
|
# model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, device_map=accelerator,torch_dtype=torch.float16).npu().eval() |
@ -0,0 +1,153 @@ |
|||||
|
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time

# NER task used to spot place / organization names in the text.
tagTask = Taskflow("ner")
# Prompt appended after the candidate names; the reply must match the JSON
# shape parsed in process_batch.  The original said "三种选项" while listing
# four options — the count is corrected here so the LLM is not confused.
prompt='''
.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]四种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
llm_cfg = {
    # 'model': 'qwen1.5-72b-chat',
    'model': "qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                # description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
||||
|
#获取全文内容 |
||||
|
def getDocxToTextAll(name):
    """Extract every non-empty paragraph of a .docx file and write them,
    newline-separated, to checkPlaceName.txt.

    :param name: path of the .docx file to read
    """
    document = Document(name)
    # Keep every paragraph that is not blank (the original also created
    # levelList/addStart/levelText/i, all unused).
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    print("placeNameTask", len(words))
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
||||
|
|
||||
|
#得到全文和地名有关的内容 |
||||
|
def placeNameTask(text):
    """Extract organization / world-region names from *text* via NER.

    Consecutive qualifying tokens are merged into one name and the final
    list is de-duplicated preserving order.

    :param text: the text to tag
    :return: list of unique candidate names
    """
    res = tagTask(text)
    print(res)
    placeList = []
    prev_was_place = False
    for zuhe in res:
        label = zuhe[1]
        # Single label test (the original duplicated this condition in two
        # separate if-blocks plus a continue).
        is_place = label.find("组织机构类") >= 0 or label.find("世界地区类") >= 0
        if is_place:
            cleaned = zuhe[0].replace("\n", "")
            if prev_was_place and placeList:
                # Continuation of the previous name: merge into it.
                placeList[-1] = placeList[-1] + cleaned
            else:
                placeList.append(cleaned)
        prev_was_place = is_place
    # De-duplicate while preserving order.
    return list(dict.fromkeys(placeList))
||||
|
#主方法 |
||||
|
def checkPlaceName(filename):
    """Check every place name found in a .docx file and return the ones
    judged wrong by the LLM.

    :param filename: path of the .docx file
    :return: list of wrong-place-name records
    """
    getDocxToTextAll(filename)
    started = time.time()
    problems = []
    for chunk in read_file_in_batches('checkPlaceName.txt'):
        found = process_batch(chunk)
        if len(found) > 0:
            problems.extend(found)
    pprint(problems)
    # Report how long this run took.
    elapsed_time = time.time() - started
    print(f"checkPlaceName程序执行时间: {elapsed_time} 秒")
    return problems
||||
|
|
||||
|
def read_file_in_batches(file_path, batch_size=5000):
    """
    Yield successive chunks of a text file, each at least batch_size
    characters long (except possibly the last one).

    :param file_path: path of the file to read
    :param batch_size: character threshold per chunk
    :return: generator of text chunks
    """
    with open(file_path, 'r', encoding='utf-8') as src:
        buffered = ''
        for line in src:
            buffered += line
            if len(buffered) >= batch_size:
                yield buffered
                buffered = ''
        # Emit the trailing partial chunk, if any.
        if buffered:
            yield buffered
||||
|
|
||||
|
def process_batch(batch):
    """Validate the place names found in one batch of text.

    :param batch: one batch of text (str)
    :return: list of records for names the LLM judged wrong, each with the
             containing paragraph attached under "yuanwen" when found
    """
    propnList = placeNameTask(batch)
    propnStr = ",".join(propnList)
    print("placeNameTask", propnStr)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    for rsp in bot.run(messages):
        runList.append(rsp)
    data = runList[-1][0]["content"]
    print("placeNameTask", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    for item in parsed_data:
        print(f"地名: {item['placeName']}, 回答: {item['回答']}")
    # Keep only the names judged wrong.
    error_places = [place for place in parsed_data if place['回答'] == '错误']
    print("placeNameTask", error_places)
    for t in error_places:
        keyword = t['placeName']
        # Attach the paragraph containing the name.  The pattern only
        # matches lines ending in '\n', so guard against an empty result
        # (original code raised IndexError here).
        paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', batch)
        if paragraphs:
            t["yuanwen"] = paragraphs[0]
    return error_places
@ -0,0 +1,160 @@ |
|||||
|
import uuid
from langchain_chroma import Chroma
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from paddlenlp import Taskflow
import re
import time
from docx import Document

# Pairwise text-similarity task used to confirm near-duplicate paragraphs.
similarity = Taskflow("text_similarity", truncation=True, max_length=102400)
# SECURITY: the DashScope API key is hard-coded below; move it to an
# environment variable / secret store before sharing or deploying this file.
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
vector_store_path = "vector_store"
# Persistent vector store shared by getDocxToText / checkRepeatText.
vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
||||
|
def getOutlineLevel(inputXml):
    """
    Extract the number from a <w:outlineLvl w:val="number"/> XML fragment.

    :param inputXml: XML string containing a w:outlineLvl element
    :return: the outline level as a string of digits
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    fragment = inputXml[start_index:end_index + 1]
    # Raw string avoids the invalid-escape-sequence warning of "\d+".
    return re.search(r"\d+", fragment).group()
||||
|
|
||||
|
|
||||
|
def isTitle(paragraph):
    """
    Decide whether a paragraph carries an outline (heading) level.

    :param paragraph: a python-docx paragraph
    :return: None for plain body text; otherwise the outline level as a
             string ("0" = level-1 heading, "1" = level-2, "2" = level-3)
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None
    # Outline level set directly on the paragraph itself?
    raw_xml = paragraph._p.xml
    if raw_xml.find('<w:outlineLvl') >= 0:
        return getOutlineLevel(raw_xml)
    # Otherwise walk the style chain (style -> base_style -> ...) looking
    # for an inherited outline level.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if style_xml.find('<w:outlineLvl') >= 0:
            return getOutlineLevel(style_xml)
        style = style.base_style
    # No outline level anywhere: plain body text.
    return None
||||
|
|
||||
|
#获取文档中 详细设计方案 章节的所有内容 |
||||
|
def getDocxToText(docxPath, titleName):
    """Extract the paragraphs of interest from a .docx file and index them
    into the vector store.

    When titleName is given, only the content under the level-0 heading
    whose text contains titleName is numbered and collected.

    :param docxPath: path of the .docx file
    :param titleName: heading to restrict extraction to (falsy = whole file)
    :return: (words, uuids) - the collected paragraph strings and the ids
             of the vector-store entries created for them
    :raises Exception: if no text at all was extracted
    """
    document = Document(docxPath)
    levelList = []
    words = []
    addStart = False
    i = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip blank paragraphs
            print("非空")
            if titleName:
                level = isTitle(paragraph)
                # A new level-0 heading ends the target chapter.
                if addStart and level == "0":
                    addStart = False
                if level == "0" and text.find(titleName) >= 0:
                    addStart = True
                if level:
                    levelList.append("{}:".format(level) + paragraph.text)
                else:
                    if addStart:
                        # Skip figure captions and notes.
                        if text.startswith("图") or text.startswith("注:"):
                            continue
                        i = i + 1
                        words.append("第{}个段落:".format(i) + text)
            else:
                words.append(text)
    print("checkRepeatText", len(words))
    if len(words) == 0:
        # Replaces the original placeholder message "I know python!".
        raise Exception("未能从文档中提取到任何文本内容")
    text = '\n'.join(words)
    # NOTE(review): written with the platform-default encoding, and the
    # TextLoader below reads with the default too — change both together
    # if utf-8 is required.
    with open("checkRepeatText.txt", 'w', ) as txt_file:
        txt_file.write(text)
    time.sleep(3)
    loader = TextLoader(file_path='checkRepeatText.txt')
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10, add_start_index=True,
                                                   separators=["\n\n", "\n"])
    splits = text_splitter.split_documents(docs)
    uuids = [str(uuid.uuid4()) for _ in range(len(splits))]
    print(len(splits))
    print(len(uuids))
    vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
    vectorstore.add_documents(documents=splits, ids=uuids)
    # Poll until the newly added documents become searchable.
    while True:
        time.sleep(0.3)
        ress = vectorstore.similarity_search(words[0])
        if (len(ress) > 0):
            break
    return words, uuids
||||
|
|
||||
|
|
||||
|
# @app.route('/checkRepeatText/<filename>', methods=['GET']) |
||||
|
def checkRepeatText(filename, titleName):
    """Find near-duplicate paragraphs in a .docx file.

    Each extracted paragraph is searched against the vector store; pairs
    whose similarity exceeds 0.95 are collected, each pair reported once.

    :param filename: path of the .docx file
    :param titleName: heading restricting which chapter is checked
    :return: list of {"yuanwen1": ..., "yuanwen2": ...} duplicate pairs
    """
    words, uuids = getDocxToText(filename, titleName)
    # Defined before the try so the final print/return are always safe.
    reslist = []
    try:
        count = 0
        for entry in words:
            count += 1
            result = vectorstore.similarity_search(entry)
            entry_tag = entry.split(":")[0]
            # Slice after the first colon (the original used split(":")[1]
            # in one print, which raised IndexError on colon-less text).
            entry_body = entry[entry.find(':') + 1:]
            print(entry)
            for content in result:
                text = content.page_content
                tag = text.split(":")[0].replace('\n', '')
                if entry_tag.find(tag) >= 0:
                    # Same (numbered) paragraph: not a duplicate.
                    continue
                text_body = text[text.find(':') + 1:]
                res = similarity([[entry_body, text_body]])
                print(res[0]["similarity"])
                if res[0]["similarity"] > 0.95:
                    # Report each duplicated paragraph only once.
                    already_seen = any(entry_body in pair.values() for pair in reslist)
                    if not already_seen:
                        reslist.append({"yuanwen1": entry_body, "yuanwen2": text_body})
                        print(reslist)
                    print(entry_body + "\n" + text_body)
    except Exception as e:
        # Best-effort scan: log and fall through so cleanup still runs.
        print("发生异常:", e)
    finally:
        # Always drop this run's temporary vector-store entries.
        vectorstore.delete(ids=uuids)
        print("已删除")
    print(reslist)
    return reslist
@ -0,0 +1,712 @@ |
|||||
|
""" |
||||
|
This module will parse the JSON file following the BNF definition: |
||||
|
|
||||
|
<json> ::= <container> |
||||
|
|
||||
|
<primitive> ::= <number> | <string> | <boolean> |
||||
|
; Where: |
||||
|
; <number> is a valid real number expressed in one of a number of given formats |
||||
|
; <string> is a string of valid characters enclosed in quotes |
||||
|
; <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted) |
||||
|
|
||||
|
<container> ::= <object> | <array> |
||||
|
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas |
||||
|
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members' |
||||
|
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value |
||||
|
|
||||
|
If something is wrong (a missing parenthesis or quote, for example) it will use a few simple heuristics to fix the JSON string:
||||
|
- Add the missing parentheses if the parser believes that the array or object should be closed |
||||
|
- Quote strings or add missing single quotes |
||||
|
- Adjust whitespaces and remove line breaks |
||||
|
|
||||
|
All supported use cases are in the unit tests |
||||
|
""" |
||||
|
|
||||
|
import os |
||||
|
import json |
||||
|
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal |
||||
|
|
||||
|
|
||||
|
class StringFileWrapper:
    """Adapter exposing a text file descriptor through the small part of
    the string interface (indexing, slicing, len) the parser needs, so
    file and string inputs can share one code path."""

    def __init__(self, fd: TextIO) -> None:
        self.fd = fd
        self.length: int = 0  # lazily computed file-length cache

    def __getitem__(self, index: Union[int, slice]) -> str:
        if isinstance(index, slice):
            # Read the slice, then restore the position to its start.
            self.fd.seek(index.start)
            chunk = self.fd.read(index.stop - index.start)
            self.fd.seek(index.start)
            return chunk
        self.fd.seek(index)
        return self.fd.read(1)

    def __len__(self) -> int:
        if self.length < 1:
            # Compute once: seek to EOF for the size, then restore position.
            saved = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(saved)
        return self.length
||||
|
|
||||
|
|
||||
|
class LoggerConfig:
    """Holds the parser's logging state: collected entries, context-window
    size, and the active log level ("none" disables logging)."""

    def __init__(self, log_level: Optional[str]):
        self.log: List[Dict[str, str]] = []
        self.window: int = 10
        # Falsy log_level (None or "") collapses to "none".
        self.log_level: str = log_level or "none"
||||
|
|
||||
|
|
||||
|
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None] |
||||
|
|
||||
|
|
||||
|
class JSONParser: |
||||
|
def __init__( |
||||
|
self, |
||||
|
json_str: Union[str, StringFileWrapper], |
||||
|
json_fd: Optional[TextIO], |
||||
|
logging: Optional[bool], |
||||
|
) -> None: |
||||
|
# The string to parse |
||||
|
self.json_str = json_str |
||||
|
# Alternatively, the file description with a json file in it |
||||
|
if json_fd: |
||||
|
# This is a trick we do to treat the file wrapper as an array |
||||
|
self.json_str = StringFileWrapper(json_fd) |
||||
|
# Index is our iterator that will keep track of which character we are looking at right now |
||||
|
self.index: int = 0 |
||||
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value |
||||
|
self.context: list[str] = [] |
||||
|
# Use this to log the activity, but only if logging is active |
||||
|
self.logger = LoggerConfig(log_level="info" if logging else None) |
||||
|
|
||||
|
def parse( |
||||
|
self, |
||||
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: |
||||
|
json = self.parse_json() |
||||
|
if self.index < len(self.json_str): |
||||
|
self.log( |
||||
|
"The parser returned early, checking if there's more json elements", |
||||
|
"info", |
||||
|
) |
||||
|
json = [json] |
||||
|
last_index = self.index |
||||
|
while self.index < len(self.json_str): |
||||
|
j = self.parse_json() |
||||
|
if j != "": |
||||
|
json.append(j) |
||||
|
if self.index == last_index: |
||||
|
self.index += 1 |
||||
|
last_index = self.index |
||||
|
# If nothing extra was found, don't return an array |
||||
|
if len(json) == 1: |
||||
|
self.log( |
||||
|
"There were no more elements, returning the element without the array", |
||||
|
"info", |
||||
|
) |
||||
|
json = json[0] |
||||
|
if self.logger.log_level == "none": |
||||
|
return json |
||||
|
else: |
||||
|
return json, self.logger.log |
||||
|
|
||||
|
def parse_json( |
||||
|
self, |
||||
|
) -> JSONReturnType: |
||||
|
while True: |
||||
|
char = self.get_char_at() |
||||
|
# This parser will ignore any basic element (string or number) that is not inside an array or object |
||||
|
is_in_context = len(self.context) > 0 |
||||
|
# False means that we are at the end of the string provided |
||||
|
if char is False: |
||||
|
return "" |
||||
|
# <object> starts with '{' |
||||
|
elif char == "{": |
||||
|
self.index += 1 |
||||
|
return self.parse_object() |
||||
|
# <array> starts with '[' |
||||
|
elif char == "[": |
||||
|
self.index += 1 |
||||
|
return self.parse_array() |
||||
|
# there can be an edge case in which a key is empty and at the end of an object |
||||
|
# like "key": }. We return an empty string here to close the object properly |
||||
|
elif char == "}": |
||||
|
self.log( |
||||
|
"At the end of an object we found a key with missing value, skipping", |
||||
|
"info", |
||||
|
) |
||||
|
return "" |
||||
|
# <string> starts with a quote |
||||
|
elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()): |
||||
|
return self.parse_string() |
||||
|
# <number> starts with [0-9] or minus |
||||
|
elif is_in_context and (char.isdigit() or char == "-" or char == "."): |
||||
|
return self.parse_number() |
||||
|
# If everything else fails, we just ignore and move on |
||||
|
else: |
||||
|
self.index += 1 |
||||
|
|
||||
|
def parse_object(self) -> Dict[str, Any]: |
||||
|
# <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members' |
||||
|
obj = {} |
||||
|
# Stop when you either find the closing parentheses or you have iterated over the entire string |
||||
|
while (self.get_char_at() or "}") != "}": |
||||
|
# This is what we expect to find: |
||||
|
# <member> ::= <string> ': ' <json> |
||||
|
|
||||
|
# Skip filler whitespaces |
||||
|
self.skip_whitespaces_at() |
||||
|
|
||||
|
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on |
||||
|
if (self.get_char_at() or "") == ":": |
||||
|
self.log( |
||||
|
"While parsing an object we found a : before a key, ignoring", |
||||
|
"info", |
||||
|
) |
||||
|
self.index += 1 |
||||
|
|
||||
|
# We are now searching for they string key |
||||
|
# Context is used in the string parser to manage the lack of quotes |
||||
|
self.set_context("object_key") |
||||
|
|
||||
|
self.skip_whitespaces_at() |
||||
|
|
||||
|
# <member> starts with a <string> |
||||
|
key = "" |
||||
|
while self.get_char_at(): |
||||
|
key = str(self.parse_string()) |
||||
|
|
||||
|
if key != "" or (key == "" and self.get_char_at() == ":"): |
||||
|
# If the string is empty but there is a object divider, we are done here |
||||
|
break |
||||
|
|
||||
|
self.skip_whitespaces_at() |
||||
|
|
||||
|
# We reached the end here |
||||
|
if (self.get_char_at() or "}") == "}": |
||||
|
continue |
||||
|
|
||||
|
self.skip_whitespaces_at() |
||||
|
|
||||
|
# An extreme case of missing ":" after a key |
||||
|
if (self.get_char_at() or "") != ":": |
||||
|
self.log( |
||||
|
"While parsing an object we missed a : after a key", |
||||
|
"info", |
||||
|
) |
||||
|
|
||||
|
self.index += 1 |
||||
|
self.reset_context() |
||||
|
self.set_context("object_value") |
||||
|
# The value can be any valid json |
||||
|
value = self.parse_json() |
||||
|
|
||||
|
# Reset context since our job is done |
||||
|
self.reset_context() |
||||
|
obj[key] = value |
||||
|
|
||||
|
if (self.get_char_at() or "") in [",", "'", '"']: |
||||
|
self.index += 1 |
||||
|
|
||||
|
# Remove trailing spaces |
||||
|
self.skip_whitespaces_at() |
||||
|
|
||||
|
self.index += 1 |
||||
|
return obj |
||||
|
|
||||
|
def parse_array(self) -> List[Any]: |
||||
|
# <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas |
||||
|
arr = [] |
||||
|
self.set_context("array") |
||||
|
# Stop when you either find the closing parentheses or you have iterated over the entire string |
||||
|
while (self.get_char_at() or "]") != "]": |
||||
|
self.skip_whitespaces_at() |
||||
|
value = self.parse_json() |
||||
|
|
||||
|
# It is possible that parse_json() returns nothing valid, so we stop |
||||
|
if value == "": |
||||
|
break |
||||
|
|
||||
|
if value == "..." and self.get_char_at(-1) == ".": |
||||
|
self.log( |
||||
|
"While parsing an array, found a stray '...'; ignoring it", "info" |
||||
|
) |
||||
|
else: |
||||
|
arr.append(value) |
||||
|
|
||||
|
# skip over whitespace after a value but before closing ] |
||||
|
char = self.get_char_at() |
||||
|
while char and (char.isspace() or char == ","): |
||||
|
self.index += 1 |
||||
|
char = self.get_char_at() |
||||
|
|
||||
|
# Especially at the end of an LLM generated json you might miss the last "]" |
||||
|
char = self.get_char_at() |
||||
|
if char and char != "]": |
||||
|
self.log( |
||||
|
"While parsing an array we missed the closing ], adding it back", "info" |
||||
|
) |
||||
|
self.index -= 1 |
||||
|
|
||||
|
self.index += 1 |
||||
|
self.reset_context() |
||||
|
return arr |
||||
|
|
||||
|
def parse_string(self) -> Union[str, bool, None]: |
||||
|
# <string> is a string of valid characters enclosed in quotes |
||||
|
# i.e. { name: "John" } |
||||
|
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here |
||||
|
|
||||
|
# Flag to manage corner cases related to missing starting quote |
||||
|
missing_quotes = False |
||||
|
doubled_quotes = False |
||||
|
lstring_delimiter = rstring_delimiter = '"' |
||||
|
|
||||
|
char = self.get_char_at() |
||||
|
# A valid string can only start with a valid quote or, in our case, with a literal |
||||
|
while char and char not in ['"', "'", "“"] and not char.isalnum(): |
||||
|
self.index += 1 |
||||
|
char = self.get_char_at() |
||||
|
|
||||
|
if not char: |
||||
|
# This is an empty string |
||||
|
return "" |
||||
|
|
||||
|
# Ensuring we use the right delimiter |
||||
|
if char == "'": |
||||
|
lstring_delimiter = rstring_delimiter = "'" |
||||
|
elif char == "“": |
||||
|
lstring_delimiter = "“" |
||||
|
rstring_delimiter = "”" |
||||
|
elif char.isalnum(): |
||||
|
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid |
||||
|
# But remember, object keys are only of type string |
||||
|
if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key": |
||||
|
value = self.parse_boolean_or_null() |
||||
|
if value != "": |
||||
|
return value |
||||
|
self.log( |
||||
|
"While parsing a string, we found a literal instead of a quote", |
||||
|
"info", |
||||
|
) |
||||
|
self.log( |
||||
|
"While parsing a string, we found no starting quote. Will add the quote back", |
||||
|
"info", |
||||
|
) |
||||
|
missing_quotes = True |
||||
|
|
||||
|
if not missing_quotes: |
||||
|
self.index += 1 |
||||
|
|
||||
|
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop |
||||
|
if self.get_char_at() == lstring_delimiter: |
||||
|
# If it's an empty key, this was easy |
||||
|
if self.get_context() == "object_key" and self.get_char_at(1) == ":": |
||||
|
self.index += 1 |
||||
|
return "" |
||||
|
# Find the next delimiter |
||||
|
i = 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
while next_c and next_c != rstring_delimiter: |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
# Now check that the next character is also a delimiter to ensure that we have ""....."" |
||||
|
# In that case we ignore this rstring delimiter |
||||
|
if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter: |
||||
|
self.log( |
||||
|
"While parsing a string, we found a valid starting doubled quote, ignoring it", |
||||
|
"info", |
||||
|
) |
||||
|
doubled_quotes = True |
||||
|
self.index += 1 |
||||
|
else: |
||||
|
# Ok this is not a doubled quote, check if this is an empty string or not |
||||
|
i = 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
while next_c and next_c.isspace(): |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
if next_c not in [",", "]", "}"]: |
||||
|
self.log( |
||||
|
"While parsing a string, we found a doubled quote but it was a mistake, removing one quote", |
||||
|
"info", |
||||
|
) |
||||
|
self.index += 1 |
||||
|
|
||||
|
# Initialize our return value |
||||
|
string_acc = "" |
||||
|
|
||||
|
# Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object |
||||
|
# In that case we need to use the ":|,|}" characters as terminators of the string |
||||
|
# So this will stop if: |
||||
|
# * It finds a closing quote |
||||
|
# * It iterated over the entire sequence |
||||
|
# * If we are fixing missing quotes in an object, when it finds the special terminators |
||||
|
char = self.get_char_at() |
||||
|
while char and char != rstring_delimiter: |
||||
|
if missing_quotes: |
||||
|
if self.get_context() == "object_key" and ( |
||||
|
char == ":" or char.isspace() |
||||
|
): |
||||
|
self.log( |
||||
|
"While parsing a string missing the left delimiter in object key context, we found a :, stopping here", |
||||
|
"info", |
||||
|
) |
||||
|
break |
||||
|
elif self.get_context() == "object_value" and char in [",", "}"]: |
||||
|
rstring_delimiter_missing = True |
||||
|
# check if this is a case in which the closing comma is NOT missing instead |
||||
|
i = 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
while next_c and next_c != rstring_delimiter: |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
if next_c: |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
# found a delimiter, now we need to check that is followed strictly by a comma or brace |
||||
|
while next_c and next_c.isspace(): |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
if next_c and next_c in [",", "}"]: |
||||
|
rstring_delimiter_missing = False |
||||
|
if rstring_delimiter_missing: |
||||
|
self.log( |
||||
|
"While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here", |
||||
|
"info", |
||||
|
) |
||||
|
break |
||||
|
string_acc += char |
||||
|
self.index += 1 |
||||
|
char = self.get_char_at() |
||||
|
if char and len(string_acc) > 0 and string_acc[-1] == "\\": |
||||
|
# This is a special case, if people use real strings this might happen |
||||
|
self.log("Found a stray escape sequence, normalizing it", "info") |
||||
|
string_acc = string_acc[:-1] |
||||
|
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]: |
||||
|
escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"} |
||||
|
string_acc += escape_seqs.get(char, char) or char |
||||
|
self.index += 1 |
||||
|
char = self.get_char_at() |
||||
|
# ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here |
||||
|
if char == rstring_delimiter: |
||||
|
# Special case here, in case of double quotes one after another |
||||
|
if doubled_quotes and self.get_char_at(1) == rstring_delimiter: |
||||
|
self.log( |
||||
|
"While parsing a string, we found a doubled quote, ignoring it", |
||||
|
"info", |
||||
|
) |
||||
|
self.index += 1 |
||||
|
elif missing_quotes and self.get_context() == "object_value": |
||||
|
# In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key |
||||
|
i = 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
while next_c and next_c not in [ |
||||
|
rstring_delimiter, |
||||
|
lstring_delimiter, |
||||
|
]: |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
if next_c: |
||||
|
# We found a quote, now let's make sure there's a ":" following |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
# found a delimiter, now we need to check that is followed strictly by a comma or brace |
||||
|
while next_c and next_c.isspace(): |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
if next_c and next_c == ":": |
||||
|
# Reset the cursor |
||||
|
self.index -= 1 |
||||
|
char = self.get_char_at() |
||||
|
self.log( |
||||
|
"In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.", |
||||
|
"info", |
||||
|
) |
||||
|
break |
||||
|
else: |
||||
|
# Check if eventually there is a rstring delimiter, otherwise we bail |
||||
|
i = 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
check_comma_in_object_value = True |
||||
|
while next_c and next_c not in [ |
||||
|
rstring_delimiter, |
||||
|
lstring_delimiter, |
||||
|
]: |
||||
|
# This is a bit of a weird workaround, essentially in object_value context we don't always break on commas |
||||
|
# This is because the routine after will make sure to correct any bad guess and this solves a corner case |
||||
|
if check_comma_in_object_value and next_c.isalpha(): |
||||
|
check_comma_in_object_value = False |
||||
|
# If we are in an object context, let's check for the right delimiters |
||||
|
if ( |
||||
|
("object_key" in self.context and next_c in [":", "}"]) |
||||
|
or ("object_value" in self.context and next_c == "}") |
||||
|
or ("array" in self.context and next_c in ["]", ","]) |
||||
|
or ( |
||||
|
check_comma_in_object_value |
||||
|
and self.get_context() == "object_value" |
||||
|
and next_c == "," |
||||
|
) |
||||
|
): |
||||
|
break |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
# If we stopped for a comma in object_value context, let's check if find a "} at the end of the string |
||||
|
if next_c == "," and self.get_context() == "object_value": |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
while next_c and next_c != rstring_delimiter: |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a } |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
while next_c and next_c.isspace(): |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
if next_c == "}": |
||||
|
# OK this is valid then |
||||
|
self.log( |
||||
|
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it", |
||||
|
"info", |
||||
|
) |
||||
|
string_acc += str(char) |
||||
|
self.index += 1 |
||||
|
char = self.get_char_at() |
||||
|
elif next_c == rstring_delimiter: |
||||
|
if self.get_context() == "object_value": |
||||
|
# But this might not be it! This could be just a missing comma |
||||
|
# We found a delimiter and we need to check if this is a key |
||||
|
# so find a rstring_delimiter and a colon after |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
while next_c and next_c != rstring_delimiter: |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
while next_c and next_c != ":": |
||||
|
if next_c in [ |
||||
|
lstring_delimiter, |
||||
|
rstring_delimiter, |
||||
|
",", |
||||
|
]: |
||||
|
break |
||||
|
i += 1 |
||||
|
next_c = self.get_char_at(i) |
||||
|
# Only if we fail to find a ':' then we know this is misplaced quote |
||||
|
if next_c != ":": |
||||
|
self.log( |
||||
|
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it", |
||||
|
"info", |
||||
|
) |
||||
|
string_acc += str(char) |
||||
|
self.index += 1 |
||||
|
char = self.get_char_at() |
||||
|
|
||||
|
if ( |
||||
|
char |
||||
|
and missing_quotes |
||||
|
and self.get_context() == "object_key" |
||||
|
and char.isspace() |
||||
|
): |
||||
|
self.log( |
||||
|
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value", |
||||
|
"info", |
||||
|
) |
||||
|
self.skip_whitespaces_at() |
||||
|
if self.get_char_at() not in [":", ","]: |
||||
|
return "" |
||||
|
|
||||
|
# A fallout of the previous special case in the while loop, |
||||
|
# we need to update the index only if we had a closing quote |
||||
|
if char != rstring_delimiter: |
||||
|
self.log( |
||||
|
"While parsing a string, we missed the closing quote, ignoring", |
||||
|
"info", |
||||
|
) |
||||
|
else: |
||||
|
self.index += 1 |
||||
|
|
||||
|
return string_acc.rstrip() |
||||
|
|
||||
|
def parse_number(self) -> Union[float, int, str, JSONReturnType]: |
||||
|
# <number> is a valid real number expressed in one of a number of given formats |
||||
|
number_str = "" |
||||
|
number_chars = set("0123456789-.eE/,") |
||||
|
char = self.get_char_at() |
||||
|
is_array = self.get_context() == "array" |
||||
|
while char and char in number_chars and (char != "," or not is_array): |
||||
|
number_str += char |
||||
|
self.index += 1 |
||||
|
char = self.get_char_at() |
||||
|
if len(number_str) > 1 and number_str[-1] in "-eE/,": |
||||
|
# The number ends with a non valid character for a number/currency, rolling back one |
||||
|
number_str = number_str[:-1] |
||||
|
self.index -= 1 |
||||
|
try: |
||||
|
if "," in number_str: |
||||
|
return str(number_str) |
||||
|
if "." in number_str or "e" in number_str or "E" in number_str: |
||||
|
return float(number_str) |
||||
|
elif number_str == "-": |
||||
|
# If there is a stray "-" this will throw an exception, throw away this character |
||||
|
return self.parse_json() |
||||
|
else: |
||||
|
return int(number_str) |
||||
|
except ValueError: |
||||
|
return number_str |
||||
|
|
||||
|
def parse_boolean_or_null(self) -> Union[bool, str, None]: |
||||
|
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted) |
||||
|
starting_index = self.index |
||||
|
char = (self.get_char_at() or "").lower() |
||||
|
value: Optional[Tuple[str, Optional[bool]]] |
||||
|
if char == "t": |
||||
|
value = ("true", True) |
||||
|
elif char == "f": |
||||
|
value = ("false", False) |
||||
|
elif char == "n": |
||||
|
value = ("null", None) |
||||
|
|
||||
|
if value: |
||||
|
i = 0 |
||||
|
while char and i < len(value[0]) and char == value[0][i]: |
||||
|
i += 1 |
||||
|
self.index += 1 |
||||
|
char = (self.get_char_at() or "").lower() |
||||
|
if i == len(value[0]): |
||||
|
return value[1] |
||||
|
|
||||
|
# If nothing works reset the index before returning |
||||
|
self.index = starting_index |
||||
|
return "" |
||||
|
|
||||
|
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]: |
||||
|
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True |
||||
|
try: |
||||
|
return self.json_str[self.index + count] |
||||
|
except IndexError: |
||||
|
return False |
||||
|
|
||||
|
def skip_whitespaces_at(self) -> None: |
||||
|
""" |
||||
|
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise |
||||
|
""" |
||||
|
try: |
||||
|
char = self.json_str[self.index] |
||||
|
except IndexError: |
||||
|
return |
||||
|
while char.isspace(): |
||||
|
self.index += 1 |
||||
|
try: |
||||
|
char = self.json_str[self.index] |
||||
|
except IndexError: |
||||
|
return |
||||
|
|
||||
|
def set_context(self, value: str) -> None: |
||||
|
# If a value is provided update the context variable and save in stack |
||||
|
if value: |
||||
|
self.context.append(value) |
||||
|
|
||||
|
    def reset_context(self) -> None:
        # Drop the most recent context entry; counterpart of set_context().
        # NOTE(review): pops unconditionally — raises IndexError on an empty stack.
        self.context.pop()
||||
|
|
||||
|
    def get_context(self) -> str:
        # The current context is the top of the stack (e.g. "object_key",
        # "object_value" or "array", as pushed elsewhere in this parser).
        return self.context[-1]
||||
|
|
||||
|
def log(self, text: str, level: str) -> None: |
||||
|
if level == self.logger.log_level: |
||||
|
context = "" |
||||
|
start = max(self.index - self.logger.window, 0) |
||||
|
end = min(self.index + self.logger.window, len(self.json_str)) |
||||
|
context = self.json_str[start:end] |
||||
|
self.logger.log.append( |
||||
|
{ |
||||
|
"text": text, |
||||
|
"context": context, |
||||
|
} |
||||
|
) |
||||
|
|
||||
|
|
||||
|
def repair_json(
    json_str: str = "",
    return_objects: bool = False,
    skip_json_loads: bool = False,
    logging: bool = False,
    json_fd: Optional[TextIO] = None,
    ensure_ascii: bool = True,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Decode a JSON document, repairing it when standard decoding fails.

    By default the repaired JSON string is returned. With `return_objects=True`
    the decoded Python object is returned instead. `skip_json_loads=True`
    bypasses the fast-path call to the built-in json decoder entirely, and
    `logging=True` makes the result a tuple of (parsed value, repair log).
    """
    parser = JSONParser(json_str, json_fd, logging)
    if skip_json_loads:
        parsed_json = parser.parse()
    else:
        # Fast path: let the C-accelerated stdlib decoder try first, and only
        # fall back to the repairing parser when it rejects the input.
        try:
            parsed_json = json.load(json_fd) if json_fd else json.loads(json_str)
        except json.JSONDecodeError:
            parsed_json = parser.parse()
    # Returning the decoded object (rather than a dump) lets this library act
    # as a drop-in replacement for the json module.
    if return_objects or logging:
        return parsed_json
    return json.dumps(parsed_json, ensure_ascii=ensure_ascii)
||||
|
|
||||
|
|
||||
|
def loads(
    json_str: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Drop-in replacement for `json.loads()` that also repairs invalid JSON.

    Thin wrapper over `repair_json()` with `return_objects=True`.
    """
    return repair_json(
        json_str=json_str,
        return_objects=True,
        skip_json_loads=skip_json_loads,
        logging=logging,
    )
||||
|
|
||||
|
|
||||
|
def load(
    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Drop-in replacement for `json.load()` that also repairs invalid JSON.

    Thin wrapper over `repair_json()` with `json_fd=fd` and `return_objects=True`.
    """
    return repair_json(
        json_fd=fd,
        return_objects=True,
        skip_json_loads=skip_json_loads,
        logging=logging,
    )
||||
|
|
||||
|
|
||||
|
def from_file(
    filename: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Wrapper around `load()` that accepts a filename instead of a file object.
    """
    # A context manager guarantees the descriptor is closed even when load()
    # raises; the previous open/close pair leaked the fd on any exception.
    with open(filename) as fd:
        return load(fd, skip_json_loads, logging)
@ -0,0 +1,45 @@ |
|||||
|
import os

from flask import Flask, request, jsonify

from checkPlaceName import checkPlaceName
# from checkRepeatText import checkRepeatText
from checkCompanyName import checkCompanyName
from checkDocumentError import getDocumentError

app = Flask(__name__)

# Directory where uploaded documents are stored before being checked.
UPLOAD_FOLDER = 'uploads'
# exist_ok avoids the check-then-create race of `if not exists: makedirs`
# (TOCTOU) and is a no-op when the directory is already there.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
||||
|
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a multipart file upload and store it inside UPLOAD_FOLDER."""
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    # basename() strips any directory components, so a crafted filename such
    # as "../../etc/passwd" cannot escape the upload directory (path traversal
    # on attacker-controlled input).
    filename = os.path.basename(file.filename)
    if not filename:
        return jsonify({"error": "No selected file"}), 400
    file.save(os.path.join(UPLOAD_FOLDER, filename))
    return jsonify({"message": "File uploaded successfully"}), 200
||||
|
|
||||
|
@app.route('/getDocumentError', methods=['GET'])
def getDocumentErrorWeb():
    """Run the document-error check on the uploaded file named by ?filename=."""
    target = request.args.get('filename')
    return getDocumentError(target)
||||
|
@app.route('/checkPlaceName', methods=['GET'])
def checkPlaceNameWeb():
    """Run the place-name check on the uploaded file named by ?filename=."""
    target = request.args.get('filename')
    return checkPlaceName(target)
||||
|
@app.route('/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
    """Run the repeated-text check on one section of the uploaded file."""
    filename = request.args.get('filename')
    sectionName = request.args.get('sectionName')
    # The `from checkRepeatText import checkRepeatText` import is commented
    # out at the top of this file, so an unguarded call raises NameError and
    # the client sees an opaque 500. Fail explicitly until it is restored.
    if 'checkRepeatText' not in globals():
        return jsonify({"error": "checkRepeatText is not available"}), 501
    return checkRepeatText(filename, sectionName)
||||
|
@app.route('/checkCompanyName', methods=['GET'])
def checkCompanyNameWeb():
    """Run the company-name check on the uploaded file named by ?filename=."""
    target = request.args.get('filename')
    return checkCompanyName(target)
||||
|
|
||||
|
@app.route('/test/<filename>', methods=['GET'])
def test(filename):
    # Diagnostic echo endpoint: returns the <filename> path segment unchanged,
    # useful for verifying that routing works.
    return filename
||||
|
if __name__ == '__main__':
    # Listen on all interfaces on the standard HTTP port (80 requires
    # elevated privileges on most systems).
    app.run(host="0.0.0.0",port=80)