@ -0,0 +1,8 @@ |
|||
# 默认忽略的文件 |
|||
/shelf/ |
|||
/workspace.xml |
|||
# 基于编辑器的 HTTP 客户端请求 |
|||
/httpRequests/ |
|||
# Datasource local storage ignored files |
|||
/dataSources/ |
|||
/dataSources.local.xml |
@ -0,0 +1,6 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="Encoding"> |
|||
<file url="file://$PROJECT_DIR$/ce.txt" charset="GBK" /> |
|||
</component> |
|||
</project> |
@ -0,0 +1,6 @@ |
|||
<component name="InspectionProjectProfileManager"> |
|||
<settings> |
|||
<option name="USE_PROJECT_PROFILE" value="false" /> |
|||
<version value="1.0" /> |
|||
</settings> |
|||
</component> |
@ -0,0 +1,7 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="Black"> |
|||
<option name="sdkName" value="Python 3.9 (venv) (2)" /> |
|||
</component> |
|||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (venv) (2)" project-jdk-type="Python SDK" /> |
|||
</project> |
@ -0,0 +1,8 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<project version="4"> |
|||
<component name="ProjectModuleManager"> |
|||
<modules> |
|||
<module fileurl="file://$PROJECT_DIR$/.idea/python项目39.iml" filepath="$PROJECT_DIR$/.idea/python项目39.iml" /> |
|||
</modules> |
|||
</component> |
|||
</project> |
@ -0,0 +1,10 @@ |
|||
<?xml version="1.0" encoding="UTF-8"?> |
|||
<module type="PYTHON_MODULE" version="4"> |
|||
<component name="NewModuleRootManager"> |
|||
<content url="file://$MODULE_DIR$"> |
|||
<excludeFolder url="file://$MODULE_DIR$/venv" /> |
|||
</content> |
|||
<orderEntry type="inheritedJdk" /> |
|||
<orderEntry type="sourceFolder" forTests="false" /> |
|||
</component> |
|||
</module> |
@ -0,0 +1,258 @@ |
|||
#!/usr/bin/env python |
|||
# -*- coding: utf-8 -*- |
|||
# Created by Charles on 2018/10/10 |
|||
# Function: |
|||
|
|||
import sys |
|||
import requests |
|||
from bs4 import BeautifulSoup |
|||
|
|||
|
|||
ABSTRACT_MAX_LENGTH = 300 # abstract max length |
|||
|
|||
user_agents = [ |
|||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', |
|||
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', |
|||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)' |
|||
' Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36', |
|||
'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 ' |
|||
'(KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net', |
|||
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) ' |
|||
'Chrome/41.0.2228.0 Safari/537.36', |
|||
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, ' |
|||
'like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2', |
|||
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) ' |
|||
'Gecko/20070404 K-Ninja/2.1.3', |
|||
'Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; ' |
|||
'en-US) iNet Browser 4.7', |
|||
'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201', |
|||
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) ' |
|||
'Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866' |
|||
] |
|||
|
|||
# 请求头信息 |
|||
HEADERS = { |
|||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", |
|||
"Content-Type": "application/x-www-form-urlencoded", |
|||
"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36', |
|||
"Referer": "https://www.baidu.com/", |
|||
"Accept-Encoding": "gzip, deflate", |
|||
"Accept-Language": "zh-CN,zh;q=0.9" |
|||
} |
|||
|
|||
baidu_host_url = "https://www.baidu.com" |
|||
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&wd=" |
|||
|
|||
session = requests.Session() |
|||
session.headers = HEADERS |
|||
|
|||
|
|||
def search(keyword, num_results=10, debug=0):
    """
    Search Baidu for *keyword* and collect organic results.

    :param keyword: query string; a falsy value aborts the search.
    :param num_results: maximum number of results to return.
    :param debug: 1 to print progress/diagnostics, 0 to stay silent.
    :return: list of dicts with keys ``title``/``abstract``/``url``/``rank``,
             or ``None`` when *keyword* is empty.
    """
    if not keyword:
        return None

    list_result = []
    page = 1

    # URL of the first results page; later pages come from parse_html().
    next_url = baidu_search_url + keyword

    # Walk the result pages until enough entries were collected
    # or there is no next page.
    while len(list_result) < num_results:
        # Bug fix: propagate the debug flag so parse_html() can report
        # its own parse errors (it was silently dropped before).
        data, next_url = parse_html(next_url, rank_start=len(list_result), debug=debug)
        if data:
            list_result += data
            if debug:
                print("---searching[{}], finish parsing page {}, results number={}: ".format(keyword, page, len(data)))
                for d in data:
                    print(str(d))

        if not next_url:
            if debug:
                print(u"already search the last page。")
            break
        page += 1

    if debug:
        print("\n---search [{}] finished. total results number={}!".format(keyword, len(list_result)))
    # Slicing is safe even when fewer than num_results were collected.
    return list_result[:num_results]
|||
|
|||
|
|||
def parse_html(url, rank_start=0, debug=0):
    """
    Fetch one Baidu results page and scrape its organic entries.

    :param url: results-page URL to fetch.
    :param rank_start: rank already assigned to earlier results; entries
        scraped here continue numbering from this value.
    :param debug: 1 to print parse errors, 0 to stay silent.
    :return: tuple ``(results, next_url)`` — list of result dicts and the URL
        of the next page (``None`` when on the last page); ``(None, None)``
        on any fetch/parse failure.
    """
    try:
        res = session.get(url=url)
        res.encoding = "utf-8"
        root = BeautifulSoup(res.text, "lxml")

        list_data = []
        div_contents = root.find("div", id="content_left")
        for div in div_contents.contents:
            # Keep only Tag children; skip NavigableString whitespace nodes.
            if type(div) != type(div_contents):
                continue

            class_list = div.get("class", [])
            if not class_list:
                continue

            # Organic result blocks all carry the "c-container" class.
            if "c-container" not in class_list:
                continue

            title = ''
            url = ''
            abstract = ''
            try:
                # Each result template stores title/url/abstract differently;
                # branch on the template class and fall back field by field.
                if "xpath-log" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        # No <h3>: first text line serves as the title.
                        title = div.text.strip().split("\n", 1)[0]
                        if div.a:
                            url = div.a['href'].strip()

                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        # Fallback: everything after the first text line.
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                elif "result-op" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        # abstract = div.text.strip()
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                else:
                    # Remaining templates are distinguished by the "tpl" attr.
                    if div.get("tpl", "") != "se_com_default":
                        if div.get("tpl", "") == "se_st_com_abstract":
                            if len(div.contents) >= 1:
                                title = div.h3.text.strip()
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                        else:
                            if len(div.contents) >= 2:
                                if div.h3:
                                    title = div.h3.text.strip()
                                    url = div.h3.a['href'].strip()
                                else:
                                    # NOTE(review): div.h3 is falsy in this
                                    # branch yet div.h3.a is dereferenced —
                                    # presumably relies on the outer except;
                                    # confirm.
                                    title = div.contents[0].text.strip()
                                    url = div.h3.a['href'].strip()
                                # abstract = div.contents[-1].text
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                    else:
                        # Default organic template.
                        if div.h3:
                            title = div.h3.text.strip()
                            url = div.h3.a['href'].strip()
                        else:
                            title = div.contents[0].text.strip()
                            url = div.h3.a['href'].strip()
                        if div.find("div", class_="c-abstract"):
                            abstract = div.find("div", class_="c-abstract").text.strip()
                        elif div.div:
                            abstract = div.div.text.strip()
                        else:
                            abstract = div.text.strip()
            except Exception as e:
                # One malformed result block must not abort the whole page.
                if debug:
                    print("catch exception duration parsing page html, e={}".format(e))
                continue

            # Clamp overly long abstracts (ABSTRACT_MAX_LENGTH = 300).
            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]

            rank_start+=1
            list_data.append({"title": title, "abstract": abstract, "url": url, "rank": rank_start})


        # Locate the pager links ("previous/next page" buttons).
        next_btn = root.find_all("a", class_="n")

        # Last page: the only pager link is "previous page" — return the data
        # without a next-page URL.
        if len(next_btn) <= 0 or u"上一页" in next_btn[-1].text:
            return list_data, None

        next_url = baidu_host_url + next_btn[-1]["href"]
        return list_data, next_url
    except Exception as e:
        if debug:
            print(u"catch exception duration parsing page html, e:{}".format(e))
        return None, None
|||
|
|||
|
|||
def run():
    """
    CLI entry point.

    Reads the keyword (and optionally the number of results and the debug
    flag) from command-line arguments, falling back to interactive input,
    then performs the search and prints each result.
    """
    default_keyword = u"长风破浪小武哥"
    num_results = 10
    debug = 0

    prompt = """
    baidusearch: not enough arguments
    [0]keyword: keyword what you want to search
    [1]num_results: number of results
    [2]debug: debug switch, 0-close, 1-open, default-0
    eg: baidusearch NBA
        baidusearch NBA 6
        baidusearch NBA 8 1
    """
    if len(sys.argv) > 3:
        keyword = sys.argv[1]
        try:
            num_results = int(sys.argv[2])
            debug = int(sys.argv[3])
        except ValueError:
            # Bug fix: the former bare ``except`` swallowed everything,
            # including KeyboardInterrupt/SystemExit. Non-numeric arguments
            # still fall back to the defaults above.
            pass
    elif len(sys.argv) > 1:
        keyword = sys.argv[1]
    else:
        # No arguments: show usage and ask interactively.
        print(prompt)
        keyword = input("please input keyword: ")
        # sys.exit(1)

    if not keyword:
        keyword = default_keyword

    print("---start search: [{}], expected number of results:[{}].".format(keyword, num_results))
    results = search(keyword, num_results=num_results, debug=debug)

    if isinstance(results, list):
        print("search results:(total[{}]items.)".format(len(results)))
        for res in results:
            print("{}. {}\n {}\n {}".format(res['rank'], res["title"], res["abstract"], res["url"]))
    else:
        print("start search: [{}] failed.".format(keyword))
@ -0,0 +1,64 @@ |
|||
# Ad-hoc experiment script: query a locally served Qwen model through
# qwen_agent, then load a text file with LangChain for later processing.
from qwen_agent.agents import Assistant
# from qwen_agent.agents.doc_qa import ParallelDocQA

# Local OpenAI-compatible endpoint serving the Qwen model.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
prompt='''
请找是描述项目建设的章节名称
'''
# NOTE(review): the 'file' entry is an empty string — presumably a document
# path should be supplied for the RAG lookup to work; confirm before running.
messages = [{'role': 'user', 'content': [{'text': prompt}, {'file': ''}]}]
# bot.run() streams incremental responses; print each snapshot.
for rsp in bot.run(messages):
    print(rsp)
# messages = [{'role': 'user', 'content': [{'text':prompt}]}]
# runList=[]
# for rsp in bot.run(messages):
#     print(rsp)
import re
# Disabled experiment kept for reference: read a .docx paragraph by paragraph
# and grep the joined text for a keyword.
# from docx import Document
#
# document = Document('747991ddb29a49da903210959076bb9f.docx')
# # read the docx content paragraph by paragraph
# levelList = []
# words = []
# addStart = False
# levelText = ""
# i = 0
# for paragraph in document.paragraphs:
#     # determine the heading level of this paragraph
#     # isTitle() is a placeholder; see the method described elsewhere
#     text = paragraph.text
#     if text.strip():  # skip empty paragraphs
#         # print("non-empty")
#         words.append(text)
#         # level = isTitle(paragraph)
#         # if(addStart and level=="0"):
#         #     addStart=False
#         # if(level=="0" and text.find("详细设计方案")>=0):
#         #     addStart=True
#         # if level:
#         #     levelList.append("{}:".format(level)+paragraph.text)
#         #     levelText=text
#         # else:
#         #     if addStart:
#         #         if(text.startswith("图") or text.startswith("注:")):
#         #             continue
#         #         i=i+1
#         #         words.append("第{}个段落:".format(i)+text)
#
# # join all paragraph texts into one newline-separated string
# print(len(words))
# text = '\n'.join(words)
# paragraphs = re.findall(r'.*?' + re.escape('宁波市') + r'.*?\n', text)
# print(paragraphs)
from langchain_community.document_loaders import TextLoader

loader = TextLoader('checkRepeatText.txt')
docs = loader.load()
@ -0,0 +1,205 @@ |
|||
# -*- coding:utf-8 -*- |
|||
import time |
|||
from docx import Document |
|||
from paddlenlp import Taskflow |
|||
from qwen_agent.agents import Assistant |
|||
import re |
|||
import json_repair |
|||
import math |
|||
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
|||
from docx.opc.oxml import parse_xml |
|||
|
|||
|
|||
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return a |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*, or an empty collection
    when *rels_item_xml* is |None|.

    Relationships whose target is the literal 'NULL' placeholder are
    skipped (presumably emitted by malformed documents — unverified).
    """
    srels = _SerializedRelationships()
    if rels_item_xml is None:
        return srels
    for rel_elm in parse_xml(rels_item_xml).Relationship_lst:
        if rel_elm.target_ref not in ('../NULL', 'NULL'):
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
|||
|
|||
|
|||
_SerializedRelationships.load_from_xml = load_from_xml_v2 |
|||
|
|||
|
|||
import logging |
|||
import logging.config |
|||
|
|||
log_config = { |
|||
'version': 1, |
|||
'disable_existing_loggers': False, |
|||
'formatters': { |
|||
'standard': { |
|||
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', |
|||
}, |
|||
}, |
|||
'handlers': { |
|||
'console': { |
|||
'class': 'logging.StreamHandler', |
|||
'formatter': 'standard', |
|||
'level': logging.INFO, |
|||
}, |
|||
'file': { |
|||
'class': 'logging.FileHandler', |
|||
'filename': 'Logger.log', |
|||
'formatter': 'standard', |
|||
'level': logging.INFO, |
|||
}, |
|||
}, |
|||
'loggers': { |
|||
'': { |
|||
'handlers': ['console', 'file'], |
|||
'level': logging.INFO, |
|||
'propagate': True, |
|||
}, |
|||
} |
|||
} |
|||
|
|||
logging.config.dictConfig(log_config) |
|||
|
|||
logger = logging.getLogger("checkCompanyName") |
|||
prompt = ''' |
|||
.根据上述文本判断,是否为具体的公司或组织名称,你可以使用工具利用互联网查询, |
|||
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校,行业类型,其他]选项中选择答案, |
|||
回答格式[{“companyName”:“名称”,"回答":"答案"},{“companyName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
''' |
|||
llm_cfg = { |
|||
#'model': 'qwen1.5-72b-chat', |
|||
'model':"qwen2-72b", |
|||
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
|||
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
|||
} |
|||
bot = Assistant(llm=llm_cfg, |
|||
name='Assistant', |
|||
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具" |
|||
) |
|||
|
|||
def getDocxToTextAll(name):
    """
    Extract every non-empty paragraph from the .docx file *name* and write
    the text, newline-separated, to ``checkCompanyName.txt`` (UTF-8).

    :param name: path of the .docx document to read.
    :raises: whatever python-docx raises for unreadable documents; callers
        catch this to report an unreadable file.
    """
    document = Document(name)
    # Removed the unused levelList/addStart/levelText/i locals left over from
    # a copy-pasted experiment; only the paragraph texts are needed.
    words = [paragraph.text for paragraph in document.paragraphs if paragraph.text.strip()]
    text = '\n'.join(words)

    # Persist the extracted text for the downstream analysis step.
    with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
|||
def companyNameTask(text):
    """
    Mine company/organization names from *text* with PaddleNLP knowledge
    mining.

    Generator: yields progress strings while parsing; the final yielded item
    is the de-duplicated list of organization names.

    :param text: full document text to scan.
    """
    yield "文档公司或组织名称检查---启动中...."
    wordtag = Taskflow("knowledge_mining", device_id=0)
    batchNum = 20
    # Split into sentences on Chinese full stop or newline, dropping blanks.
    sentences = re.split(r'[。\n]', text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # NOTE: despite the name, this counts sentences, not characters.
    total_chars = len(sentences)

    # Number of batches of batchNum sentences each.
    num_chunks = math.ceil(total_chars / batchNum)

    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档公司或组织名称检查---文档解析进度:{i + 1}/{num_chunks}"

        wenBen = ".".join(chunk)
        try:
            res = wordtag(wenBen)
        except Exception as e:
            # Bug fix: the exception was passed as a stray positional arg to
            # root logging.warning() on a message with no % placeholder, which
            # breaks lazy formatting; use the module logger with %s instead.
            logger.warning(chunk)
            logger.warning("文档公司或组织名称检查---词类分析出错: %s", e)
            continue
        isplace = False
        for zuhe in res[0]['items']:
            # Merge consecutive organization tokens into a single name: when
            # the previous token was an organization and this one is too,
            # append this token's text to the last collected entry.
            zhi = zuhe.get("wordtag_label")
            if isplace:
                name = placeList[len(placeList) - 1]
                if zhi.find("组织机构类") >= 0:  # or zuhe[1] == "ns"
                    isplace = True
                    new_text = zuhe['item'].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
            if zhi.find("组织机构类") >= 0:
                isplace = True
                new_text = zuhe['item'].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    yield "文档公司或组织名称检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList = list(dict.fromkeys(placeList))
    yield placeList
|||
def checkCompanyName(filename):
    """
    Generator pipeline: extract text from the .docx *filename*, mine
    company/organization names, ask the LLM to classify them, and yield
    progress strings followed by a final HTML-ish report string.
    """
    yield f"文档公司或组织名称检查---开始处理文档..."  # progress: start
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        logging.warning(e)
        yield "文档公司或组织名称检查---文档无法打开,请检查文档内容"
        return
    # getDocxToTextAll() just wrote the extracted text to this file.
    with open("checkCompanyName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档公司或组织名称检查---开始解析文档..."  # progress: parsing
    for item in companyNameTask(gettext):
        if isinstance(item, str):
            yield item  # forward progress strings
        else:
            final_list = item  # the generator's last item is the name list
    propnStr = ",".join(final_list)
    # Ask the LLM to classify all mined names in a single request.
    messages = [{'role': 'user', 'content': [{'text': propnStr+prompt}]}]
    runList = []
    yield f"文档公司或组织名称检查---结果生成中..."  # progress: generating
    cishu = 0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
        yield "文档公司或组织名称检查---结果生成中" + '.' * cishu  # animated dots
        cishu += 1
    # bot.run() streams growing snapshots; the last one holds the full answer.
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places=[]
    for place in parsed_data:
        try:
            # NOTE(review): the prompt's answer options do not include
            # '非泛化的公司或组织名称' — confirm this filter matches what the
            # model actually returns.
            if place['回答'] == '非泛化的公司或组织名称':
                error_places.append(place)
        except Exception as e:
            logging.warning(place)
            logging.warning("文档公司或组织名称检查---组织提出出错",e)
            continue
    logging.info(error_places)
    returnInfo = "发现异常公司或组织名称<br>"
    if len(error_places)>0:
        for t in error_places:
            keyword= t['companyName'].replace("\n","")
            # Find the first source paragraph containing the flagged name.
            # NOTE(review): paragraphs[0] raises IndexError when no paragraph
            # matches (e.g. name was merged across lines) — confirm.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            t["yuanwen"]=paragraphs[0]
            yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n","")
            returnInfo += "原文:" + yuanwen + "<br>异常公司或组织名称:**" + keyword + "**!请注意" + "<br>"
        logging.info(returnInfo)
        yield returnInfo
    else:
        yield "**未发现异常公司或组织名称**<br>"
@ -0,0 +1,220 @@ |
|||
# -*- coding:utf-8 -*- |
|||
# from pycorrector import MacBertCorrector |
|||
# m = MacBertCorrector("shibing624/macbert4csc-base-chinese") |
|||
from qwen_agent.agents import Assistant |
|||
from docx import Document |
|||
from pprint import pprint |
|||
import re |
|||
from paddlenlp import Taskflow |
|||
import json |
|||
import time |
|||
import json_repair |
|||
import math |
|||
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
|||
from docx.opc.oxml import parse_xml |
|||
|
|||
import asyncio |
|||
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.

    Patched replacement for python-docx's loader: relationships whose target
    is the '../NULL'/'NULL' placeholder are skipped instead of failing the
    document load (cause of such targets unverified from this file).
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken placeholder targets.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
|||
|
|||
|
|||
_SerializedRelationships.load_from_xml = load_from_xml_v2 |
|||
import logging |
|||
import logging.config |
|||
|
|||
log_config = { |
|||
'version': 1, |
|||
'disable_existing_loggers': False, |
|||
'formatters': { |
|||
'standard': { |
|||
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', |
|||
}, |
|||
}, |
|||
'handlers': { |
|||
'console': { |
|||
'class': 'logging.StreamHandler', |
|||
'formatter': 'standard', |
|||
'level': logging.INFO, |
|||
}, |
|||
'file': { |
|||
'class': 'logging.FileHandler', |
|||
'filename': 'Logger.log', |
|||
'formatter': 'standard', |
|||
'level': logging.INFO, |
|||
}, |
|||
}, |
|||
'loggers': { |
|||
'': { |
|||
'handlers': ['console', 'file'], |
|||
'level': logging.INFO, |
|||
'propagate': True, |
|||
}, |
|||
} |
|||
} |
|||
|
|||
logging.config.dictConfig(log_config) |
|||
|
|||
logger = logging.getLogger("checkDocumentError") |
|||
llm_cfg = { |
|||
# 'model': 'qwen1.5-72b-chat', |
|||
'model': "qwen2-72b", |
|||
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
|||
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
|||
} |
|||
bot = Assistant(llm=llm_cfg, |
|||
name='Assistant', |
|||
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' |
|||
|
|||
) |
|||
# prompt=''' |
|||
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案, |
|||
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
# ''' |
|||
prompt = ''' |
|||
请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出解析,没有错则不用给解析 |
|||
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","解析","解析内容"},{"placeName":"序号","回答":"答案","解析","解析内容"}],不做过多的解释,严格按回答格式作答; |
|||
''' |
|||
|
|||
|
|||
def getDocxToTextAll(name):
    """
    Extract every non-empty paragraph from the .docx file *name* and write
    the text, newline-separated, to ``checkDocumentError.txt`` (UTF-8).

    :param name: path of the .docx document to read.
    :raises: whatever python-docx raises for unreadable documents; callers
        catch this to report an unreadable file.
    """
    document = Document(name)
    # Removed the unused levelList/addStart/levelText/i locals left over from
    # a copy-pasted experiment; only the paragraph texts are needed.
    words = [paragraph.text for paragraph in document.paragraphs if paragraph.text.strip()]
    text = '\n'.join(words)

    # Persist the extracted text for the downstream typo-check step.
    with open("checkDocumentError.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
|||
|
|||
|
|||
def getDocumentError(filename):
    """
    Generator pipeline: extract text from the .docx *filename*, run the
    typo-check task, and yield progress strings followed by a final
    HTML-ish report string.
    """
    yield f"文档纠错---开始处理文档..."  # progress: start
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        logger.warning(e)
        yield "文档无法打开,请检查文档内容"
        return
    # getDocxToTextAll() just wrote the extracted text to this file.
    with open("checkDocumentError.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档纠错---开始解析文档..."  # progress streamed to the caller
    final_list = []
    for item in documentErrorTask(gettext):
        if isinstance(item, str):
            yield item  # forward progress strings
        else:
            final_list = item  # the generator's last item is the typo list
    resInfo = "发现错别字<br>"
    if (len(final_list) > 0):
        for i in final_list:
            yuanwen = i["placeName"].replace("\n", "")
            jianyi = i["jianyi"].replace("\n", "")
            resInfo += "原文:" + yuanwen + "<br>建议:**" + jianyi + "**<br>"
        yield resInfo
        logger.info(resInfo)
    else:
        yield "**未发现错别字**"
|||
|
|||
|
|||
def documentErrorTask(text):
    """
    Run PaddleNLP text correction over *text* in batches, then ask the LLM to
    confirm each suspected typo.

    Generator: yields progress strings while working; the final yielded item
    is the list of confirmed typo dicts (keys: ``placeName``, ``jianyi``,
    ``回答``, ``解析``).

    :param text: full document text to check.
    """
    yield "文档纠错---启动中...."
    corrector = Taskflow("text_correction", device_id=1)
    batchNum = 20
    # Split into sentences on Chinese full stop or newline.
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # NOTE(review): despite the name this is the sentence count, not characters.
    total_chars = len(sentences)

    # Number of batches.
    num_chunks = math.ceil(total_chars / batchNum)

    # Process batchNum sentences per batch.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    # Accumulates confirmed typos across all batches.
    err = []
    for i, chunk in enumerate(chunks):
        yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
        try:
            res = corrector(chunk)
        except Exception as e:
            logger.warning(chunk)
            # NOTE(review): the exception is passed as a stray positional arg
            # on a message without a % placeholder — should be "...%s", e.
            logger.warning("文档纠错--错别字识别出错\n", e)
            continue
        # Keep only sentences for which the corrector reported errors.
        lines_with_greeting = [place for place in res if len(place['errors']) > 0]
        if len(lines_with_greeting) > 0:
            num = 0
            wenti = []  # numbered question lines sent to the LLM
            keyword_list = []  # original sentences, indexed by question number
            for t in lines_with_greeting:
                temp_errorWords = []
                keyword = t['source']
                keyword_list.append(keyword)
                # Collect the characters the corrector flagged in this sentence.
                for item in t["errors"]:
                    for key, value in item['correction'].items():
                        temp_errorWords.append(key)
                wenti.append(
                    "{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(num, keyword, ",".join(temp_errorWords)))
                num += 1
            words = "\n".join(wenti)
            messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
            runList = []
            yield f"文档纠错---内容解析中..."  # progress streamed to the caller
            cishu = 0
            for rsp in bot.run(messages):
                runList.append(rsp)
                if cishu > 3:
                    cishu = 0
                yield "文档纠错---内容解析中" + '.' * cishu  # animated dots
                cishu += 1
            # bot.run() streams growing snapshots; last one is the full answer.
            data = runList[len(runList) - 1][0]["content"]
            parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
            resListerr = []
            for place in parsed_data:
                try:
                    if place['回答'] == '是':
                        # Map the question number back to the source sentence.
                        place["placeName"] = keyword_list[int(place["placeName"])]
                        place["jianyi"] = place["解析"]
                        resListerr.append(place)
                except Exception as e:
                    logger.warning(parsed_data)
                    logger.warning(place)
                    logger.warning("文档纠错--错别字提取出错\n", e)
                    continue
            if (len(resListerr) > 0):
                err.extend(resListerr)
    # NOTE(review): this message says 地名检查 (place-name check) — presumably
    # copy-pasted from checkPlaceName; confirm the intended wording.
    yield "文档地名检查---文档解析完成"
    yield err
@ -0,0 +1,212 @@ |
|||
from docx import Document |
|||
from paddlenlp import Taskflow |
|||
from pprint import pprint |
|||
from qwen_agent.agents import Assistant |
|||
import re |
|||
import json_repair |
|||
import time |
|||
import math |
|||
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
|||
from docx.opc.oxml import parse_xml |
|||
|
|||
|
|||
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.

    Patched replacement for python-docx's loader: relationships whose target
    is the '../NULL'/'NULL' placeholder are skipped instead of failing the
    document load (cause of such targets unverified from this file).
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip broken placeholder targets.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
|||
|
|||
|
|||
_SerializedRelationships.load_from_xml = load_from_xml_v2 |
|||
|
|||
|
|||
import logging |
|||
import logging.config |
|||
|
|||
log_config = { |
|||
'version': 1, |
|||
'disable_existing_loggers': False, |
|||
'formatters': { |
|||
'standard': { |
|||
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', |
|||
}, |
|||
}, |
|||
'handlers': { |
|||
'console': { |
|||
'class': 'logging.StreamHandler', |
|||
'formatter': 'standard', |
|||
'level': logging.INFO, |
|||
}, |
|||
'file': { |
|||
'class': 'logging.FileHandler', |
|||
'filename': 'Logger.log', |
|||
'formatter': 'standard', |
|||
'level': logging.INFO, |
|||
}, |
|||
}, |
|||
'loggers': { |
|||
'': { |
|||
'handlers': ['console', 'file'], |
|||
'level': logging.INFO, |
|||
'propagate': True, |
|||
}, |
|||
} |
|||
} |
|||
|
|||
logging.config.dictConfig(log_config) |
|||
|
|||
logger = logging.getLogger("checkPlaceName") |
|||
|
|||
prompt=''' |
|||
.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
不做过多的解释,严格按回答格式作答; |
|||
''' |
|||
# prompt=''' |
|||
# .请回答以上问题, |
|||
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
# 不做过多的解释,严格按回答格式作答; |
|||
# ''' |
|||
llm_cfg = { |
|||
#'model': 'qwen1.5-72b-chat', |
|||
'model':"qwen2-72b", |
|||
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
|||
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
|||
} |
|||
bot = Assistant(llm=llm_cfg, |
|||
name='Assistant', |
|||
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' |
|||
) |
|||
#获取全文内容 |
|||
def getDocxToTextAll(docxPath):
    """
    Extract every non-empty paragraph from the .docx file *docxPath* and
    write the text, newline-separated, to ``checkPlaceName.txt`` (UTF-8).

    :param docxPath: path of the .docx document to read.
    :raises: whatever python-docx raises for unreadable documents; callers
        catch this to report an unreadable file.
    """
    document = Document(docxPath)
    # Removed the unused levelList/addStart/levelText/i locals left over from
    # a copy-pasted experiment; only the paragraph texts are needed.
    words = [paragraph.text for paragraph in document.paragraphs if paragraph.text.strip()]
    text = '\n'.join(words)

    # Persist the extracted text for the downstream place-name step.
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
|||
|
|||
#得到全文和地名有关的内容 |
|||
def placeNameTask(text):
    """
    Mine place/organization names from *text* with the PaddleNLP NER task.

    Generator: yields progress strings while parsing; the final yielded item
    is the de-duplicated list of mined names.

    :param text: full document text to scan.
    """
    yield "文档地名检查---启动中...."
    tagTask = Taskflow("ner",device_id=2)
    batchNum=20
    # Split into sentences on Chinese full stop or newline.
    sentences = re.split(r'[。\n]', text)
    # Drop empty fragments.
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # NOTE(review): despite the name this is the sentence count, not characters.
    total_chars = len(sentences)

    # Number of batches.
    num_chunks = math.ceil(total_chars / batchNum)

    # Process batchNum sentences per batch.
    chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"

        wenBen=".".join(chunk)
        try:
            res = tagTask(wenBen)
        except Exception as e:
            logger.warning(chunk)
            # NOTE(review): the exception is passed as a stray positional arg
            # on a message without a % placeholder — should be "...%s", e.
            logger.warning("文档地名检查---解析地名出错",e)
            continue
        isplace = False
        for zuhe in res:
            # Merge consecutive place tokens: if the previous token was a
            # place name and this one is too, append this token's text to
            # the last collected entry instead of starting a new one.
            if isplace:
                name = placeList[len(placeList) - 1]
                if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:  # or zuhe[1] == "ns"
                    isplace = True
                    new_text = zuhe[0].replace("\n", "")
                    placeList[len(placeList) - 1] = name + new_text
                    continue
            if zuhe[1].find("组织机构类") >= 0 or zuhe[1].find("世界地区类") >= 0:
                isplace = True
                new_text = zuhe[0].replace("\n", "")
                placeList.append(new_text)
            else:
                isplace = False
    # All batches parsed.
    yield "文档地名检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList=list(dict.fromkeys(placeList))
    yield placeList
|||
#主方法 |
|||
def checkPlaceName(filename):
    """
    Generator pipeline: extract text from the .docx *filename*, mine place
    names, ask the LLM to validate them, and yield progress strings followed
    by a final HTML-ish report string.
    """
    yield f"文档地名检查---开始处理文档..."  # progress: start
    try:
        getDocxToTextAll(filename)
    except Exception as e:
        logger.warning(e)
        yield "文档地名检查---文档无法打开,请检查文档内容"
        return
    # getDocxToTextAll() just wrote the extracted text to this file.
    with open("checkPlaceName.txt", "r",encoding='utf-8') as f:
        gettext = f.read()
    yield f"文档地名检查---开始解析文档..."  # progress: parsing
    # propnList=placeNameTask(gettext)
    for item in placeNameTask(gettext):
        if isinstance(item, str):
            yield item  # forward progress strings
        else:
            final_list = item  # the generator's last item is the name list
    propnStr = ",".join(final_list)
    # Ask the LLM to validate all mined names in a single request.
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield f"文档地名检查---结果生成中..."  # progress: generating
    cishu=0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu>3:
            cishu=0
        yield "文档地名检查---结果生成中"+'.'*cishu  # animated dots
        cishu+=1
    # bot.run() streams growing snapshots; the last one holds the full answer.
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places=[]
    # Keep only the names the model judged to be wrong.
    for place in parsed_data:
        try:
            if place['回答'] == '错误':
                error_places.append(place)
        except Exception as e:
            logger.warning(place)
            logger.warning("文档地名检查---组织提出出错",e)
            continue
    logger.info(error_places)
    returnInfo = "发现异常地名<br>"
    if len(error_places)>0:
        for t in error_places:
            keyword= t['placeName'].replace("\n","")
            # Find the first source paragraph containing the flagged name.
            # NOTE(review): paragraphs[0] raises IndexError when nothing
            # matches — confirm.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            yuanwen= paragraphs[0].replace(keyword,f"**{keyword}**").replace("\n","")
            returnInfo+="原文:" + yuanwen + "<br>出现异常地名:**" + keyword + "**!请注意" + "<br>"
        yield returnInfo
        logger.info(returnInfo)
    else:
        yield "**未发现发现异常地名**"
@ -0,0 +1,292 @@ |
|||
import uuid |
|||
from langchain_chroma import Chroma |
|||
from langchain_community.embeddings import DashScopeEmbeddings |
|||
from langchain_community.document_loaders import TextLoader |
|||
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|||
from qwen_agent.agents import Assistant |
|||
import json_repair |
|||
from paddlenlp import Taskflow |
|||
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13") |
|||
device_id=0 |
|||
import re |
|||
import time |
|||
from docx import Document |
|||
import shutil |
|||
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
|||
from docx.opc.oxml import parse_xml |
|||
import logging |
|||
import logging.config |
|||
|
|||
# Logging configuration: the same timestamped format goes to the console and
# to the file 'Logger.log', both at INFO level.
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        # Root logger: everything at INFO and above goes to both handlers.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
}

logging.config.dictConfig(log_config)

# Module-level logger used throughout this file.
logger = logging.getLogger("checkRepeatText")
|||
|
|||
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip relationships whose target is a literal 'NULL' placeholder;
            # python-docx would otherwise fail to open such documents.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels


# Monkey-patch python-docx so documents containing 'NULL' relationship
# targets can still be opened.
_SerializedRelationships.load_from_xml = load_from_xml_v2
|||
# Extract the outline level from a paragraph/style XML fragment.
def getOutlineLevel(inputXml):
    """
    Extract the number from the '<w:outlineLvl w:val="number"/>' element
    contained in *inputXml*.

    Parameters:
        inputXml: XML string of a paragraph or style element.
    Returns:
        The outline level as a string of digits, e.g. "0".
    Raises:
        AttributeError: if no digit follows '<w:outlineLvl'.
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # FIX: raw string — "\d+" is an invalid escape sequence and warns on Python 3.12+.
    number = re.search(r"\d+", number).group()
    return number
|||
|
|||
|
|||
def isTitle(paragraph):
    """
    Determine whether the paragraph carries an outline (heading) level.

    Parameters:
        paragraph: a python-docx Paragraph.
    Returns:
        None for plain body text (no outline level); otherwise the level as a
        string: "0" = first-level heading, "1" = second-level, "2" = third-level.
    """
    # Empty lines are never headings.
    if paragraph.text.strip() == '':
        return None

    # If the outline level is set directly on the paragraph, read it from the XML.
    paragraphXml = paragraph._p.xml
    if paragraphXml.find('<w:outlineLvl') >= 0:
        return getOutlineLevel(paragraphXml)
    # Otherwise the level may come from the style: walk the style inheritance
    # chain (style -> base_style -> ...) looking for an outline level.
    targetStyle = paragraph.style
    while targetStyle is not None:
        # Found an outline level at this style; return it.
        if targetStyle.element.xml.find('<w:outlineLvl') >= 0:
            return getOutlineLevel(targetStyle.element.xml)
        else:
            targetStyle = targetStyle.base_style
    # Neither the paragraph nor any style defines an outline level.
    return None
|||
|
|||
# Find the heading name of the detailed-design-plan chapter.
def findTitleName(docxPath):
    """
    Generator: collect the document's level-0 headings and ask the LLM which
    chapter holds the detailed design plan ('详细设计方案' or similar).

    Yields a progress string first, then either the chosen heading name or a
    fixed "not found" message (callers compare against that exact string).
    """
    yield '文档相似性检查----检查是否存在详细设计方案'
    document = Document(docxPath)
    # Walk the paragraphs, collecting first-level heading texts.
    titleWords=[]
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        # Determine the heading level of this paragraph (see isTitle()).
        text = paragraph.text
        if text.strip():  # skip empty paragraphs
            level = isTitle(paragraph)
            if level=="0":
                firstTitle+=1
                secondTitle = 0
                # Headings containing '附件' (appendix) are excluded.
                if(text.find("附件")>=0):
                    continue
                titleWords.append("一级标题:".format(firstTitle)+text)
            elif level=="1":
                secondTitle+=1
                sanjiTitle=0
                # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
                # titleWords.append("第{}章的二级标题:".format(firstTitle,firstTitle,secondTitle)+text)
            elif level=="2":
                sanjiTitle += 1
                # words.append("\t"+"{}.{}".format(firstTitle,secondTitle)+text)
                # titleWords.append("第{}章的三级标题".format(firstTitle, secondTitle,firstTitle, secondTitle,sanjiTitle) + text)
    # Local LLM endpoint used only for this heading-selection query.
    findTitleName_llm_cfg = {
        #'model': 'qwen1.5-72b-chat',
        'model':"qwen2-72b",
        'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
        # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
                                  name='Assistant',
                                  # system_message='1:这样的是一级标题。1.1:这样的是二级标题。1.1.1:这样的是三级标题'
                                  )
    prompt='''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
    类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择
    类似设计方案,服务方案,建设方案为次相关,次级选择
    类似方案是最后选择
    按照这样的顺序选择最合适的
    你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答
    '''
    # print("\n".join(titleWords)+prompt)
    messages = [({'role': 'user', 'content': "\n".join(titleWords)+prompt})]
    runList=[]
    for rsp in findTitleName_bot.run(messages):
        runList.append(rsp)
    # The last streamed response holds the full answer.
    data = runList[len(runList) - 1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    logger.info(parsed_data)
    if(parsed_data["answer"]=="存在"):
        yield parsed_data["name"]
    else:
        yield "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"
|||
# Extract the whole body of the detailed-design-plan chapter and index it.
def getDocxToText(docxPath, titleName, vector_store_path):
    """
    Extract the body text of the chapter named *titleName* from the .docx
    file, write it to 'checkRepeatText.txt', and index the chunks into a
    Chroma vector store under *vector_store_path*.

    Parameters:
        docxPath: path to the .docx file.
        titleName: text of the level-0 heading whose chapter to extract.
        vector_store_path: persistence directory for the Chroma store.
    Returns:
        (words, uuids, vectorstore):
            words: list of "<level heading>:<text>" strings (paragraphs > 30 chars),
            uuids: ids of the chunks added to the store,
            vectorstore: the populated Chroma instance.
    Raises:
        Exception: when no qualifying paragraph was found.
    """
    document = Document(docxPath)
    words = []
    addStart = False   # True while we are inside the target chapter
    levelText = ""     # most recent heading, used to tag body paragraphs
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():  # skip empty paragraphs
            if titleName:
                level = isTitle(paragraph)
                # A new top-level heading ends the chapter we were collecting.
                if (addStart and level == "0"):
                    addStart = False
                # Start collecting at the top-level heading matching titleName.
                if (level == "0" and (titleName.find(text) >= 0 or text.find(titleName) >= 0)):
                    addStart = True
                if level:
                    levelText = f"{int(level)+1}级标题-" + text
                else:
                    if addStart:
                        # Skip figure captions ('图…') and notes ('注:…').
                        if (text.startswith("图") or text.startswith("注:")):
                            continue
                        if (len(text) > 30):
                            words.append("{}:".format(levelText) + text)
    if len(words) == 0:
        raise Exception("checkRepeatText,获取长度为0")
    text = '\n'.join(words)

    # BUG FIX: write and read the temp file with an explicit encoding so the
    # round-trip does not depend on the platform default (e.g. GBK on Windows);
    # consistent with the other utf-8 file accesses in this module.
    with open("checkRepeatText.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
    time.sleep(3)
    loader = TextLoader(file_path='checkRepeatText.txt', encoding='utf-8')
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10, add_start_index=True,
                                                   separators=["\n\n", "\n"])
    splits = text_splitter.split_documents(docs)
    uuids = [str(uuid.uuid4()) for _ in range(len(splits))]
    logging.info(f"checkRepeatTextuuidLen{len(uuids)}")

    vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
    vectorstore.add_documents(documents=splits, ids=uuids)
    # Poll until the newly added documents become searchable.
    while True:
        time.sleep(0.3)
        ress = vectorstore.similarity_search(words[0])
        if (len(ress) > 0):
            break
    return words, uuids, vectorstore
|||
|
|||
|
|||
# @app.route('/checkRepeatText/<filename>', methods=['GET'])
def checkRepeatText(filename):
    """
    Generator: check the detailed-design-plan chapter of a .docx file for
    internally repeated (highly similar) paragraphs.

    Yields progress strings, then a markdown report listing paragraph pairs
    whose text_similarity score exceeds 0.90, or "未发现相似内容".
    """
    yield "文档相似性检查---启动中...."
    # Unique store path per run so concurrent checks cannot collide.
    vector_store_path = "vector_store" + str(uuid.uuid4())
    for titleName in findTitleName(filename):
        yield titleName
        if (titleName != "文档相似性检查----未找到与详细设计方案相关内容,无法进行相似性比较"):
            try:
                yield "文档相似性检查----文档内容解析中"
                words, uuids, vectorstore = getDocxToText(filename, titleName, vector_store_path)
            except Exception as e:
                yield f"文档相似性检查----文档内容获取失败,未找到**{titleName}**相关内容或文档打开失败"
                return
            global device_id
            # NOTE(review): device_id is hard-coded to 3 and the global is not
            # actually used here — confirm which device this should run on.
            similarity = Taskflow("text_similarity", device_id=3)
            reslist = []
            count = 0
            for i in words:
                count += 1
                yield f"文档相似性检查--对{titleName}章节,进行文档内容检查中{count}/{len(words)}"
                result = vectorstore.similarity_search(i)
                textTag = i.split(":")[0]
                for content in result:
                    text = content.page_content
                    tag = text.split(":")[0].replace('\n', '')
                    # Skip hits that come from the same heading as the query.
                    if (textTag.find(tag) >= 0):
                        continue
                    try:
                        res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]])
                    except Exception as e:
                        # BUG FIX: use a %s placeholder (the old call dropped `e`)
                        # and skip this pair — `res` would otherwise be unbound below.
                        logger.warning("文档相似性检查--发生异常:%s", e)
                        logger.warning(i)
                        logger.warning(text)
                        continue
                    if (res[0]["similarity"] > 0.90):
                        # 判断重复内容是否被放入: only record the pair when this
                        # paragraph is not already part of a recorded pair.
                        isExist = False
                        for neirong in reslist:
                            if i in neirong.values():
                                isExist = True
                                break
                        if not isExist:
                            reslist.append({"yuanwen1": i.replace("\n", ""), "yuanwen2": text.replace("\n", ""), "similarity": res[0]["similarity"]})
            # vectorstore.delete(ids=uuids)
            shutil.rmtree(vector_store_path)
            logger.info("已删除")
            logger.info(reslist)
            resInfo = f"对{titleName}章节,发现相似内容:<br>"
            if (len(reslist) > 0):
                for res in reslist:
                    resInfo += "【在**" + res["yuanwen1"][:res["yuanwen1"].find(':')] + "**下包含:" + res["yuanwen1"][res["yuanwen1"].find(':') + 1:] + "<br>在**" + res["yuanwen2"][:res["yuanwen2"].find(':')] + "**下包含:" + res["yuanwen2"][res["yuanwen2"].find(':') + 1:] + "<br>以上两段内容***相似度***:" + '{:.2f}'.format(res['similarity']) + "】<br>"
                yield resInfo
                logger.info(resInfo)
            else:
                yield "未发现相似内容"
@ -0,0 +1,173 @@ |
|||
from docx import Document |
|||
from pprint import pprint |
|||
from qwen_agent.agents import Assistant |
|||
import re |
|||
import json_repair |
|||
import math |
|||
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
|||
from docx.opc.oxml import parse_xml |
|||
|
|||
|
|||
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip relationships whose target is a literal 'NULL' placeholder;
            # python-docx would otherwise fail to open such documents.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels


# Monkey-patch python-docx so documents containing 'NULL' relationship
# targets can still be opened.
_SerializedRelationships.load_from_xml = load_from_xml_v2
|||
import logging |
|||
import logging.config |
|||
|
|||
# Logging configuration: the same timestamped format goes to the console and
# to the file 'Logger.log', both at INFO level.
log_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'standard',
            'level': logging.INFO,
        },
        'file': {
            'class': 'logging.FileHandler',
            'filename': 'Logger.log',
            'formatter': 'standard',
            'level': logging.INFO,
        },
    },
    'loggers': {
        # Root logger: everything at INFO and above goes to both handlers.
        '': {
            'handlers': ['console', 'file'],
            'level': logging.INFO,
            'propagate': True,
        },
    }
}

logging.config.dictConfig(log_config)

# Module-level logger used throughout this file.
logger = logging.getLogger("checkCompanyName")
|||
# LLM configuration for the document-structure assistant.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope', # base_url, also known as api_base
    # SECURITY NOTE(review): hard-coded API key committed to source — move it
    # to an environment variable / secret store and rotate this key.
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                )
|||
|
|||
|
|||
# Extract the outline level from a paragraph/style XML fragment.
def getOutlineLevel(inputXml):
    """
    Extract the number from the '<w:outlineLvl w:val="number"/>' element
    contained in *inputXml*.

    Parameters:
        inputXml: XML string of a paragraph or style element.
    Returns:
        The outline level as a string of digits, e.g. "0".
    Raises:
        AttributeError: if no digit follows '<w:outlineLvl'.
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # FIX: raw string — "\d+" is an invalid escape sequence and warns on Python 3.12+.
    number = re.search(r"\d+", number).group()
    return number
|||
|
|||
|
|||
def isTitle(paragraph):
    """
    Determine whether the paragraph carries an outline (heading) level.

    Parameters:
        paragraph: a python-docx Paragraph.
    Returns:
        None for plain body text (no outline level); otherwise the level as a
        string: "0" = first-level heading, "1" = second-level, "2" = third-level.
    """
    # Empty lines are never headings.
    if paragraph.text.strip() == '':
        return None

    # If the outline level is set directly on the paragraph, read it from the XML.
    paragraphXml = paragraph._p.xml
    if paragraphXml.find('<w:outlineLvl') >= 0:
        return getOutlineLevel(paragraphXml)
    # Otherwise the level may come from the style: walk the style inheritance
    # chain (style -> base_style -> ...) looking for an outline level.
    targetStyle = paragraph.style
    while targetStyle is not None:
        # Found an outline level at this style; return it.
        if targetStyle.element.xml.find('<w:outlineLvl') >= 0:
            return getOutlineLevel(targetStyle.element.xml)
        else:
            targetStyle = targetStyle.base_style
    # Neither the paragraph nor any style defines an outline level.
    return None
|||
|
|||
# Collect the text of every top-level (level-0) heading in the document.
def getDocxToTitleName(docxPath):
    """
    Return the texts of all first-level headings in *docxPath*.

    Parameters:
        docxPath: path to a .docx file readable by python-docx.
    Returns:
        list[str] of level-0 heading texts, in document order.
    """
    document = Document(docxPath)
    words = []
    # FIX: dropped the unused locals (levelList, addStart, levelText, i) that
    # were copied from a sibling function and never read here.
    for paragraph in document.paragraphs:
        text = paragraph.text
        # Skip empty paragraphs; keep only paragraphs whose outline level is 0.
        if text.strip() and isTitle(paragraph) == "0":
            words.append(text)
    return words
|||
|
|||
def checkTitleName(filename):
    """
    Generator: compare the document's top-level headings against the template
    headings listed in 'ce模板.txt' and report template headings that the LLM
    cannot match to any document heading.

    Yields progress strings and finally either a markdown report of missing
    headings or "文档结构未发现异常".
    """
    yield '文档结构检查----启动中'
    with open("ce模板.txt", "r", encoding='utf-8') as f:
        gettext = f.readlines()
    count = 0
    reserr = []
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
        # FIX(consistency): report through the module logger instead of print(),
        # matching the logging style used elsewhere in this module.
        logger.warning(e)
        yield "文档无法打开,请检查文档内容"
        return
    # Ask the LLM, one template heading at a time, whether it matches a document heading.
    for text in gettext:
        count += 1
        prompt = f'''
        \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
        '''
        xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        strword = "\n".join(word) + prompt + xushang
        # print(strword)
        messages = [{'role': 'user', 'content': [{'text': strword}]}]
        runList = []
        cishu = 0
        for rsp in bot.run(messages):
            runList.append(rsp)
        # The last streamed response holds the full answer.
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        logger.info(parsed_data)
        if (parsed_data["answer"] == "不存在"):
            reserr.append(text)
    resInfo = "文档结构存在异常:<br>"
    if (len(reserr) > 0):
        for i in reserr:
            resInfo += "**" + i.replace('\n', '') + "**<br>"
        logger.info(resInfo)
        yield resInfo
    else:
        yield "文档结构未发现异常"
@ -0,0 +1,176 @@ |
|||
from docx import Document |
|||
from pprint import pprint |
|||
from qwen_agent.agents import Assistant |
|||
import re |
|||
import json_repair |
|||
import math |
|||
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
|||
from docx.opc.oxml import parse_xml |
|||
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            # Skip relationships whose target is a literal 'NULL' placeholder;
            # python-docx would otherwise fail to open such documents.
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels


# Monkey-patch python-docx so documents containing 'NULL' relationship
# targets can still be opened.
_SerializedRelationships.load_from_xml = load_from_xml_v2
|||
# LLM configuration for the document-structure assistant.
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope', # base_url, also known as api_base
    # SECURITY NOTE(review): hard-coded API key committed to source — move it
    # to an environment variable / secret store and rotate this key.
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                )
|||
|
|||
|
|||
# Extract the outline level from a paragraph/style XML fragment.
def getOutlineLevel(inputXml):
    """
    Extract the number from the '<w:outlineLvl w:val="number"/>' element
    contained in *inputXml*.

    Parameters:
        inputXml: XML string of a paragraph or style element.
    Returns:
        The outline level as a string of digits, e.g. "0".
    Raises:
        AttributeError: if no digit follows '<w:outlineLvl'.
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    number = inputXml[start_index:end_index + 1]
    # FIX: raw string — "\d+" is an invalid escape sequence and warns on Python 3.12+.
    number = re.search(r"\d+", number).group()
    return number
|||
|
|||
|
|||
def isTitle(paragraph):
    """
    Determine whether the paragraph carries an outline (heading) level.

    Parameters:
        paragraph: a python-docx Paragraph.
    Returns:
        None for plain body text (no outline level); otherwise the level as a
        string: "0" = first-level heading, "1" = second-level, "2" = third-level.
    """
    # Empty lines are never headings.
    if paragraph.text.strip() == '':
        return None

    # If the outline level is set directly on the paragraph, read it from the XML.
    paragraphXml = paragraph._p.xml
    if paragraphXml.find('<w:outlineLvl') >= 0:
        return getOutlineLevel(paragraphXml)
    # Otherwise the level may come from the style: walk the style inheritance
    # chain (style -> base_style -> ...) looking for an outline level.
    targetStyle = paragraph.style
    while targetStyle is not None:
        # Found an outline level at this style; return it.
        if targetStyle.element.xml.find('<w:outlineLvl') >= 0:
            return getOutlineLevel(targetStyle.element.xml)
        else:
            targetStyle = targetStyle.base_style
    # Neither the paragraph nor any style defines an outline level.
    return None
|||
|
|||
# Collect the text of every top-level (level-0) heading in the document.
def getDocxToTitleName(docxPath):
    """
    Return the texts of all first-level headings in *docxPath*.

    Parameters:
        docxPath: path to a .docx file readable by python-docx.
    Returns:
        list[str] of level-0 heading texts, in document order.
    """
    document = Document(docxPath)
    words = []
    # FIX: dropped the unused locals (levelList, addStart, levelText, i) that
    # were copied from a sibling function and never read here.
    for paragraph in document.paragraphs:
        text = paragraph.text
        # Skip empty paragraphs; keep only paragraphs whose outline level is 0.
        if text.strip() and isTitle(paragraph) == "0":
            words.append(text)
    return words
|||
|
|||
def checkTitleName(filename):
    """
    Generator: compare the document's top-level headings against the template
    headings listed in 'ce模板.txt' and report template headings with no match.

    BUG FIX: the previous body referenced names that were never defined
    (text, count, gettext, word) and raised NameError on first iteration;
    the working implementation — left commented out below it — is restored.
    """
    yield '文档结构检查----启动中'
    with open("ce模板.txt", "r", encoding='utf-8') as f:
        gettext = f.readlines()
    count = 0
    reserr = []
    try:
        word = getDocxToTitleName(filename)
    except Exception as e:
        print(e)
        yield "文档无法打开,请检查文档内容"
        return
    # Ask the LLM, one template heading at a time, whether it matches a document heading.
    for text in gettext:
        count += 1
        prompt = f'''
        \n 这些是文章的标题,请问【{text}】在标题中是否可以配对的,若有请指出是哪个标题,若没有请回到不存在
        '''
        xushang = "回答格式{‘name’:‘名称’,'answer':‘回答’,“标题”:“标题”}请严格按照格式回答问题,不要做过多我解释"
        yield f"文档结构检查----结构分析中{count}/{len(gettext)}"
        strword = "\n".join(word) + prompt + xushang
        messages = [{'role': 'user', 'content': [{'text': strword}]}]
        runList = []
        for rsp in bot.run(messages):
            runList.append(rsp)
        # The last streamed response holds the full answer.
        data = runList[len(runList) - 1][0]["content"]
        parsed_data = json_repair.loads(data.replace('`', ''))
        print(parsed_data)
        if (parsed_data["answer"] == "不存在"):
            reserr.append(text)
    resInfo = "文档结构存在异常:<br>"
    if (len(reserr) > 0):
        for i in reserr:
            resInfo += f"**{i}**<br>"
        yield resInfo
    else:
        yield "文档结构未发现异常"
|||
|
|||
|
|||
import logging

# Script-level logger for this demo block.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.DEBUG)

# Console handler at DEBUG level.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Timestamped formatter attached to the console handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)

# Register the handler on the logger.
logger.addHandler(ch)
try:
    # Emit one message per level to exercise the configuration.
    logger.debug('这是一个调试消息')
    logger.info('这是一个信息消息')
    logger.warning('这是一个警告消息')
    logger.error('这是一个错误消息')
    logger.critical('这是一个致命错误消息')
except Exception as e:
    logger.warning(e)
@ -0,0 +1,712 @@ |
|||
""" |
|||
This module will parse the JSON file following the BNF definition: |
|||
|
|||
<json> ::= <container> |
|||
|
|||
<primitive> ::= <number> | <string> | <boolean> |
|||
; Where: |
|||
; <number> is a valid real number expressed in one of a number of given formats |
|||
; <string> is a string of valid characters enclosed in quotes |
|||
; <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted) |
|||
|
|||
<container> ::= <object> | <array> |
|||
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas |
|||
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members' |
|||
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value |
|||
|
|||
If something is wrong (a missing parenthesis or quote, for example) it will use a few simple heuristics to fix the JSON string:
|||
- Add the missing parentheses if the parser believes that the array or object should be closed |
|||
- Quote strings or add missing single quotes |
|||
- Adjust whitespaces and remove line breaks |
|||
|
|||
All supported use cases are in the unit tests |
|||
""" |
|||
|
|||
import os |
|||
import json |
|||
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal |
|||
|
|||
|
|||
class StringFileWrapper:
    """Read-only, string-like adapter over a text file descriptor.

    Exposes indexing, slicing and len() so the parser can treat an open file
    exactly like an in-memory string.
    """

    def __init__(self, fd: TextIO) -> None:
        self.fd = fd
        # Cached file length; 0 means "not computed yet" (see __len__).
        self.length: int = 0

    def __getitem__(self, index: Union[int, slice]) -> str:
        """Return one character (int index) or a substring (slice)."""
        if isinstance(index, slice):
            start = index.start
            self.fd.seek(start)
            text = self.fd.read(index.stop - start)
            # Restore the position to the slice start, as callers expect.
            self.fd.seek(start)
            return text
        self.fd.seek(index)
        return self.fd.read(1)

    def __len__(self) -> int:
        """Total number of characters in the file (computed once, then cached)."""
        if self.length < 1:
            saved_pos = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(saved_pos)
        return self.length
|||
|
|||
|
|||
class LoggerConfig:
    """Plain holder for the parser's logging state."""

    def __init__(self, log_level: Optional[str]):
        self.log: List[Dict[str, str]] = []  # accumulated log entries
        self.window: int = 10                # context window size used when logging
        # Fall back to "none" (logging disabled) when no level was given.
        self.log_level: str = log_level or "none"
|||
|
|||
|
|||
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None] |
|||
|
|||
|
|||
class JSONParser: |
|||
    def __init__(
        self,
        json_str: Union[str, StringFileWrapper],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
    ) -> None:
        """Initialize parser state over either a string or an open file descriptor."""
        # The string to parse
        self.json_str = json_str
        # Alternatively, the file description with a json file in it
        if json_fd:
            # This is a trick we do to treat the file wrapper as an array
            self.json_str = StringFileWrapper(json_fd)
        # Index is our iterator that will keep track of which character we are looking at right now
        self.index: int = 0
        # This is used in the object member parsing to manage the special cases of missing quotes in key or value
        self.context: list[str] = []
        # Use this to log the activity, but only if logging is active
        self.logger = LoggerConfig(log_level="info" if logging else None)
|||
|
|||
    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """
        Parse the whole input and return the repaired JSON value.

        If extra top-level elements follow the first one, they are collected
        into a list. When logging is enabled (log_level != "none"), returns
        (value, log) instead of just the value.
        """
        json = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            json = [json]
            last_index = self.index
            while self.index < len(self.json_str):
                j = self.parse_json()
                if j != "":
                    json.append(j)
                if self.index == last_index:
                    # No progress was made; skip one character to avoid looping forever.
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array
            if len(json) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                json = json[0]
        if self.logger.log_level == "none":
            return json
        else:
            return json, self.logger.log
|||
|
|||
    def parse_json(
        self,
    ) -> JSONReturnType:
        """Dispatch on the next character and parse a single JSON value."""
        while True:
            char = self.get_char_at()
            # This parser will ignore any basic element (string or number) that is not inside an array or object
            is_in_context = len(self.context) > 0
            # False means that we are at the end of the string provided
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # there can be an edge case in which a key is empty and at the end of an object
            # like "key": }. We return an empty string here to close the object properly
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote
            elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
                return self.parse_string()
            # <number> starts with [0-9] or minus
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # If everything else fails, we just ignore and move on
            else:
                self.index += 1
|||
|
|||
    def parse_object(self) -> Dict[str, Any]:
        """Parse one JSON object; the opening '{' has already been consumed."""
        # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
        obj = {}
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "}") != "}":
            # This is what we expect to find:
            # <member> ::= <string> ': ' <json>

            # Skip filler whitespaces
            self.skip_whitespaces_at()

            # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                    "info",
                )
                self.index += 1

            # We are now searching for they string key
            # Context is used in the string parser to manage the lack of quotes
            self.set_context("object_key")

            self.skip_whitespaces_at()

            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())

                if key != "" or (key == "" and self.get_char_at() == ":"):
                    # If the string is empty but there is a object divider, we are done here
                    break

            self.skip_whitespaces_at()

            # We reached the end here
            if (self.get_char_at() or "}") == "}":
                continue

            self.skip_whitespaces_at()

            # An extreme case of missing ":" after a key
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                    "info",
                )

            self.index += 1
            self.reset_context()
            self.set_context("object_value")
            # The value can be any valid json
            value = self.parse_json()

            # Reset context since our job is done
            self.reset_context()
            obj[key] = value

            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1

            # Remove trailing spaces
            self.skip_whitespaces_at()

        # Consume the closing '}'.
        self.index += 1
        return obj
|||
|
|||
    def parse_array(self) -> List[Any]:
        """Parse one JSON array; the opening '[' has already been consumed."""
        # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
        arr = []
        self.set_context("array")
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()

            # It is possible that parse_json() returns nothing valid, so we stop
            if value == "":
                break

            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it", "info"
                )
            else:
                arr.append(value)

            # skip over whitespace after a value but before closing ]
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()

        # Especially at the end of an LLM generated json you might miss the last "]"
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back", "info"
            )
            self.index -= 1

        self.index += 1
        self.reset_context()
        return arr
|||
|
|||
def parse_string(self) -> Union[str, bool, None]: |
|||
# <string> is a string of valid characters enclosed in quotes |
|||
# i.e. { name: "John" } |
|||
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here |
|||
|
|||
# Flag to manage corner cases related to missing starting quote |
|||
missing_quotes = False |
|||
doubled_quotes = False |
|||
lstring_delimiter = rstring_delimiter = '"' |
|||
|
|||
char = self.get_char_at() |
|||
# A valid string can only start with a valid quote or, in our case, with a literal |
|||
while char and char not in ['"', "'", "“"] and not char.isalnum(): |
|||
self.index += 1 |
|||
char = self.get_char_at() |
|||
|
|||
if not char: |
|||
# This is an empty string |
|||
return "" |
|||
|
|||
# Ensuring we use the right delimiter |
|||
if char == "'": |
|||
lstring_delimiter = rstring_delimiter = "'" |
|||
elif char == "“": |
|||
lstring_delimiter = "“" |
|||
rstring_delimiter = "”" |
|||
elif char.isalnum(): |
|||
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid |
|||
# But remember, object keys are only of type string |
|||
if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key": |
|||
value = self.parse_boolean_or_null() |
|||
if value != "": |
|||
return value |
|||
self.log( |
|||
"While parsing a string, we found a literal instead of a quote", |
|||
"info", |
|||
) |
|||
self.log( |
|||
"While parsing a string, we found no starting quote. Will add the quote back", |
|||
"info", |
|||
) |
|||
missing_quotes = True |
|||
|
|||
if not missing_quotes: |
|||
self.index += 1 |
|||
|
|||
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop |
|||
if self.get_char_at() == lstring_delimiter: |
|||
# If it's an empty key, this was easy |
|||
if self.get_context() == "object_key" and self.get_char_at(1) == ":": |
|||
self.index += 1 |
|||
return "" |
|||
# Find the next delimiter |
|||
i = 1 |
|||
next_c = self.get_char_at(i) |
|||
while next_c and next_c != rstring_delimiter: |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
# Now check that the next character is also a delimiter to ensure that we have ""....."" |
|||
# In that case we ignore this rstring delimiter |
|||
if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter: |
|||
self.log( |
|||
"While parsing a string, we found a valid starting doubled quote, ignoring it", |
|||
"info", |
|||
) |
|||
doubled_quotes = True |
|||
self.index += 1 |
|||
else: |
|||
# Ok this is not a doubled quote, check if this is an empty string or not |
|||
i = 1 |
|||
next_c = self.get_char_at(i) |
|||
while next_c and next_c.isspace(): |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
if next_c not in [",", "]", "}"]: |
|||
self.log( |
|||
"While parsing a string, we found a doubled quote but it was a mistake, removing one quote", |
|||
"info", |
|||
) |
|||
self.index += 1 |
|||
|
|||
# Initialize our return value |
|||
string_acc = "" |
|||
|
|||
# Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object |
|||
# In that case we need to use the ":|,|}" characters as terminators of the string |
|||
# So this will stop if: |
|||
# * It finds a closing quote |
|||
# * It iterated over the entire sequence |
|||
# * If we are fixing missing quotes in an object, when it finds the special terminators |
|||
char = self.get_char_at() |
|||
while char and char != rstring_delimiter: |
|||
if missing_quotes: |
|||
if self.get_context() == "object_key" and ( |
|||
char == ":" or char.isspace() |
|||
): |
|||
self.log( |
|||
"While parsing a string missing the left delimiter in object key context, we found a :, stopping here", |
|||
"info", |
|||
) |
|||
break |
|||
elif self.get_context() == "object_value" and char in [",", "}"]: |
|||
rstring_delimiter_missing = True |
|||
# check if this is a case in which the closing comma is NOT missing instead |
|||
i = 1 |
|||
next_c = self.get_char_at(i) |
|||
while next_c and next_c != rstring_delimiter: |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
if next_c: |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
# found a delimiter, now we need to check that is followed strictly by a comma or brace |
|||
while next_c and next_c.isspace(): |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
if next_c and next_c in [",", "}"]: |
|||
rstring_delimiter_missing = False |
|||
if rstring_delimiter_missing: |
|||
self.log( |
|||
"While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here", |
|||
"info", |
|||
) |
|||
break |
|||
string_acc += char |
|||
self.index += 1 |
|||
char = self.get_char_at() |
|||
if char and len(string_acc) > 0 and string_acc[-1] == "\\": |
|||
# This is a special case, if people use real strings this might happen |
|||
self.log("Found a stray escape sequence, normalizing it", "info") |
|||
string_acc = string_acc[:-1] |
|||
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]: |
|||
escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"} |
|||
string_acc += escape_seqs.get(char, char) or char |
|||
self.index += 1 |
|||
char = self.get_char_at() |
|||
# ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here |
|||
if char == rstring_delimiter: |
|||
# Special case here, in case of double quotes one after another |
|||
if doubled_quotes and self.get_char_at(1) == rstring_delimiter: |
|||
self.log( |
|||
"While parsing a string, we found a doubled quote, ignoring it", |
|||
"info", |
|||
) |
|||
self.index += 1 |
|||
elif missing_quotes and self.get_context() == "object_value": |
|||
# In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key |
|||
i = 1 |
|||
next_c = self.get_char_at(i) |
|||
while next_c and next_c not in [ |
|||
rstring_delimiter, |
|||
lstring_delimiter, |
|||
]: |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
if next_c: |
|||
# We found a quote, now let's make sure there's a ":" following |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
# found a delimiter, now we need to check that is followed strictly by a comma or brace |
|||
while next_c and next_c.isspace(): |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
if next_c and next_c == ":": |
|||
# Reset the cursor |
|||
self.index -= 1 |
|||
char = self.get_char_at() |
|||
self.log( |
|||
"In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.", |
|||
"info", |
|||
) |
|||
break |
|||
else: |
|||
# Check if eventually there is a rstring delimiter, otherwise we bail |
|||
i = 1 |
|||
next_c = self.get_char_at(i) |
|||
check_comma_in_object_value = True |
|||
while next_c and next_c not in [ |
|||
rstring_delimiter, |
|||
lstring_delimiter, |
|||
]: |
|||
# This is a bit of a weird workaround, essentially in object_value context we don't always break on commas |
|||
# This is because the routine after will make sure to correct any bad guess and this solves a corner case |
|||
if check_comma_in_object_value and next_c.isalpha(): |
|||
check_comma_in_object_value = False |
|||
# If we are in an object context, let's check for the right delimiters |
|||
if ( |
|||
("object_key" in self.context and next_c in [":", "}"]) |
|||
or ("object_value" in self.context and next_c == "}") |
|||
or ("array" in self.context and next_c in ["]", ","]) |
|||
or ( |
|||
check_comma_in_object_value |
|||
and self.get_context() == "object_value" |
|||
and next_c == "," |
|||
) |
|||
): |
|||
break |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
# If we stopped for a comma in object_value context, let's check if find a "} at the end of the string |
|||
if next_c == "," and self.get_context() == "object_value": |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
while next_c and next_c != rstring_delimiter: |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a } |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
while next_c and next_c.isspace(): |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
if next_c == "}": |
|||
# OK this is valid then |
|||
self.log( |
|||
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it", |
|||
"info", |
|||
) |
|||
string_acc += str(char) |
|||
self.index += 1 |
|||
char = self.get_char_at() |
|||
elif next_c == rstring_delimiter: |
|||
if self.get_context() == "object_value": |
|||
# But this might not be it! This could be just a missing comma |
|||
# We found a delimiter and we need to check if this is a key |
|||
# so find a rstring_delimiter and a colon after |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
while next_c and next_c != rstring_delimiter: |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
while next_c and next_c != ":": |
|||
if next_c in [ |
|||
lstring_delimiter, |
|||
rstring_delimiter, |
|||
",", |
|||
]: |
|||
break |
|||
i += 1 |
|||
next_c = self.get_char_at(i) |
|||
# Only if we fail to find a ':' then we know this is misplaced quote |
|||
if next_c != ":": |
|||
self.log( |
|||
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it", |
|||
"info", |
|||
) |
|||
string_acc += str(char) |
|||
self.index += 1 |
|||
char = self.get_char_at() |
|||
|
|||
if ( |
|||
char |
|||
and missing_quotes |
|||
and self.get_context() == "object_key" |
|||
and char.isspace() |
|||
): |
|||
self.log( |
|||
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value", |
|||
"info", |
|||
) |
|||
self.skip_whitespaces_at() |
|||
if self.get_char_at() not in [":", ","]: |
|||
return "" |
|||
|
|||
# A fallout of the previous special case in the while loop, |
|||
# we need to update the index only if we had a closing quote |
|||
if char != rstring_delimiter: |
|||
self.log( |
|||
"While parsing a string, we missed the closing quote, ignoring", |
|||
"info", |
|||
) |
|||
else: |
|||
self.index += 1 |
|||
|
|||
return string_acc.rstrip() |
|||
|
|||
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
    """Consume a run of number-like characters and coerce it.

    Returns an int or float when possible, the raw string for
    currency-style values containing commas, and falls back to
    re-parsing when only a stray "-" was found.
    """
    # Characters that may legally appear in the numbers (and near-numbers) we accept.
    allowed = set("0123456789-.eE/,")
    in_array = self.get_context() == "array"
    pieces = []
    ch = self.get_char_at()
    while ch and ch in allowed:
        # Inside an array a comma terminates the element, it is not part of the number.
        if ch == "," and in_array:
            break
        pieces.append(ch)
        self.index += 1
        ch = self.get_char_at()
    raw = "".join(pieces)
    if len(raw) > 1 and raw[-1] in "-eE/,":
        # Trailing character cannot end a number/currency value; roll back one position.
        raw = raw[:-1]
        self.index -= 1
    try:
        if "," in raw:
            # Comma-grouped values (e.g. "1,000") are kept as strings.
            return str(raw)
        if "." in raw or "e" in raw or "E" in raw:
            return float(raw)
        if raw == "-":
            # A stray "-" is not a number; discard it and continue parsing.
            return self.parse_json()
        return int(raw)
    except ValueError:
        return raw
|||
|
|||
def parse_boolean_or_null(self) -> Union[bool, str, None]:
    """Parse one of the unquoted literals 'true', 'false' or 'null'.

    Returns the corresponding Python value, or "" (with the index
    restored) when no literal matches at the current position.
    """
    starting_index = self.index
    char = (self.get_char_at() or "").lower()
    # Fix: initialise to None — previously `value` was only annotated, so an
    # unexpected first character raised UnboundLocalError at `if value:` below.
    value: Optional[Tuple[str, Optional[bool]]] = None
    if char == "t":
        value = ("true", True)
    elif char == "f":
        value = ("false", False)
    elif char == "n":
        value = ("null", None)

    if value:
        i = 0
        # Advance while the input keeps matching the expected literal.
        while char and i < len(value[0]) and char == value[0][i]:
            i += 1
            self.index += 1
            char = (self.get_char_at() or "").lower()
        if i == len(value[0]):
            return value[1]

    # Nothing matched (or only a prefix did): reset the index before returning.
    self.index = starting_index
    return ""
|||
|
|||
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
    """Return the character `count` positions ahead of the cursor, or False past the end.

    EAFP on purpose: try/except is cheaper than a bounds check on the
    common in-range path.
    """
    position = self.index + count
    try:
        return self.json_str[position]
    except IndexError:
        return False
|||
|
|||
def skip_whitespaces_at(self) -> None:
    """Advance self.index past any run of whitespace, stopping at end of input."""
    text = self.json_str
    limit = len(text)
    while self.index < limit and text[self.index].isspace():
        self.index += 1
|||
|
|||
def set_context(self, value: str) -> None:
    """Push a non-empty context label onto the context stack."""
    if not value:
        return
    self.context.append(value)
|||
|
|||
def reset_context(self) -> None:
    """Discard the most recent context label."""
    del self.context[-1]
|||
|
|||
def get_context(self) -> str:
    """Return the current (innermost) context label."""
    stack = self.context
    return stack[-1]
|||
|
|||
def log(self, text: str, level: str) -> None:
    """Record a repair message plus a window of the surrounding JSON text.

    Messages are only recorded when `level` matches the logger's
    configured log level.
    """
    if level != self.logger.log_level:
        return
    window = self.logger.window
    lo = max(self.index - window, 0)
    hi = min(self.index + window, len(self.json_str))
    self.logger.log.append(
        {
            "text": text,
            "context": self.json_str[lo:hi],
        }
    )
|||
|
|||
|
|||
def repair_json(
    json_str: str = "",
    return_objects: bool = False,
    skip_json_loads: bool = False,
    logging: bool = False,
    json_fd: Optional[TextIO] = None,
    ensure_ascii: bool = True,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Decode a JSON string, repairing it when standard decoding fails.

    By default the (re-serialized) JSON string is returned.
    With `return_objects=True` the decoded data structure is returned instead.
    With `skip_json_loads=True` the built-in json.loads() is bypassed entirely.
    With `logging=True` a tuple of (repaired json, list of repair actions) is returned.
    """
    parser = JSONParser(json_str, json_fd, logging)
    if skip_json_loads:
        parsed_json = parser.parse()
    else:
        # Fast path: try the standard decoder first, repair only on failure.
        try:
            parsed_json = json.load(json_fd) if json_fd else json.loads(json_str)
        except json.JSONDecodeError:
            parsed_json = parser.parse()
    # Returning the decoded object lets this library act as a drop-in
    # replacement for the standard json module.
    if return_objects or logging:
        return parsed_json
    return json.dumps(parsed_json, ensure_ascii=ensure_ascii)
|||
|
|||
|
|||
def loads(
    json_str: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Drop-in `json.loads()` replacement that repairs broken JSON.

    Thin wrapper over `repair_json()` with `return_objects=True`.
    """
    return repair_json(
        json_str=json_str,
        return_objects=True,
        skip_json_loads=skip_json_loads,
        logging=logging,
    )
|||
|
|||
|
|||
def load(
    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Drop-in `json.load()` replacement that repairs broken JSON.

    Thin wrapper over `repair_json()` with `json_fd=fd` and `return_objects=True`.
    """
    return repair_json(
        json_fd=fd,
        return_objects=True,
        skip_json_loads=skip_json_loads,
        logging=logging,
    )
|||
|
|||
|
|||
def from_file(
    filename: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Wrapper around `load()` that accepts a filename instead of a file object.

    Fix: use a context manager so the file is closed even when parsing raises;
    the previous open()/close() pair leaked the descriptor on exception.
    """
    with open(filename) as fd:
        return load(fd, skip_json_loads, logging)
@ -0,0 +1,161 @@ |
|||
from flask import Flask, request, jsonify,Response |
|||
import os |
|||
from checkPlaceName import checkPlaceName |
|||
from checkRepeatText import checkRepeatText |
|||
from checkCompanyName import checkCompanyName |
|||
from checkDocumentError import getDocumentError |
|||
from checkTitleName import checkTitleName |
|||
from flask_cors import CORS |
|||
import qwen_agenttext |
|||
# Flask application setup: CORS is enabled for all routes, and the upload
# directory is created at import time so handlers can assume it exists.
app = Flask(__name__)
cros = CORS(app)
UPLOAD_FOLDER = 'uploads'
# NOTE(review): usableTag is never referenced in this file — presumably a
# feature-availability flag array consumed elsewhere; confirm before removing.
usableTag=[0,0,0,0,0,0,0,0]
if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
|||
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a multipart file upload and store it in UPLOAD_FOLDER.

    Returns 400 when the request carries no file part or an empty filename,
    200 on success.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if file:
        # Security fix: strip any directory components from the client-supplied
        # name so a crafted filename (e.g. "../../etc/passwd") cannot escape
        # UPLOAD_FOLDER via path traversal.
        filename = os.path.basename(file.filename)
        file.save(os.path.join(UPLOAD_FOLDER, filename))
        return jsonify({"message": "File uploaded successfully"}), 200
|||
@app.route('/stream', methods=["GET", "POST"])
def stream_numbers():
    """SSE endpoint: stream the agent's progress messages for `context`."""
    context = request.args.get('context')
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(qwen_agenttext.getxinx(context), headers=headers)
|||
@app.route('/sse/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
    """SSE endpoint streaming duplicate-text check progress for a file."""
    filename = request.args.get('filename')

    def generate_checkRepeatText(filename):
        # Fix: the event counter was initialised but never incremented, so
        # every SSE message carried id 1; increment once per event instead.
        event_id = 0
        try:
            for i in checkRepeatText(filename):
                event_id += 1
                yield f"id: {event_id}\n"
                yield f"event: checkRepeatText\n"
                yield f"data: {i}\n\n"
        except Exception:
            # Best-effort: report the failure to the client instead of
            # silently dropping the stream.
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkRepeatText\n"
            yield f"data: **程序出现异常**\n\n"
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkRepeatText(filename), headers=headers)
|||
|
|||
|
|||
@app.route('/sse/checkPlaceName', methods=['GET'])
def checkPlaceNameWebSse():
    """SSE endpoint streaming place-name check progress for a file."""
    filename = request.args.get('filename')

    def generate_checkPlaceName(filename):
        # Fix: the event counter was initialised but never incremented, so
        # every SSE message carried id 1; increment once per event instead.
        event_id = 0
        for i in checkPlaceName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkPlaceName\n"
            yield f"data: {i}\n\n"
    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkPlaceName(filename), headers=headers)
|||
@app.route('/sse/checkCompanyName', methods=['GET'])
def checkCompanyNameWebSse():
    """SSE endpoint streaming company-name check progress for a file."""
    filename = request.args.get('filename')

    def generate_checkCompanyName(filename):
        # Fix: the event counter was initialised but never incremented, so
        # every SSE message carried id 1; increment once per event instead.
        event_id = 0
        for i in checkCompanyName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkCompanyName\n"
            yield f"data: {i}\n\n"

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkCompanyName(filename), headers=headers)
|||
|
|||
@app.route('/sse/checkDocumentErrorWeb', methods=['GET'])
def checkDocumentErrorWebSse():
    """SSE endpoint streaming typo/error check progress for a file."""
    filename = request.args.get('filename')

    def generate_checkDocumentError(filename):
        # Fix: the event counter was initialised but never incremented, so
        # every SSE message carried id 1; increment once per event instead.
        event_id = 0
        for i in getDocumentError(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: getDocumentError\n"
            yield f"data: {i}\n\n"

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkDocumentError(filename), headers=headers)
|||
@app.route('/sse/checkTitleName', methods=['GET'])
def checkTitleNameWebSse():
    """SSE endpoint streaming title check progress for a file."""
    filename = request.args.get('filename')

    def generate_checkTitleName(filename):
        # Fix: the event counter was initialised but never incremented, so
        # every SSE message carried id 1; increment once per event instead.
        event_id = 0
        for i in checkTitleName(filename):
            event_id += 1
            yield f"id: {event_id}\n"
            yield f"event: checkTitleName\n"
            yield f"data: {i}\n\n"

    headers = {
        "Content-Type": "text/event-stream",
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
        "Access-Control-Allow-Origin": "*",
        "Access-Control-Allow-Methods": "GET,POST",
        "Access-Control-Allow-Headers": "x-requested-with,content-type",
    }
    return Response(generate_checkTitleName(filename), headers=headers)
|||
# Dev entry point: binds all interfaces on port 80. NOTE(review): this is the
# Flask development server — use a production WSGI server for deployment.
if __name__ == '__main__':
    app.run(host="0.0.0.0",port=80)
@ -0,0 +1,132 @@ |
|||
import pprint |
|||
import urllib.parse |
|||
import json5 |
|||
from qwen_agent.agents import Assistant |
|||
from qwen_agent.tools.base import BaseTool, register_tool |
|||
import requests |
|||
import baidusearch |
|||
import tqdm |
|||
|
|||
# 使用示例 |
|||
|
|||
|
|||
|
|||
# Step 1 (Optional): Add a custom tool named `my_image_gen`. |
|||
# Step 1 (optional): a custom tool named `my_image_gen`.
@register_tool('my_image_gen')
class MyImageGen(BaseTool):
    """Agent tool: text-to-image generation via the pollinations.ai URL API."""

    # `description` tells the agent what this tool does.
    description = 'AI painting (image generation) service, input text description, and return the image URL drawn based on text information.'
    # `parameters` declares the tool's input schema for the agent.
    parameters = [{
        'name': 'prompt',
        'type': 'string',
        'description': 'Detailed description of the desired image content, in English',
        'required': True
    }]

    def call(self, params: str, **kwargs) -> str:
        # `params` is the JSON argument blob produced by the LLM agent.
        prompt = json5.loads(params)['prompt']
        # URL-encode the prompt so it is safe as a path segment.
        encoded = urllib.parse.quote(prompt)
        return json5.dumps(
            {'image_url': f'https://image.pollinations.ai/prompt/{encoded}'},
            ensure_ascii=False)
|||
|
|||
|
|||
@register_tool('chaxun')
class BaiduSearchTool(BaseTool):
    """Agent tool: web search via Baidu for questions the model cannot answer.

    Fix: this class was previously also named `MyImageGen`, silently shadowing
    the image-generation class above; renamed for clarity (the class name is
    never referenced directly — registration happens via the decorator).
    """

    # `description` tells the agent what this tool does.
    description = '如果你不会,请使用此工具进行联网查询'
    # `parameters` declares the tool's input schema for the agent.
    parameters = [{
        'name': 'prompt',
        'type': 'string',
        'description': '请你描述需要提问的信息,以此帮助你了解更多的信息',
        'required': True
    }]

    def call(self, params: str, **kwargs) -> str:
        # `params` is the JSON argument blob produced by the LLM agent.
        prompt = json5.loads(params)['prompt']
        # NOTE(review): the query is URL-encoded before being handed to
        # baidusearch.search — presumably intentional, but searching the
        # percent-encoded text looks suspect; confirm against baidusearch docs.
        prompt = urllib.parse.quote(prompt)
        search_tool = baidusearch.search(prompt, num_results=20)
        print(search_tool)
        # NOTE(review): declared return type is str but a result list is
        # returned; downstream appears to tolerate it — confirm.
        return search_tool
|||
# Step 2: Configure the LLM you are using.
# Fill in the model name plus either a model_server (self-hosted) or an api_key.
llm_cfg = {
    # Use the model service provided by DashScope:
    # model: model name
    # model_server: where the model is served
    # api_key: falls back to the DASHSCOPE_API_KEY environment variable if unset
    'model':"qwen2-72b-instruct",
    'model_server': 'DashScope',  # base_url, also known as api_base
    # SECURITY(review): hardcoded API key committed to source — rotate it and
    # load from an environment variable instead.
    'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    # 'api_key': 'YOUR_DASHSCOPE_API_KEY',

    # Use a model service compatible with the OpenAI API, such as vLLM or Ollama:
    # 'model': 'Qwen1.5-7B-Chat',
    # 'model_server': 'http://localhost:8000/v1',  # base_url, also known as api_base
    # 'api_key': 'EMPTY',

    # (Optional) LLM hyperparameters for generation:
    'generate_cfg': {
        'top_p': 0.8
    }
}

# Step 3: Create an agent. The `Assistant` agent can use tools and read files.

# System prompt steering the agent toward using the `chaxun` search tool
# when it does not know the answer (kept in Chinese: it is runtime text).
system_instruction = '''
你是一个乐于助人的助手。
收到用户的请求后,您应:
你应该进行思考,判断是否使用工具,
如果遇到你不会回答,请使用工具[chaxun]
'''

# Tool list available to the Assistant.
tools = ["chaxun"]  # `code_interpreter` is a built-in tool for executing code.
# Files the assistant may read:
# files = ['./examples/resource/doc.pdf']  # Give the bot a PDF file to read.

# Instantiate the Assistant agent.
bot = Assistant(llm=llm_cfg,
                system_message=system_instruction,
                function_list=tools,
                # files=files
                )

# Step 4: Run the agent as a chatbot.
messages = []  # This stores the chat history.
|||
def getxinx(context):
    """Run the agent on a single user message.

    Yields the placeholder string "请稍等.." once per intermediate response
    chunk the bot produces, so an SSE client sees liveness while the agent works.
    """
    chat = [{'role': 'user', 'content': context}]
    print(chat)
    response = []
    for rsp in bot.run(messages=chat):
        response.append(rsp)
        yield "请稍等.."
@ -0,0 +1,109 @@ |
|||
import time
import json
import math
from flask import Flask,Response,request
from flask_sse import sse
from flask_cors import CORS
import re
import qwen_agenttext
app = Flask(__name__)
cros = CORS(app)
# SSE push function
import paddle;
# NOTE(review): called for its side effect only — the returned device list is
# discarded; presumably this forces paddle device initialisation at import
# time. Confirm before removing.
paddle.device.get_available_device()
|||
|
|||
|
|||
# SSE 推送路由 |
|||
|
|||
|
|||
# @app.route('/register', methods=["GET"]) |
|||
# def register(): |
|||
# 获取客户端标识符 |
|||
# client_id = str(uuid.uuid4()) |
|||
# |
|||
# # 返回 SSE 响应 |
|||
# return jsonify({"client_id": client_id}) |
|||
|
|||
|
|||
# SSE 推送路由 |
|||
|
|||
|
|||
# @app.route('/sse', methods=['POST']) |
|||
# def stream(): |
|||
# # 获取客户端标识符 |
|||
# client_id = 1 |
|||
# print("client_id", client_id) |
|||
# |
|||
# def aa(): |
|||
# # 循环发送 SSE 数据 |
|||
# for i in range(10): |
|||
# data = 'Hello, %s!' % client_id + str(i) |
|||
# print(data) |
|||
# sse.publish(data, channel=client_id, type='message') |
|||
# time.sleep(1) |
|||
# sse.publish("end", channel=client_id, type='message') |
|||
# |
|||
# # 返回 SSE 响应 |
|||
# response = Response(aa(), mimetype='text/event-stream') |
|||
# response.headers.add('Cache-Control', 'no-cache') |
|||
# response.headers.add('Connection', 'keep-alive') |
|||
# response.headers.add('X-Accel-Buffering', 'no') |
|||
# return response |
|||
# |
|||
# |
|||
# |
|||
# @app.route('/stream' ,methods=["GET", "POST"]) |
|||
# def stream_numbers(): |
|||
# context= request.args.get('context') |
|||
# |
|||
# |
|||
# headers = { |
|||
# "Content-Type": "text/event-stream", |
|||
# "Cache-Control": "no-cache", |
|||
# "X-Accel-Buffering": "no", |
|||
# "Access-Control-Allow-Origin": "*", |
|||
# "Access-Control-Allow-Methods": "GET,POST", |
|||
# "Access-Control-Allow-Headers": "x-requested-with,content-type", |
|||
# } |
|||
# return Response(generate_numbers(),headers=headers) |
|||
# def generate_numbers(): |
|||
# event_id=0 |
|||
# # for number in range(1, 10): |
|||
# # json_data = json.dumps({"number": number}) |
|||
# # print(json_data) |
|||
# # event_id += 1 |
|||
# # yield f"id: {event_id}\n" |
|||
# # yield f"event: time-update\n" |
|||
# # yield f"data: {json_data}\n\n" # 每次生成一个数字就发送 |
|||
# json_data = json.dumps({"number": "done"}) |
|||
# yield f"id: {1}\n" |
|||
# yield f"event: time-update\n" |
|||
# yield f"data: 34568\n\n" # 发送完成信号 |
|||
# if __name__ == '__main__': |
|||
# |
|||
# |
|||
# # 读取文件内容 |
|||
# with open("checkPlaceName.txt", "r", encoding='utf-8') as f: |
|||
# gettext = f.read() |
|||
# batchNum=20 |
|||
# sentences = re.split(r'[。\n]', gettext) |
|||
# # 去掉空字符 |
|||
# sentences = [sentence.strip() for sentence in sentences if sentence.strip()] |
|||
# # 计算总字符数 |
|||
# total_chars = len(sentences) |
|||
# |
|||
# # 计算有多少份 |
|||
# num_chunks = math.ceil(total_chars / batchNum) |
|||
# |
|||
# # 按batchNum字为一份进行处理 |
|||
# chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)] |
|||
# |
|||
# # 打印每一份的内容 |
|||
# for i, chunk in enumerate(chunks): |
|||
# print(f"Chunk {i + 1}:") |
|||
# print(chunk) |
|||
# print("-" * 40) |
|||
# |
|||
# # 打印总份数 |
|||
# print(f"Total chunks: {num_chunks}") |
|||
# app.run(debug=True,port=80) |
After Width: | Height: | Size: 420 KiB |
After Width: | Height: | Size: 245 KiB |
After Width: | Height: | Size: 117 KiB |
After Width: | Height: | Size: 17 KiB |
After Width: | Height: | Size: 62 KiB |
After Width: | Height: | Size: 41 KiB |
After Width: | Height: | Size: 34 KiB |
After Width: | Height: | Size: 24 KiB |
After Width: | Height: | Size: 211 KiB |
After Width: | Height: | Size: 916 KiB |
After Width: | Height: | Size: 217 KiB |
After Width: | Height: | Size: 252 KiB |
After Width: | Height: | Size: 904 KiB |
@ -0,0 +1,12 @@ |
|||
{ |
|||
"shell_port": 3199, |
|||
"iopub_port": 3205, |
|||
"stdin_port": 3200, |
|||
"control_port": 3201, |
|||
"hb_port": 3209, |
|||
"ip": "127.0.0.1", |
|||
"key": "41711130-ba4287db5e2a6e7b98444c31", |
|||
"transport": "tcp", |
|||
"signature_scheme": "hmac-sha256", |
|||
"kernel_name": "" |
|||
} |
@ -0,0 +1,12 @@ |
|||
{ |
|||
"shell_port": 36295, |
|||
"iopub_port": 36301, |
|||
"stdin_port": 36296, |
|||
"control_port": 36297, |
|||
"hb_port": 36305, |
|||
"ip": "127.0.0.1", |
|||
"key": "0faec31a-0f91a316abd70cf50f57dbad", |
|||
"transport": "tcp", |
|||
"signature_scheme": "hmac-sha256", |
|||
"kernel_name": "" |
|||
} |
@ -0,0 +1,12 @@ |
|||
{ |
|||
"shell_port": 5355, |
|||
"iopub_port": 5362, |
|||
"stdin_port": 5356, |
|||
"control_port": 5358, |
|||
"hb_port": 5366, |
|||
"ip": "127.0.0.1", |
|||
"key": "de89d28a-7beb5da33100363d2c20fd6b", |
|||
"transport": "tcp", |
|||
"signature_scheme": "hmac-sha256", |
|||
"kernel_name": "" |
|||
} |
@ -0,0 +1,12 @@ |
|||
{ |
|||
"shell_port": 3079, |
|||
"iopub_port": 3085, |
|||
"stdin_port": 3080, |
|||
"control_port": 3081, |
|||
"hb_port": 3089, |
|||
"ip": "127.0.0.1", |
|||
"key": "1825b8a3-a33137bc69e3375f26f384a3", |
|||
"transport": "tcp", |
|||
"signature_scheme": "hmac-sha256", |
|||
"kernel_name": "" |
|||
} |
@ -0,0 +1,12 @@ |
|||
{ |
|||
"shell_port": 36740, |
|||
"iopub_port": 36746, |
|||
"stdin_port": 36741, |
|||
"control_port": 36742, |
|||
"hb_port": 36750, |
|||
"ip": "127.0.0.1", |
|||
"key": "ac6de478-4a3be71d79c2c63da7065148", |
|||
"transport": "tcp", |
|||
"signature_scheme": "hmac-sha256", |
|||
"kernel_name": "" |
|||
} |
@ -0,0 +1,12 @@ |
|||
{ |
|||
"shell_port": 2563, |
|||
"iopub_port": 2569, |
|||
"stdin_port": 2564, |
|||
"control_port": 2565, |
|||
"hb_port": 2573, |
|||
"ip": "127.0.0.1", |
|||
"key": "7e020774-be96933cbe5aaad90c1c9bfc", |
|||
"transport": "tcp", |
|||
"signature_scheme": "hmac-sha256", |
|||
"kernel_name": "" |
|||
} |
@ -0,0 +1,12 @@ |
|||
{ |
|||
"shell_port": 5840, |
|||
"iopub_port": 5846, |
|||
"stdin_port": 5841, |
|||
"control_port": 5842, |
|||
"hb_port": 5850, |
|||
"ip": "127.0.0.1", |
|||
"key": "e4c27d68-1c3a9dfa16551f35481b05b8", |
|||
"transport": "tcp", |
|||
"signature_scheme": "hmac-sha256", |
|||
"kernel_name": "" |
|||
} |
@ -0,0 +1,3 @@ |
|||
|
|||
from ipykernel import kernelapp as app |
|||
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||
|
|||
from ipykernel import kernelapp as app |
|||
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||
|
|||
from ipykernel import kernelapp as app |
|||
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||
|
|||
from ipykernel import kernelapp as app |
|||
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||
|
|||
from ipykernel import kernelapp as app |
|||
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||
|
|||
from ipykernel import kernelapp as app |
|||
app.launch_new_instance() |
@ -0,0 +1,3 @@ |
|||
|
|||
from ipykernel import kernelapp as app |
|||
app.launch_new_instance() |
After Width: | Height: | Size: 2.8 MiB |
@ -0,0 +1,140 @@ |
|||
from docx import Document
from paddlenlp import Taskflow
from pprint import pprint
from qwen_agent.agents import Assistant
import re
import json_repair
import time
import math
# PaddleNLP named-entity-recognition pipeline, shared by all checks below.
tagTask = Taskflow("ner")
# Prompt asking the LLM to judge place names; kept in Chinese (runtime text).
prompt='''
.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
不做过多的解释,严格按回答格式作答;
'''
# Alternative prompt, kept for reference:
# prompt='''
# .请回答以上问题,
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答;
# 不做过多的解释,严格按回答格式作答;
# '''
llm_cfg = {
    #'model': 'qwen1.5-72b-chat',
    'model':"qwen2-72b",
    'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base
    # 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
}
bot = Assistant(llm=llm_cfg,
                name='Assistant',
                # description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。'
                )
|||
#获取全文内容 |
|||
def getDocxToTextAll(name):
    """Extract every non-empty paragraph of a .docx file to "checkPlaceName.txt".

    Paragraphs are kept in document order, joined with newlines, and written
    UTF-8 encoded so later pipeline steps can read the text back.

    :param name: path of the .docx document to read
    """
    document = Document(name)
    # Collect only non-blank paragraphs (removed unused levelList/addStart/
    # levelText/i scaffolding from the original).
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    print("placeNameTask", len(words))
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
|||
|
|||
#得到全文和地名有关的内容 |
|||
def placeNameTask(text):
    """Generator: run NER over *text* in chunks and yield progress, then names.

    The text is split on "。" and newlines, processed 20 sentences at a time,
    and consecutive org/region tokens are merged into a single name.  Every
    yield except the last is a human-readable progress string; the final
    yield is the de-duplicated list of place names.
    """
    batchNum = 20
    sentences = [s.strip() for s in re.split(r'[。\n]', text) if s.strip()]
    # Renamed from the misleading "total_chars": this counts sentences.
    total_sentences = len(sentences)
    num_chunks = math.ceil(total_sentences / batchNum)
    chunks = [sentences[i:i + batchNum]
              for i in range(0, total_sentences, batchNum)]
    placeList = []
    for i, chunk in enumerate(chunks):
        yield f"文档地名检查---文档解析进度:{i + 1}/{num_chunks}"
        wenBen = ".".join(chunk)
        print(chunk)
        res = tagTask(wenBen)
        isplace = False
        for zuhe in res:
            is_target = (zuhe[1].find("组织机构类") >= 0
                         or zuhe[1].find("世界地区类") >= 0)
            new_text = zuhe[0].replace("\n", "")
            if isplace and is_target:
                # Consecutive entity tokens belong to one name: merge into
                # the previously collected entry (removed the dead `name`
                # assignment taken on the non-entity branch).
                placeList[-1] += new_text
                continue
            if is_target:
                isplace = True
                placeList.append(new_text)
            else:
                isplace = False
        print("-" * 40)
    yield "文档地名检查---文档解析完成"
    # De-duplicate while preserving first-seen order.
    placeList = list(dict.fromkeys(placeList))
    yield placeList
|||
#主方法 |
|||
def checkPlaceName(filename):
    """Generator: check place names in a .docx file via NER plus an LLM.

    Yields UI progress strings; the final yield is a markdown-ish report of
    suspect place names with the sentence each occurred in, or a success
    message when nothing suspicious is found.

    :param filename: path of the .docx document
    """
    yield "文档地名检查---开始处理文档..."
    getDocxToTextAll(filename)
    with open("checkPlaceName.txt", "r", encoding='utf-8') as f:
        gettext = f.read()
    yield "文档地名检查---开始解析文档..."
    # Fix: initialise so a degenerate generator cannot leave this unbound.
    final_list = []
    for item in placeNameTask(gettext):
        if isinstance(item, str):
            yield item            # progress message: pass straight through
        else:
            final_list = item     # the generator's last yield is the name list
    propnStr = ",".join(final_list)
    print("placeNameTask", propnStr)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    yield "文档地名检查---结果生成中..."
    cishu = 0
    for rsp in bot.run(messages):
        runList.append(rsp)
        if cishu > 3:
            cishu = 0
            yield "文档地名检查---结果生成中" + '.' * cishu
        cishu += 1
    data = runList[-1][0]["content"]
    print("placeNameTask", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    # Keep only the entries the model judged wrong; .get guards against
    # malformed LLM output missing the key.
    error_places = [place for place in parsed_data if place.get('回答') == '错误']
    print("placeNameTask", error_places)
    returnInfo = "发现异常地名<br />"
    if error_places:
        for t in error_places:
            keyword = t['placeName']
            # Find a containing paragraph for context.
            paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext)
            if not paragraphs:
                # Fix: the keyword may sit on the final, newline-less line;
                # the original indexed paragraphs[0] and crashed here.
                paragraphs = [keyword]
            yuanwen = paragraphs[0].replace(keyword, f"**{keyword}**").replace("\n", "")
            returnInfo += "原文:" + yuanwen + "<br />出现异常地名:**" + keyword + "**!请注意" + "<br />"
        yield returnInfo
        print(returnInfo)
    else:
        # Fix: original message had a duplicated word ("未发现发现").
        yield "**未发现异常地名**"
@ -0,0 +1,118 @@ |
|||
import re |
|||
import time |
|||
from docx import Document |
|||
from pprint import pprint |
|||
# from paddlenlp import Taskflow |
|||
# |
|||
# similarity = Taskflow("text_similarity", truncation=True, max_length=102400) |
|||
|
|||
|
|||
def getOutlineLevel(inputXml):
    """Extract the number from an ``<w:outlineLvl w:val="number"/>`` tag.

    :param inputXml: XML string of a paragraph or style element that is
        known to contain a ``<w:outlineLvl>`` tag
    :return: the outline level digits as a string (e.g. "0" for level 1)
    :raises AttributeError: if the located tag carries no digits
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    tag = inputXml[start_index:end_index + 1]
    # Fix: raw string — "\d+" is an invalid escape sequence in modern Python.
    return re.search(r"\d+", tag).group()
|||
|
|||
|
|||
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for plain body text.

    :param paragraph: a python-docx paragraph
    :return: None for blank lines and ordinary text; otherwise a string:
        "0" = level-1 heading, "1" = level-2, "2" = level-3, ...
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None

    # An outline level set directly on the paragraph XML wins.
    para_xml = paragraph._p.xml
    if '<w:outlineLvl' in para_xml:
        return getOutlineLevel(para_xml)

    # Otherwise walk the style inheritance chain looking for one.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if '<w:outlineLvl' in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style

    # Neither the paragraph nor any ancestor style declares a level.
    return None
|||
|
|||
def getDocxToText12biaoti(name):
    """Extract the numbered heading outline (levels 1-3) of a .docx file.

    Headings are numbered "1:", "1.1", "1.1.1" ... in document order;
    level-1 headings containing "附件" (appendix) are skipped.  The outline
    is also written to "ce1.txt" for inspection.

    :param name: path of the .docx document
    :return: list of numbered heading strings
    :raises Exception: if the document contains no recognised headings
    """
    document = Document(name)
    words = []
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():
            continue
        level = isTitle(paragraph)
        if level == "0":
            firstTitle += 1
            secondTitle = 0
            if text.find("附件") >= 0:
                continue
            words.append("{}:".format(firstTitle) + text)
        elif level == "1":
            secondTitle += 1
            sanjiTitle = 0
            words.append("{}.{}".format(firstTitle, secondTitle) + text)
        elif level == "2":
            sanjiTitle += 1
            words.append("{}.{}.{}".format(firstTitle, secondTitle, sanjiTitle) + text)
    print(len(words))
    if len(words) == 0:
        # Fix: meaningful message instead of the "I know python!" placeholder.
        raise Exception("未在文档中找到任何标题")
    outline = '\n'.join(words)
    with open("ce1.txt", 'w', encoding="utf-8") as txt_file:
        txt_file.write(outline)
    return words
|||
# Scratch state left over from the (commented-out) template-comparison
# experiment: template headings, current-document headings, mismatches.
mobanList = []
dangqianList = []
errorList = []

# Prompt used to ask the LLM whether the outline covers a given topic.
prompt = '''{}这是文档大纲,根据大纲分析文档中是否有{}这块内容的描述,若不存在请回答不存在
'''
dagang = "1"
biaozhun = "2"
print(prompt.format(dagang, biaozhun))
@ -0,0 +1,282 @@ |
|||
import re |
|||
import os |
|||
import docx |
|||
from docx.document import Document |
|||
from docx.text.paragraph import Paragraph |
|||
from docx.parts.image import ImagePart |
|||
from qwen_agent.agents import Assistant |
|||
|
|||
from docx.oxml.table import CT_Tbl |
|||
from docx.oxml.text.paragraph import CT_P |
|||
|
|||
import shutil |
|||
import re |
|||
import json_repair |
|||
import uuid |
|||
|
|||
# 记录程序开始的时间戳 |
|||
def getOutlineLevel(inputXml):
    """Extract the number from an ``<w:outlineLvl w:val="number"/>`` tag.

    :param inputXml: XML string of a paragraph or style element that is
        known to contain a ``<w:outlineLvl>`` tag
    :return: the outline level digits as a string (e.g. "0" for level 1)
    :raises AttributeError: if the located tag carries no digits
    """
    start = inputXml.find('<w:outlineLvl')
    end = inputXml.find('>', start)
    tag = inputXml[start:end + 1]
    # Fix: raw string — "\d+" is an invalid escape sequence in modern Python.
    return re.search(r"\d+", tag).group()
|||
|
|||
|
|||
def isTitle(paragraph):
    """Return the outline level of *paragraph*, or None for plain body text.

    :param paragraph: a python-docx paragraph
    :return: None for blank lines and ordinary text; otherwise a string:
        "0" = level-1 heading, "1" = level-2, "2" = level-3, ...
    """
    if not paragraph.text.strip():
        # Blank paragraphs are never headings.
        return None

    # Prefer an outline level set directly on the paragraph's own XML.
    own_xml = paragraph._p.xml
    if '<w:outlineLvl' in own_xml:
        return getOutlineLevel(own_xml)

    # Fall back to the style chain: the level may be inherited.
    current = paragraph.style
    while current is not None:
        xml = current.element.xml
        if '<w:outlineLvl' in xml:
            return getOutlineLevel(xml)
        current = current.base_style

    return None
|||
|
|||
|
|||
# 该行只能有一个图片 |
|||
# 该行只能有一个图片
def is_image(graph: Paragraph, doc: Document):
    """Return True if the paragraph embeds at least one real image part.

    Delegates to get_ImagePart so the two helpers share one lookup and
    cannot drift apart (the original duplicated the xpath walk).

    :param graph: paragraph to inspect
    :param doc: owning document (resolves relationship ids)
    """
    return get_ImagePart(graph, doc) is not None
|||
|
|||
|
|||
# 获取图片(该行只能有一个图片) |
|||
# 获取图片(该行只能有一个图片)
def get_ImagePart(graph: Paragraph, doc: Document):
    """Return the first ImagePart embedded in *graph*, or None.

    Resolves every <pic:pic> drawing's relationship id against the
    document's related parts and returns the first genuine image.

    :param graph: paragraph to inspect
    :param doc: owning document (resolves relationship ids)
    """
    for pic in graph._element.xpath('.//pic:pic'):
        for rel_id in pic.xpath('.//a:blip/@r:embed'):
            candidate = doc.part.related_parts[rel_id]
            if isinstance(candidate, ImagePart):
                return candidate
    return None
|||
#寻找标题名称 |
|||
#寻找标题名称
def findTitleName(docxPath):
    """Generator: locate the level-1 heading holding the detailed design chapter.

    Yields a progress string first; the final yield is either the heading
    name the LLM selected or a human-readable failure message.

    :param docxPath: path of the .docx document
    """
    yield '文档图片信息检查----检查是否存在详细设计方案'
    document = docx.Document(docxPath)
    titleWords = []
    firstTitle = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():
            continue
        level = isTitle(paragraph)
        # Only level-1 headings are offered to the model; "附件" chapters
        # (appendices) are skipped.
        if level == "0":
            firstTitle += 1
            if text.find("附件") >= 0:
                continue
            # Fix: the original called "一级标题:".format(firstTitle), a no-op
            # format on a placeholder-free string.
            titleWords.append("一级标题:" + text)
    findTitleName_llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg,
                                  name='Assistant',
                                  )
    prompt = '''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择
类似设计方案,服务方案,建设方案为次相关,次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答
'''
    messages = [({'role': 'user', 'content': "\n".join(titleWords) + prompt})]
    runList = []
    for rsp in findTitleName_bot.run(messages):
        runList.append(rsp)
    data = runList[-1][0]["content"]
    parsed_data = json_repair.loads(data.replace('`', ''))
    print(parsed_data)
    # .get guards against the model omitting keys in its JSON reply.
    if parsed_data.get("answer") == "存在":
        print("存在", parsed_data.get("name"))
        yield parsed_data.get("name")
    else:
        print("不存在", parsed_data.get("name"))
        yield "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"
|||
def saveImage(fileName, titleName, imagePath):
    """Save every image appearing under the level-1 heading *titleName*.

    Walks the document in order, tracking the current level-1 heading;
    while inside *titleName*, each image paragraph's bytes are written to
    *imagePath* with the nearest heading prefixed to the file name.

    :param fileName: path of the .docx document
    :param titleName: exact text of the level-1 heading to collect under
    :param imagePath: existing directory to write image files into
    """
    fristName = ""
    # Fix: initialise — the original raised UnboundLocalError if an image
    # appeared before any heading.
    levelText = ""
    doc = docx.Document(fileName)
    for paragraph in doc.paragraphs:
        text = paragraph.text
        if text.strip():
            level = isTitle(paragraph)
            if level == "0":
                fristName = text
                print(text)
            if level:
                levelText = f"{int(level) + 1}级标题-" + text
        else:
            # Blank text: the paragraph may carry an image (or a table).
            if is_image(paragraph, doc) and fristName == titleName:
                part = get_ImagePart(paragraph, doc)
                img_name = levelText + "_" + os.path.basename(part.partname)
                with open(f'{imagePath}/{img_name}', "wb") as f:
                    f.write(part.blob)
|||
#保存完成后,上传大模型进行分析 |
|||
#保存完成后,上传大模型进行分析
def checkImageText(filename):
    """Generator: verify that images under the design chapter match the text.

    Extracts the images beneath the chapter chosen by findTitleName, asks a
    vision model to describe each, then asks a text model whether each
    described item is covered by the document.  Yields progress strings;
    the final yield is a report of undescribed image content or a success
    message.

    :param filename: path of the .docx document
    """
    llm_cfg_vl = {
        'model': "qwen-vl-max",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    botImage = Assistant(llm=llm_cfg_vl,
                         name='Assistant',
                         )
    llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    bot = Assistant(llm=llm_cfg,
                    name='Assistant',
                    )
    for titleName in findTitleName(filename):
        yield titleName
        if titleName != "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查":
            yield "文档图片信息检查----文档内容解析中"
            imagePath = "Image" + str(uuid.uuid4())
            os.mkdir(imagePath)
            saveImage(filename, titleName, imagePath)
            imagePathList = os.listdir(imagePath)
            count = 0
            resMap = {}
            for image in imagePathList:
                count += 1
                yield f"文档图片信息检查---当前处理进度{count}/{len(imagePathList)}"
                # Fix: join against the per-run uuid folder; the original
                # used the literal string "imagePath" and pointed nowhere.
                outpath = os.path.join(imagePath, image)
                print(outpath)
                messagesImage = [{'role': 'user', "content": [{"image": outpath}, {"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
                runListImage = []
                for rsp in botImage.run(messagesImage):
                    runListImage.append(rsp)
                data = runListImage[-1][0]["content"]
                print(str(data))
                prompt = '''
依次上述内容是否与文档有关,你只能在[无关,有关]选项中选择答案,
按照这样的格式回答[{“text”:“内容”,"answer":"答案"},{“text”:“内容”,"answer":"答案"}]不做过多的解释,严格按回答格式作答
'''
                messages = [{'role': 'user', 'content': [{'text': str(data) + prompt}, {"file": filename}]}]
                runList = []
                for rsp in bot.run(messages):
                    runList.append(rsp)
                textdata = runList[-1][0]["content"]
                print(textdata)
                parsed_data = json_repair.loads(textdata)
                print(parsed_data)
                for res in parsed_data:
                    if res.get("answer") == "无关":
                        # Fix: the reply's payload key is "text"; the original
                        # printed the nonexistent res["name"] and crashed.
                        print("无关", res.get("text"))
                        if image in resMap:
                            # Already flagged content for this image: append.
                            resMap[image] = resMap[image] + "," + res.get("text", "")
                        else:
                            resMap[image] = res.get("text", "")
            out = ''
            if len(resMap) > 0:
                # Fix: iterate items() — iterating the dict alone yields keys,
                # so the original's "for key, value in resMap" raised.
                for key, value in resMap.items():
                    out += f"在{key}图片中,{value}以上内容在文档中未出现相关描述<br>"
                yield out
            else:
                yield "文档图片信息检查----图文符合要求"
            shutil.rmtree(imagePath)
|||
# except Exception as e: |
|||
# yield f"文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查" |
|||
# return |
|||
if __name__ == '__main__':
    # Manual smoke test: stream the check results for a sample document.
    # Guarded so importing this module no longer triggers a full LLM run.
    for message in checkImageText("1.docx"):
        print(message)
|||
# import docx |
|||
# doc = docx.Document('1.docx') |
|||
# dict_rel = doc.part._rels # rels其实是个目录 |
|||
# for rel in dict_rel: |
|||
# rel = dict_rel[rel] |
|||
# print("rel", rel.target_ref) |
|||
# if "image" in rel.target_ref: |
|||
# # create_dir(desc_path) |
|||
# img_name = re.findall("/(.*)", rel.target_ref)[0] # windos:/ |
|||
# print("img_name", img_name) |
|||
# word_name = os.path.splitext("1.docx")[0] |
|||
# print("word_name", word_name) |
|||
# #检查文件路径分隔符(os.sep),并根据不同的操作系统(Windows或Unix/Linux)处理文件名。 |
|||
# if os.sep in word_name: |
|||
# new_name = word_name.split('\\')[-1] |
|||
# else: |
|||
# new_name = word_name.split('/')[-1] |
|||
# img_name = f'{new_name}_{img_name}' |
|||
# print(img_name) |
|||
# desc_path='workspace' |
|||
# with open(f'{desc_path}/{img_name}', "wb") as f: |
|||
# f.write(rel.target_part.blob) |
|||
# # |
|||
# # # prompt=''' |
|||
# # # .根据上述文本判断,是否为非泛化的公司或组织名称,你可以使用工具利用互联网查询,你只能在[非泛化的公司或组织名称,公益组织,统称,泛化名称,政府单位,机关单位,学校,委员单位]选项中选择答案,回答格式[{“placeName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
# # # ''' |
|||
# llm_cfg_vl = { |
|||
# #'model': 'qwen1.5-72b-chat',qwen2-72b-instruct |
|||
# 'model':"qwen-vl-max", |
|||
# 'model_server': 'DashScope', # base_url, also known as api_base |
|||
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
|||
# } |
|||
# botvl = Assistant(llm=llm_cfg_vl, |
|||
# name='Assistant', |
|||
# # system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"1_image4 |
|||
# ) |
|||
# messages = [{'role': 'user', "content": [{"image": "workspace/1.png"},{"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}] |
|||
# runList = [] |
|||
# for rsp in botvl.run(messages): |
|||
# runList.append(rsp) |
|||
# print(rsp) |
|||
# data = runList[len(runList) - 1][0]["content"] |
|||
# print(str(data)) |
|||
|
@ -0,0 +1,133 @@ |
|||
# -*- coding:utf-8 -*- |
|||
import time |
|||
from docx import Document |
|||
from paddlenlp import Taskflow |
|||
from qwen_agent.agents import Assistant |
|||
import re |
|||
import json_repair |
|||
wordtag = Taskflow("knowledge_mining") |
|||
|
|||
prompt = ''' |
|||
.根据上述文本判断,是否为具体的公司或组织名称,你可以使用工具利用互联网查询, |
|||
你只能在[具体的公司或组织名称,公益组织,简称,统称,泛化组织,政府单位,机关单位,学校,行业类型,其他]选项中选择答案, |
|||
回答格式[{“companyName”:“名称”,"回答":"答案"},{“companyName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
''' |
|||
llm_cfg = { |
|||
#'model': 'qwen1.5-72b-chat', |
|||
'model':"qwen2-72b", |
|||
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
|||
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
|||
} |
|||
bot = Assistant(llm=llm_cfg, |
|||
name='Assistant', |
|||
# system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具" |
|||
) |
|||
|
|||
def getDocxToTextAll(name):
    """Extract every non-empty paragraph of a .docx file to "checkCompanyName.txt".

    Paragraphs are kept in document order, joined with newlines, and written
    UTF-8 encoded so the company-name check can read the text back in batches.

    :param name: path of the .docx document to read
    """
    document = Document(name)
    # Only non-blank paragraphs (removed unused levelList/addStart/levelText/i).
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    print("checkCompanyName", len(words))
    text = '\n'.join(words)
    with open("checkCompanyName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
|||
def checkCompanyName(filename):
    """Run the company-name check over *filename* and return suspect entries.

    Extracts the document text, processes it batch by batch, and collects
    every name the pipeline flags as a concrete company/organisation name.

    :param filename: path of the .docx document
    :return: list of flagged-name dicts
    """
    getDocxToTextAll(filename)
    start_time = time.time()
    error_places = []
    for batch in read_file_in_batches('checkCompanyName.txt'):
        flagged = process_batch(batch)
        if len(flagged) > 0:
            error_places.extend(flagged)

    print(error_places)
    # Report wall-clock duration of the whole check.
    elapsed_time = time.time() - start_time
    print(f"checkCompanyName程序执行时间: {elapsed_time} 秒")
    return error_places
|||
|
|||
def read_file_in_batches(file_path, batch_size=5000):
    """Lazily read a UTF-8 text file in chunks of whole lines.

    A chunk is flushed as soon as the accumulated character count reaches
    *batch_size*, so lines are never split across chunks.

    :param file_path: path of the file to read
    :param batch_size: minimum number of characters per yielded chunk
    :return: generator of text chunks
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        pending = []
        pending_len = 0
        for line in handle:
            pending.append(line)
            pending_len += len(line)
            if pending_len >= batch_size:
                yield ''.join(pending)
                pending = []
                pending_len = 0
        # Flush whatever is left after the final line.
        if pending:
            yield ''.join(pending)
|||
|
|||
def process_batch(batch):
    """Run knowledge mining over one text batch and confirm names with the LLM.

    Organisation tokens found by the tagger are merged (consecutive tokens
    form one name), de-duplicated, and sent to the LLM; entries the model
    judges to be concrete company/organisation names are returned.

    :param batch: a chunk of document text
    :return: list of flagged dicts, each annotated with the first containing
        paragraph under "yuanwen"
    """
    res = wordtag(batch)
    placeList = []
    isplace = False
    for zuhe in res[0]['items']:
        # Fix: guard against a missing label — str.find on None crashed.
        label = zuhe.get("wordtag_label") or ""
        is_org = label.find("组织机构类") >= 0
        new_text = zuhe['item'].replace("\n", "")
        if isplace and is_org:
            # Consecutive organisation tokens form one name: merge (removed
            # the dead `name` assignment on the non-org branch).
            placeList[-1] += new_text
            continue
        if is_org:
            isplace = True
            placeList.append(new_text)
        else:
            isplace = False
    # De-duplicate preserving first-seen order.
    placeList = list(dict.fromkeys(placeList))
    placeStr = ",".join(placeList)
    messages = [{'role': 'user', 'content': [{'text': placeStr + prompt}]}]
    print("checkCompanyName", placeStr + prompt)
    runList = []
    for rsp in bot.run(messages):
        runList.append(rsp)
    data = runList[-1][0]["content"]
    print("checkCompanyName", data)
    parsed_data = json_repair.loads(data.replace('`', ''))
    error_places = [place for place in parsed_data
                    if place.get('回答') == '具体的公司或组织名称']
    print("checkCompanyName", error_places)
    for t in error_places:
        keyword = t['companyName']
        # Attach the first paragraph containing the name for context.
        paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', batch)
        # Fix: guard — the name may sit on the final, newline-less line and
        # the original's paragraphs[0] raised IndexError.
        t["yuanwen"] = paragraphs[0] if paragraphs else keyword
    return error_places
@ -0,0 +1,226 @@ |
|||
#-*- coding:utf-8 -*- |
|||
# from pycorrector import MacBertCorrector |
|||
# m = MacBertCorrector("shibing624/macbert4csc-base-chinese") |
|||
from qwen_agent.agents import Assistant |
|||
from docx import Document |
|||
from pprint import pprint |
|||
import re |
|||
from paddlenlp import Taskflow |
|||
import json |
|||
import time |
|||
import json_repair |
|||
print(json_repair.loads('{"name":""aaaa"}')) |
|||
start_time = time.time() |
|||
corrector = Taskflow("text_correction") |
|||
llm_cfg = { |
|||
#'model': 'qwen1.5-72b-chat', |
|||
'model':"qwen2-72b", |
|||
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
|||
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
|||
} |
|||
bot = Assistant(llm=llm_cfg, |
|||
name='Assistant', |
|||
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' |
|||
|
|||
) |
|||
# prompt=''' |
|||
# 是否存在错别字,若存在请指出,不做其他方面的校验,你只能在[存在,不存在,未知]选项中选择答案, |
|||
# 回答格式[{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”:“原文”,"改正后":"改正的内容","回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
# ''' |
|||
prompt=''' |
|||
请回答以上问题,[是,否]选项中选择答案,原文内容,标点符号保持不变,如果有错请给出解析,没有错则不用给解析 |
|||
回答格式请按照以下json格式[{"placeName":"序号","回答":"答案","jianyi","解析"},{"placeName":"序号","回答":"答案","jianyi","解析"}],不做过多的解释,严格按回答格式作答; |
|||
''' |
|||
def getDocxToTextAll(name):
    """Extract every non-empty paragraph of a .docx file to "checkDocumentError.txt".

    Paragraphs are kept in document order, joined with newlines, and written
    UTF-8 encoded so the typo check can read the text back in batches.

    :param name: path of the .docx document to read
    """
    document = Document(name)
    # Only non-blank paragraphs (removed unused levelList/addStart/levelText/i).
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    print("checkDocumentError", len(words))
    text = '\n'.join(words)
    with open("checkDocumentError.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
|||
def getDocumentError(filename):
    """Run the typo check over *filename* and return the confirmed errors.

    :param filename: path of the .docx document
    :return: list of confirmed-typo dicts from process_batch
    """
    getDocxToTextAll(filename)
    # Fix: time this call locally. The original reused the module-level
    # start_time set at import, so the reported duration measured the time
    # since the module was imported, not the check itself (the sibling
    # checkCompanyName already times locally — now consistent).
    start = time.time()
    error_places = []
    for batch in read_file_in_batches('checkDocumentError.txt'):
        res = process_batch(batch)
        if len(res) > 0:
            error_places.extend(res)

    pprint(error_places)
    elapsed_time = time.time() - start
    print(f"checkDocumentError程序执行时间: {elapsed_time} 秒")
    return error_places
|||
# |
|||
# 过滤掉填充的None(如果有的话) |
|||
# chunk = [line for line in chunk if line is not None] |
|||
# res = m.correct_batch(sentences) |
|||
# print("DocumentError",res) |
|||
# lines_with_greeting = [place for place in res if len( place['errors'])>0] |
|||
# error_places.extend(lines_with_greeting) |
|||
# pprint(error_places) |
|||
# if len(lines_with_greeting)>0: |
|||
# for t in error_places: |
|||
# keyword= t['source'] |
|||
# |
|||
# errorWord=t["errors"] |
|||
# # 查找包含关键字的段落 |
|||
# paragraphs = re.findall(r'.*?' + re.escape(keyword) + r'.*?\n', gettext) |
|||
# t["yuanwen"]=paragraphs[0] |
|||
# return error_places |
|||
# else: |
|||
# return error_places |
|||
# return lines_with_greeting |
|||
def read_file_in_batches(file_path, batch_size=5000):
    """Lazily yield a UTF-8 text file as chunks of complete lines.

    Lines accumulate until at least *batch_size* characters are buffered,
    then the buffer is flushed; a line is never split between chunks.

    :param file_path: path of the file to read
    :param batch_size: minimum number of characters per yielded chunk
    :return: generator of text chunks
    """
    with open(file_path, 'r', encoding='utf-8') as stream:
        buffered, buffered_chars = [], 0
        for line in stream:
            buffered.append(line)
            buffered_chars += len(line)
            if buffered_chars >= batch_size:
                yield ''.join(buffered)
                buffered, buffered_chars = [], 0
        # Emit any remainder that never reached the threshold.
        if buffered:
            yield ''.join(buffered)
|||
|
|||
def process_batch(batch):
    """Run the corrector over one text batch, then confirm typos with the LLM.

    Sentences flagged by the corrector are numbered and sent to the LLM; only
    entries the model answers "是" (yes, a typo) are returned, mapped back to
    their source sentences.

    :param batch: a chunk of document text
    :return: list of confirmed-typo dicts with "placeName" (source sentence)
        and "jianyi" (the model's explanation)
    """
    sentences = [s.strip() for s in re.split(r'[。\n]', batch) if s.strip()]
    res = corrector(sentences)
    flagged = [entry for entry in res if len(entry['errors']) > 0]
    err = []
    if len(flagged) > 0:
        wenti = []          # numbered questions sent to the LLM
        keyword_list = []   # index -> original sentence
        for num, entry in enumerate(flagged):
            source = entry['source']
            keyword_list.append(source)
            suspect_chars = []
            for item in entry["errors"]:
                # correction maps wrong char -> suggested char; keep the keys.
                for wrong_char in item['correction']:
                    suspect_chars.append(wrong_char)
            wenti.append("{}、原文:{}。问题:【{}】这些字是否为当前原文的错别字".format(
                num, source, ",".join(suspect_chars)))
        words = "\n".join(wenti)

        messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
        runList = []
        print(words + prompt)
        for rsp in bot.run(messages):
            runList.append(rsp)
        data = runList[-1][0]["content"]
        pprint(data)
        parsed_data = json_repair.loads(data.replace("\\", "").replace('`', ''))
        for place in parsed_data:
            if place.get('回答') != '是':
                continue
            try:
                # Fix: map the model's index back to the source sentence
                # defensively — the original crashed (KeyError/ValueError/
                # IndexError) on any malformed LLM entry.
                idx = int(place["placeName"])
                err.append({**place,
                            "placeName": keyword_list[idx],
                            "jianyi": place.get("解析", "")})
            except (KeyError, ValueError, IndexError):
                continue
        pprint(err)
    return err
|||
|
|||
# from flask import Flask, request, jsonify |
|||
# import os |
|||
# # from checkPlaceName import checkPlaceName |
|||
# # from checkRepeatText import checkRepeatText |
|||
# # from checkCompanyName import checkCompanyName |
|||
# # from documentError import getDocumentError |
|||
# app = Flask(__name__) |
|||
# UPLOAD_FOLDER = 'uploads' |
|||
# if not os.path.exists(UPLOAD_FOLDER): |
|||
# os.makedirs(UPLOAD_FOLDER) |
|||
# @app.route('/upload', methods=['POST']) |
|||
# def upload_file(): |
|||
# if 'file' not in request.files: |
|||
# return jsonify({"error": "No file part"}), 400 |
|||
# file = request.files['file'] |
|||
# if file.filename == '': |
|||
# return jsonify({"error": "No selected file"}), 400 |
|||
# if file: |
|||
# filename = file.filename |
|||
# file.save(os.path.join(UPLOAD_FOLDER,filename)) |
|||
# return jsonify({"message": "File uploaded successfully"}), 200 |
|||
# # @app.route('/checkPlaceName/<filename>', methods=['GET']) |
|||
# # def checkPlaceNameWeb(filename): |
|||
# # return checkPlaceName(filename) |
|||
# # @app.route('/checkRepeatText/<filename>', methods=['GET']) |
|||
# # def checkRepeatTextWeb(filename): |
|||
# # return checkRepeatText(filename) |
|||
# # @app.route('/checkCompanyName/<filename>', methods=['GET']) |
|||
# # def checkCompanyNameWeb(filename): |
|||
# # return checkCompanyName(filename) |
|||
# # @app.route('/checkDocumentErrorWeb/<filename>', methods=['GET']) |
|||
# # def checkDocumentErrorWeb(filename): |
|||
# # return getDocumentError(filename) |
|||
# if __name__ == '__main__': |
|||
# app.run(host='0.0.0.0',port=80) |
|||
# from transformers import AutoTokenizer, AutoModel, GenerationConfig,AutoModelForCausalLM |
|||
# import os |
|||
# os.environ['NPU_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7' |
|||
# os.environ['ASCEND_RT_VISIBLE_DEVICES']='0,1,2,3,4,5,6,7' |
|||
# import torch |
|||
# import torch_npu |
|||
# from torch_npu.contrib import transfer_to_npu |
|||
|
|||
# from accelerate import Accelerator |
|||
|
|||
# # device = 'cpu' |
|||
# accelerator = Accelerator() |
|||
# # torch_device = "npu" # 0~7 |
|||
# # torch.npu.set_device(torch.device(torch_device)) |
|||
# devices = [] |
|||
# for i in range(8): |
|||
# devices.append(f"npu:{i}") |
|||
# print(devices) |
|||
# torch.npu.set_device(devices) |
|||
# torch.npu.set_compile_mode(jit_compile=False) |
|||
# model_name_or_path = '/mnt/sdc/qwen/Qwen2-72B-Instruct' |
|||
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False) |
|||
# # model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, device_map="auto",torch_dtype=torch.float16) |
|||
# model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, device_map=accelerator,torch_dtype=torch.float16).npu().eval() |
@ -0,0 +1,153 @@ |
|||
from docx import Document |
|||
from paddlenlp import Taskflow |
|||
from pprint import pprint |
|||
from qwen_agent.agents import Assistant |
|||
import re |
|||
import json_repair |
|||
import time |
|||
tagTask = Taskflow("ner") |
|||
prompt=''' |
|||
.上述文本判断地名是否正确,你可以使用工具利用互联网查询,你只能在[正确,错误,简称,未知]三种选项中选择答案,回答格式[{“placeName”:“地名”,"回答":"答案"},{“placeName”:“地名”,"回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
不做过多的解释,严格按回答格式作答; |
|||
''' |
|||
# prompt=''' |
|||
# .请回答以上问题, |
|||
# ,回答格式[{“placeName”:"原文","回答":"答案"},{“placeName”:"原文","回答":"答案"}],不做过多的解释,严格按回答格式作答; |
|||
# 不做过多的解释,严格按回答格式作答; |
|||
# ''' |
|||
llm_cfg = { |
|||
#'model': 'qwen1.5-72b-chat', |
|||
'model':"qwen2-72b", |
|||
'model_server': 'http://127.0.0.1:1025/v1', # base_url, also known as api_base |
|||
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13', |
|||
} |
|||
bot = Assistant(llm=llm_cfg, |
|||
name='Assistant', |
|||
# description='使用RAG检索并回答,支持文件类型:PDF/Word/PPT/TXT/HTML。' |
|||
) |
|||
#获取全文内容 |
|||
#获取全文内容
def getDocxToTextAll(name):
    """Extract every non-empty paragraph of a .docx file to "checkPlaceName.txt".

    Paragraphs are kept in document order, joined with newlines, and written
    UTF-8 encoded so the batched place-name check can read the text back.

    :param name: path of the .docx document to read
    """
    document = Document(name)
    # Only non-blank paragraphs (removed unused levelList/addStart/levelText/i).
    words = [paragraph.text for paragraph in document.paragraphs
             if paragraph.text.strip()]
    print("placeNameTask", len(words))
    text = '\n'.join(words)
    with open("checkPlaceName.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
|||
|
|||
#得到全文和地名有关的内容 |
|||
#得到全文和地名有关的内容
def placeNameTask(text):
    """Run NER over *text* and return de-duplicated org/region names.

    Consecutive entity tokens are merged so multi-token organisation names
    come back as one string.

    :param text: document text to tag
    :return: list of unique names, first-seen order preserved
    """
    res = tagTask(text)
    print(res)
    placeList = []
    merging = False
    for zuhe in res:
        label = zuhe[1]
        is_target = (label.find("组织机构类") >= 0
                     or label.find("世界地区类") >= 0)
        cleaned = zuhe[0].replace("\n", "")
        if merging and is_target:
            # Still inside the same entity: extend the previous name.
            placeList[-1] += cleaned
            continue
        if is_target:
            merging = True
            placeList.append(cleaned)
        else:
            merging = False
    # De-duplicate while keeping order.
    return list(dict.fromkeys(placeList))
|||
#主方法 |
|||
def checkPlaceName(filename):
    """Entry point: extract the document text, scan it batch by batch and
    return every place name the LLM judged to be wrong.

    :param filename: path of the .docx document to check
    :return: list of dicts describing the wrong place names
    """
    getDocxToTextAll(filename)
    start_time = time.time()
    error_places = []
    # Stream the dumped text in batches to keep each LLM prompt bounded.
    for batch in read_file_in_batches('checkPlaceName.txt'):
        batch_errors = process_batch(batch)
        if len(batch_errors) > 0:
            error_places.extend(batch_errors)
    pprint(error_places)
    elapsed_time = time.time() - start_time
    print(f"checkPlaceName程序执行时间: {elapsed_time} 秒")
    return error_places
|||
|
|||
def read_file_in_batches(file_path, batch_size=5000):
    """
    Stream a text file in batches.

    :param file_path: path of the file to read
    :param batch_size: flush a batch once it holds at least this many characters
    :return: generator yielding one batch of text at a time
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        buffered_lines = []
        buffered_chars = 0
        for line in file:
            buffered_lines.append(line)
            buffered_chars += len(line)
            # Whole lines only: a batch is emitted as soon as the threshold
            # is reached, so lines are never split across batches.
            if buffered_chars >= batch_size:
                yield ''.join(buffered_lines)
                buffered_lines = []
                buffered_chars = 0
        # Emit whatever is left after the last full batch.
        if buffered_lines:
            yield ''.join(buffered_lines)
|||
|
|||
def process_batch(batch):
    """
    Process one batch of text: extract candidate place names, ask the LLM to
    judge them, and return the names marked wrong together with the original
    line they came from.

    :param batch: one batch of text
    :return: list of dicts like {"placeName": ..., "回答": "错误", "yuanwen": ...}
    """
    propnList = placeNameTask(batch)
    propnStr = ",".join(propnList)
    print("placeNameTask", propnStr)
    messages = [{'role': 'user', 'content': [{'text': propnStr + prompt}]}]
    runList = []
    for rsp in bot.run(messages):
        runList.append(rsp)
    # The last streamed response holds the complete answer.
    data = runList[-1][0]["content"]
    print("placeNameTask", data)
    # The model may wrap its output in markdown fences; strip backticks and
    # let json_repair fix any remaining malformed JSON.
    parsed_data = json_repair.loads(data.replace('`', ''))

    for item in parsed_data:
        print(f"地名: {item['placeName']}, 回答: {item['回答']}")

    # Keep only the names the model judged to be wrong.
    error_places = [place for place in parsed_data if place['回答'] == '错误']
    print("placeNameTask", error_places)
    for t in error_places:
        keyword = t['placeName']
        # Find a line containing the keyword.  Unlike the previous
        # r'.*?kw.*?\n' pattern this also matches the final line of the
        # batch, which may not end with a newline ('.' stops at '\n').
        paragraphs = re.findall(r'.*' + re.escape(keyword) + r'.*', batch)
        # Guard against a missing match so we can't raise IndexError here.
        t["yuanwen"] = paragraphs[0] if paragraphs else ""
    # The original returned error_places from both branches of an if/else;
    # a single return is equivalent.
    return error_places
@ -0,0 +1,160 @@ |
|||
import uuid |
|||
from langchain_chroma import Chroma |
|||
from langchain_community.embeddings import DashScopeEmbeddings |
|||
from langchain_community.document_loaders import TextLoader |
|||
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|||
|
|||
from paddlenlp import Taskflow |
|||
# PaddleNLP pairwise text-similarity pipeline; truncation is enabled and
# max_length raised so long paragraph pairs do not overflow the model.
similarity = Taskflow("text_similarity" , truncation=True,max_length=102400)
# SECURITY NOTE(review): the DashScope API key is hard-coded here — it should
# be loaded from an environment variable / secret store, and this key rotated.
embeddings = DashScopeEmbeddings(dashscope_api_key="sk-ea89cf04431645b185990b8af8c9bb13")
# Directory where the Chroma vector store persists its index.
vector_store_path="vector_store"
vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
|||
import re |
|||
import time |
|||
from docx import Document |
|||
|
|||
# 记录程序开始的时间戳 |
|||
def getOutlineLevel(inputXml):
    """
    Extract the number from the first <w:outlineLvl w:val="number"/> tag in
    an XML fragment.

    :param inputXml: XML string of a paragraph or style element
    :return: the outline level as a string of digits
    """
    start_index = inputXml.find('<w:outlineLvl')
    end_index = inputXml.find('>', start_index)
    tag = inputXml[start_index:end_index + 1]
    # Raw string fixes the invalid-escape-sequence warning of "\d+".
    # NOTE(review): like the original, this assumes the tag is present and
    # contains digits; callers check for '<w:outlineLvl' before calling.
    return re.search(r"\d+", tag).group()
|||
|
|||
|
|||
def isTitle(paragraph):
    """
    Determine whether a paragraph carries an outline (heading) level.

    :param paragraph: a python-docx paragraph
    :return: None for plain body text; otherwise the outline level as a
             string ("0" = top-level heading, "1" = second level, ...)
    """
    # Blank paragraphs are never titles.
    if not paragraph.text.strip():
        return None

    # Case 1: the outline level is set directly in the paragraph's own XML.
    paragraphXml = paragraph._p.xml
    if '<w:outlineLvl' in paragraphXml:
        return getOutlineLevel(paragraphXml)

    # Case 2: the outline level comes from the style chain — walk the style
    # and its base styles until one of them defines an outline level.
    style = paragraph.style
    while style is not None:
        styleXml = style.element.xml
        if '<w:outlineLvl' in styleXml:
            return getOutlineLevel(styleXml)
        style = style.base_style

    # No outline level on the paragraph or any style: plain body text.
    return None
|||
|
|||
#获取文档中 详细设计方案 章节的所有内容 |
|||
def getDocxToText(docxPath, titleName):
    """
    Extract the paragraphs of one chapter (the level-0 heading containing
    *titleName*) from a .docx file, write them to checkRepeatText.txt and
    index them in the Chroma vector store.

    :param docxPath: path of the .docx document
    :param titleName: chapter title to collect; falsy means "take everything"
    :return: (words, uuids) — the numbered paragraphs and the ids of the
             chunks added to the vector store
    :raises Exception: when no matching content is found
    """
    document = Document(docxPath)
    levelList = []
    words = []
    addStart = False
    i = 0
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():  # skip empty paragraphs
            continue
        print("非空")
        if titleName:
            level = isTitle(paragraph)
            # A new top-level heading ends the chapter we were collecting.
            if addStart and level == "0":
                addStart = False
            # The top-level heading containing the wanted title starts it.
            if level == "0" and text.find(titleName) >= 0:
                addStart = True
            if level:
                levelList.append("{}:".format(level) + paragraph.text)
            else:
                if addStart:
                    # Figure captions and notes are not checked for repeats.
                    if text.startswith("图") or text.startswith("注:"):
                        continue
                    i = i + 1
                    words.append("第{}个段落:".format(i) + text)
        else:
            words.append(text)

    print("checkRepeatText", len(words))
    if len(words) == 0:
        # Was: Exception("I know python!") — replaced with an actionable message.
        raise Exception("未找到匹配的章节内容 (no matching chapter content found)")
    text = '\n'.join(words)

    # Write with an explicit encoding: the original used the platform default,
    # which breaks for Chinese text on non-UTF-8 systems and disagrees with
    # the UTF-8 readers used elsewhere in this project.
    with open("checkRepeatText.txt", 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)
    time.sleep(3)
    loader = TextLoader(file_path='checkRepeatText.txt')
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10, add_start_index=True,
                                                   separators=["\n\n", "\n"])
    splits = text_splitter.split_documents(docs)
    # One fresh id per chunk so this run's entries can be deleted afterwards.
    uuids = [str(uuid.uuid4()) for _ in range(len(splits))]
    print(len(splits))
    print(len(uuids))
    vectorstore = Chroma(persist_directory=vector_store_path, embedding_function=embeddings)
    vectorstore.add_documents(documents=splits, ids=uuids)
    # Poll until the newly added chunks become searchable.
    while True:
        time.sleep(0.3)
        ress = vectorstore.similarity_search(words[0])
        if len(ress) > 0:
            break
    return words, uuids
|||
|
|||
|
|||
# @app.route('/checkRepeatText/<filename>', methods=['GET']) |
|||
def checkRepeatText(filename,titleName):
    """Find near-duplicate paragraphs in one chapter of a .docx file.

    Indexes the chapter in the vector store, then for every paragraph runs a
    similarity search and records pairs whose text_similarity score exceeds
    0.95.  The vector-store entries created for this run are always deleted
    afterwards.

    :param filename: path of the .docx document
    :param titleName: chapter title, passed through to getDocxToText()
    :return: list of {"yuanwen1": ..., "yuanwen2": ...} duplicate pairs
    """
    words,uuids=getDocxToText(filename,titleName)
    try:
        # Collected duplicate pairs; count tracks processed paragraphs.
        reslist = []
        count = 0
        for i in words:
            count += 1
            # Nearest neighbours of this paragraph in the vector store.
            result = vectorstore.similarity_search(i)
            # Leading "第N个段落" tag identifies the paragraph itself.
            textTag = i.split(":")[0]
            print(i)
            for content in result:
                text = content.page_content
                tag = text.split(":")[0].replace('\n', '')
                # Skip the paragraph matching itself.
                if (textTag.find(tag) >= 0):
                    continue
                # Score only the text after the paragraph tag.
                res = similarity([[i[i.find(':') + 1:], text[text.find(':') + 1:]]])
                print(res[0]["similarity"])
                if (res[0]["similarity"] > 0.95):
                    # Avoid recording the same paragraph pair twice.
                    if (len(reslist) > 0):
                        isExist = False
                        for neirong in reslist:
                            if i[i.find(':') + 1:] in neirong.values():
                                isExist = True
                                break
                        if not isExist:
                            reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:]})
                            print(reslist)
                    else:
                        reslist.append({"yuanwen1":i[i.find(':') + 1:],"yuanwen2":text[text.find(':') + 1:]})
                    print(i.split(":")[1] + "\n" + text.split(":")[1])
    except Exception as e:
        print("发生异常:",e)
    finally:
        # Always clean up the chunks this run added to the vector store.
        vectorstore.delete(ids=uuids)
        print("已删除")
    print(reslist)
    return reslist
@ -0,0 +1,712 @@ |
|||
""" |
|||
This module will parse the JSON file following the BNF definition: |
|||
|
|||
<json> ::= <container> |
|||
|
|||
<primitive> ::= <number> | <string> | <boolean> |
|||
; Where: |
|||
; <number> is a valid real number expressed in one of a number of given formats |
|||
; <string> is a string of valid characters enclosed in quotes |
|||
; <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted) |
|||
|
|||
<container> ::= <object> | <array> |
|||
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas |
|||
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members' |
|||
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value |
|||
|
|||
If something is wrong (a missing parenthesis or quote, for example) it will use a few simple heuristics to fix the JSON string:
|||
- Add the missing parentheses if the parser believes that the array or object should be closed |
|||
- Quote strings or add missing single quotes |
|||
- Adjust whitespaces and remove line breaks |
|||
|
|||
All supported use cases are in the unit tests |
|||
""" |
|||
|
|||
import os |
|||
import json |
|||
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal |
|||
|
|||
|
|||
class StringFileWrapper:
    """Adapter that lets a text file descriptor be indexed like a string.

    Only the pieces of the str protocol the parser needs are implemented:
    single-character access, slicing, and len().
    """

    def __init__(self, fd: TextIO) -> None:
        self.fd = fd
        # Cached file length; computed lazily on the first len() call.
        self.length: int = 0

    def __getitem__(self, index: Union[int, slice]) -> str:
        if not isinstance(index, slice):
            # Single character at an absolute offset.
            self.fd.seek(index)
            return self.fd.read(1)
        # Slice: read stop-start characters from start, then rewind to start
        # (same cursor behavior as the original implementation).
        self.fd.seek(index.start)
        chunk = self.fd.read(index.stop - index.start)
        self.fd.seek(index.start)
        return chunk

    def __len__(self) -> int:
        if self.length < 1:
            # Measure once by seeking to the end, then restore the cursor.
            saved = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(saved)
        return self.length
|||
|
|||
|
|||
class LoggerConfig:
    """Holds the repair log, the context window size, and the active level."""

    def __init__(self, log_level: Optional[str]):
        self.log: List[Dict[str, str]] = []   # accumulated repair-log entries
        self.window: int = 10                 # characters of context per entry
        # "none" disables logging entirely.
        self.log_level: str = log_level or "none"
|||
|
|||
|
|||
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None] |
|||
|
|||
|
|||
class JSONParser:
    """Lenient JSON parser that repairs common LLM output mistakes.

    Walks the input character by character (``self.index``) and fixes missing
    quotes/brackets, doubled quotes, stray literals and misplaced delimiters.
    ``self.context`` is a stack recording whether parsing is currently inside
    an object key, an object value, or an array; it drives the quote-repair
    heuristics in parse_string().
    """

    def __init__(
        self,
        json_str: Union[str, StringFileWrapper],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
    ) -> None:
        # The string to parse
        self.json_str = json_str
        # Alternatively, the file description with a json file in it
        if json_fd:
            # This is a trick we do to treat the file wrapper as an array
            self.json_str = StringFileWrapper(json_fd)
        # Index is our iterator that will keep track of which character we are looking at right now
        self.index: int = 0
        # This is used in the object member parsing to manage the special cases of missing quotes in key or value
        self.context: list[str] = []
        # Use this to log the activity, but only if logging is active
        self.logger = LoggerConfig(log_level="info" if logging else None)

    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """Parse the whole input; returns (value, log) when logging is on."""
        json = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            json = [json]
            last_index = self.index
            while self.index < len(self.json_str):
                j = self.parse_json()
                if j != "":
                    json.append(j)
                if self.index == last_index:
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array
            if len(json) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                json = json[0]
        if self.logger.log_level == "none":
            return json
        else:
            return json, self.logger.log

    def parse_json(
        self,
    ) -> JSONReturnType:
        """Dispatch on the next character to the matching parse_* routine."""
        while True:
            char = self.get_char_at()
            # This parser will ignore any basic element (string or number) that is not inside an array or object
            is_in_context = len(self.context) > 0
            # False means that we are at the end of the string provided
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # there can be an edge case in which a key is empty and at the end of an object
            # like "key": }. We return an empty string here to close the object properly
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote
            elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
                return self.parse_string()
            # <number> starts with [0-9] or minus
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # If everything else fails, we just ignore and move on
            else:
                self.index += 1

    def parse_object(self) -> Dict[str, Any]:
        """Parse one object, repairing missing ':' and stray delimiters."""
        # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
        obj = {}
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "}") != "}":
            # This is what we expect to find:
            # <member> ::= <string> ': ' <json>

            # Skip filler whitespaces
            self.skip_whitespaces_at()

            # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                    "info",
                )
                self.index += 1

            # We are now searching for they string key
            # Context is used in the string parser to manage the lack of quotes
            self.set_context("object_key")

            self.skip_whitespaces_at()

            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())

                if key != "" or (key == "" and self.get_char_at() == ":"):
                    # If the string is empty but there is a object divider, we are done here
                    break

            self.skip_whitespaces_at()

            # We reached the end here
            if (self.get_char_at() or "}") == "}":
                continue

            self.skip_whitespaces_at()

            # An extreme case of missing ":" after a key
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                    "info",
                )

            self.index += 1
            self.reset_context()
            self.set_context("object_value")
            # The value can be any valid json
            value = self.parse_json()

            # Reset context since our job is done
            self.reset_context()
            obj[key] = value

            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1

            # Remove trailing spaces
            self.skip_whitespaces_at()

        self.index += 1
        return obj

    def parse_array(self) -> List[Any]:
        """Parse one array, repairing a missing closing ']' and stray '...'."""
        # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
        arr = []
        self.set_context("array")
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()

            # It is possible that parse_json() returns nothing valid, so we stop
            if value == "":
                break

            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it", "info"
                )
            else:
                arr.append(value)

            # skip over whitespace after a value but before closing ]
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()

        # Especially at the end of an LLM generated json you might miss the last "]"
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back", "info"
            )
            self.index -= 1

        self.index += 1
        self.reset_context()
        return arr

    def parse_string(self) -> Union[str, bool, None]:
        """Parse one string (or literal), repairing missing/doubled quotes."""
        # <string> is a string of valid characters enclosed in quotes
        # i.e. { name: "John" }
        # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here

        # Flag to manage corner cases related to missing starting quote
        missing_quotes = False
        doubled_quotes = False
        lstring_delimiter = rstring_delimiter = '"'

        char = self.get_char_at()
        # A valid string can only start with a valid quote or, in our case, with a literal
        while char and char not in ['"', "'", "“"] and not char.isalnum():
            self.index += 1
            char = self.get_char_at()

        if not char:
            # This is an empty string
            return ""

        # Ensuring we use the right delimiter
        if char == "'":
            lstring_delimiter = rstring_delimiter = "'"
        elif char == "“":
            lstring_delimiter = "“"
            rstring_delimiter = "”"
        elif char.isalnum():
            # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
            # But remember, object keys are only of type string
            if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
                value = self.parse_boolean_or_null()
                if value != "":
                    return value
            self.log(
                "While parsing a string, we found a literal instead of a quote",
                "info",
            )
            self.log(
                "While parsing a string, we found no starting quote. Will add the quote back",
                "info",
            )
            missing_quotes = True

        if not missing_quotes:
            self.index += 1

        # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
        if self.get_char_at() == lstring_delimiter:
            # If it's an empty key, this was easy
            if self.get_context() == "object_key" and self.get_char_at(1) == ":":
                self.index += 1
                return ""
            # Find the next delimiter
            i = 1
            next_c = self.get_char_at(i)
            while next_c and next_c != rstring_delimiter:
                i += 1
                next_c = self.get_char_at(i)
            # Now check that the next character is also a delimiter to ensure that we have ""....""
            # In that case we ignore this rstring delimiter
            if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a valid starting doubled quote, ignoring it",
                    "info",
                )
                doubled_quotes = True
                self.index += 1
            else:
                # Ok this is not a doubled quote, check if this is an empty string or not
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c.isspace():
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c not in [",", "]", "}"]:
                    self.log(
                        "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                        "info",
                    )
                    self.index += 1

        # Initialize our return value
        string_acc = ""

        # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
        # In that case we need to use the ":|,|}" characters as terminators of the string
        # So this will stop if:
        # * It finds a closing quote
        # * It iterated over the entire sequence
        # * If we are fixing missing quotes in an object, when it finds the special terminators
        char = self.get_char_at()
        while char and char != rstring_delimiter:
            if missing_quotes:
                if self.get_context() == "object_key" and (
                    char == ":" or char.isspace()
                ):
                    self.log(
                        "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                        "info",
                    )
                    break
                elif self.get_context() == "object_value" and char in [",", "}"]:
                    rstring_delimiter_missing = True
                    # check if this is a case in which the closing comma is NOT missing instead
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c != rstring_delimiter:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        i += 1
                        next_c = self.get_char_at(i)
                        # found a delimiter, now we need to check that is followed strictly by a comma or brace
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c and next_c in [",", "}"]:
                            rstring_delimiter_missing = False
                    if rstring_delimiter_missing:
                        self.log(
                            "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                            "info",
                        )
                        break
            string_acc += char
            self.index += 1
            char = self.get_char_at()
            if char and len(string_acc) > 0 and string_acc[-1] == "\\":
                # This is a special case, if people use real strings this might happen
                self.log("Found a stray escape sequence, normalizing it", "info")
                string_acc = string_acc[:-1]
                if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                    escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                    string_acc += escape_seqs.get(char, char) or char
                    self.index += 1
                    char = self.get_char_at()
            # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
            if char == rstring_delimiter:
                # Special case here, in case of double quotes one after another
                if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                    self.log(
                        "While parsing a string, we found a doubled quote, ignoring it",
                        "info",
                    )
                    self.index += 1
                elif missing_quotes and self.get_context() == "object_value":
                    # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        # We found a quote, now let's make sure there's a ":" following
                        i += 1
                        next_c = self.get_char_at(i)
                        # found a delimiter, now we need to check that is followed strictly by a comma or brace
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c and next_c == ":":
                            # Reset the cursor
                            self.index -= 1
                            char = self.get_char_at()
                            self.log(
                                "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                                "info",
                            )
                            break
                else:
                    # Check if eventually there is a rstring delimiter, otherwise we bail
                    i = 1
                    next_c = self.get_char_at(i)
                    check_comma_in_object_value = True
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                        # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                        if check_comma_in_object_value and next_c.isalpha():
                            check_comma_in_object_value = False
                        # If we are in an object context, let's check for the right delimiters
                        if (
                            ("object_key" in self.context and next_c in [":", "}"])
                            or ("object_value" in self.context and next_c == "}")
                            or ("array" in self.context and next_c in ["]", ","])
                            or (
                                check_comma_in_object_value
                                and self.get_context() == "object_value"
                                and next_c == ","
                            )
                        ):
                            break
                        i += 1
                        next_c = self.get_char_at(i)
                    # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                    if next_c == "," and self.get_context() == "object_value":
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != rstring_delimiter:
                            i += 1
                            next_c = self.get_char_at(i)
                        # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c == "}":
                            # OK this is valid then
                            self.log(
                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                                "info",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()
                    elif next_c == rstring_delimiter:
                        if self.get_context() == "object_value":
                            # But this might not be it! This could be just a missing comma
                            # We found a delimiter and we need to check if this is a key
                            # so find a rstring_delimiter and a colon after
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != rstring_delimiter:
                                i += 1
                                next_c = self.get_char_at(i)
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != ":":
                                if next_c in [
                                    lstring_delimiter,
                                    rstring_delimiter,
                                    ",",
                                ]:
                                    break
                                i += 1
                                next_c = self.get_char_at(i)
                            # Only if we fail to find a ':' then we know this is misplaced quote
                            if next_c != ":":
                                self.log(
                                    "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                    "info",
                                )
                                string_acc += str(char)
                                self.index += 1
                                char = self.get_char_at()

        if (
            char
            and missing_quotes
            and self.get_context() == "object_key"
            and char.isspace()
        ):
            self.log(
                "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
                "info",
            )
            self.skip_whitespaces_at()
            if self.get_char_at() not in [":", ","]:
                return ""

        # A fallout of the previous special case in the while loop,
        # we need to update the index only if we had a closing quote
        if char != rstring_delimiter:
            self.log(
                "While parsing a string, we missed the closing quote, ignoring",
                "info",
            )
        else:
            self.index += 1

        return string_acc.rstrip()

    def parse_number(self) -> Union[float, int, str, JSONReturnType]:
        """Parse one number; comma-containing sequences are kept as strings."""
        # <number> is a valid real number expressed in one of a number of given formats
        number_str = ""
        number_chars = set("0123456789-.eE/,")
        char = self.get_char_at()
        is_array = self.get_context() == "array"
        while char and char in number_chars and (char != "," or not is_array):
            number_str += char
            self.index += 1
            char = self.get_char_at()
        if len(number_str) > 1 and number_str[-1] in "-eE/,":
            # The number ends with a non valid character for a number/currency, rolling back one
            number_str = number_str[:-1]
            self.index -= 1
        try:
            if "," in number_str:
                return str(number_str)
            if "." in number_str or "e" in number_str or "E" in number_str:
                return float(number_str)
            elif number_str == "-":
                # If there is a stray "-" this will throw an exception, throw away this character
                return self.parse_json()
            else:
                return int(number_str)
        except ValueError:
            return number_str

    def parse_boolean_or_null(self) -> Union[bool, str, None]:
        """Parse 'true'/'false'/'null'; returns "" and rewinds on failure."""
        # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
        starting_index = self.index
        char = (self.get_char_at() or "").lower()
        # NOTE(review): `value` is only annotated (not assigned) here; callers
        # invoke this method only when char is 't'/'f'/'n', so one of the
        # branches below always assigns before `if value:` — confirm if a new
        # call site is ever added.
        value: Optional[Tuple[str, Optional[bool]]]
        if char == "t":
            value = ("true", True)
        elif char == "f":
            value = ("false", False)
        elif char == "n":
            value = ("null", None)

        if value:
            i = 0
            while char and i < len(value[0]) and char == value[0][i]:
                i += 1
                self.index += 1
                char = (self.get_char_at() or "").lower()
            if i == len(value[0]):
                return value[1]

        # If nothing works reset the index before returning
        self.index = starting_index
        return ""

    def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
        """Character at index+count, or False past the end of the input."""
        # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
        try:
            return self.json_str[self.index + count]
        except IndexError:
            return False

    def skip_whitespaces_at(self) -> None:
        """
        This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
        """
        try:
            char = self.json_str[self.index]
        except IndexError:
            return
        while char.isspace():
            self.index += 1
            try:
                char = self.json_str[self.index]
            except IndexError:
                return

    def set_context(self, value: str) -> None:
        # If a value is provided update the context variable and save in stack
        if value:
            self.context.append(value)

    def reset_context(self) -> None:
        # Pop the most recent parsing context off the stack.
        self.context.pop()

    def get_context(self) -> str:
        # The currently active parsing context (top of the stack).
        return self.context[-1]

    def log(self, text: str, level: str) -> None:
        # Record a repair action together with a window of surrounding input.
        if level == self.logger.log_level:
            context = ""
            start = max(self.index - self.logger.window, 0)
            end = min(self.index + self.logger.window, len(self.json_str))
            context = self.json_str[start:end]
            self.logger.log.append(
                {
                    "text": text,
                    "context": context,
                }
            )
|||
|
|||
|
|||
def repair_json(
    json_str: str = "",
    return_objects: bool = False,
    skip_json_loads: bool = False,
    logging: bool = False,
    json_fd: Optional[TextIO] = None,
    ensure_ascii: bool = True,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
    It will return the fixed string by default.
    When `return_objects=True` is passed, it will return the decoded data structure instead.
    When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
    When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
    """
    repairing_parser = JSONParser(json_str, json_fd, logging)
    if skip_json_loads:
        parsed_json = repairing_parser.parse()
    else:
        # Fast path: try the stdlib decoder first; repair only on failure.
        try:
            parsed_json = json.load(json_fd) if json_fd else json.loads(json_str)
        except json.JSONDecodeError:
            parsed_json = repairing_parser.parse()
    # Returning the decoded object lets this library stand in for `json`.
    if return_objects or logging:
        return parsed_json
    return json.dumps(parsed_json, ensure_ascii=ensure_ascii)
|||
|
|||
|
|||
def loads(
    json_str: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Drop-in analogue of `json.loads()` that repairs invalid JSON.

    Delegates to `repair_json()` with `return_objects=True`, so the decoded
    data structure (not a string) is returned.
    """
    return repair_json(json_str=json_str, return_objects=True, skip_json_loads=skip_json_loads, logging=logging)
|||
|
|||
|
|||
def load(
    fd: TextIO, skip_json_loads: bool = False, logging: bool = False
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """Drop-in analogue of `json.load()` that repairs invalid JSON.

    Delegates to `repair_json()` with `json_fd=fd` and `return_objects=True`,
    so the decoded data structure (not a string) is returned.
    """
    return repair_json(json_fd=fd, return_objects=True, skip_json_loads=skip_json_loads, logging=logging)
|||
|
|||
|
|||
def from_file(
    filename: str,
    skip_json_loads: bool = False,
    logging: bool = False,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    This function is a wrapper around `load()` so you can pass the filename as string

    Uses a context manager so the file handle is always closed, even when
    `load()` raises — the previous open()/close() pair leaked the handle on
    any exception between the two calls.
    """
    with open(filename) as fd:
        return load(fd, skip_json_loads, logging)
@ -0,0 +1,45 @@ |
|||
from flask import Flask, request, jsonify |
|||
import os |
|||
from checkPlaceName import checkPlaceName |
|||
# from checkRepeatText import checkRepeatText |
|||
from checkCompanyName import checkCompanyName |
|||
from checkDocumentError import getDocumentError |
|||
app = Flask(__name__)

# Directory where uploaded documents are stored, relative to the working dir.
UPLOAD_FOLDER = 'uploads'
# exist_ok=True creates the directory atomically with respect to existence,
# avoiding the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|||
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a single multipart-form file (field name 'file') and save it
    into UPLOAD_FOLDER. Returns JSON with HTTP 400 on a missing/empty file
    and HTTP 200 on success."""
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400
    if file:
        # Strip any directory components from the client-supplied name so a
        # crafted filename (e.g. "../../etc/passwd") cannot escape the
        # upload directory (path traversal).
        filename = os.path.basename(file.filename)
        file.save(os.path.join(UPLOAD_FOLDER, filename))
        return jsonify({"message": "File uploaded successfully"}), 200
|||
|
|||
@app.route('/getDocumentError', methods=['GET'])
def getDocumentErrorWeb():
    """HTTP wrapper: run the document-error check on ?filename=<name>."""
    return getDocumentError(request.args.get('filename'))
|||
@app.route('/checkPlaceName', methods=['GET'])
def checkPlaceNameWeb():
    """HTTP wrapper: run the place-name check on ?filename=<name>."""
    return checkPlaceName(request.args.get('filename'))
|||
@app.route('/checkRepeatText', methods=['GET'])
def checkRepeatTextWeb():
    # HTTP wrapper: check ?filename=<name> for repeated text within
    # ?sectionName=<section>.
    # NOTE(review): the `from checkRepeatText import checkRepeatText` import at
    # the top of this file is commented out, so this handler raises NameError
    # at request time — confirm whether the route should be disabled or the
    # import restored.
    filename = request.args.get('filename')
    sectionName=request.args.get('sectionName')
    return checkRepeatText(filename,sectionName)
|||
@app.route('/checkCompanyName', methods=['GET'])
def checkCompanyNameWeb():
    """HTTP wrapper: run the company-name check on ?filename=<name>."""
    return checkCompanyName(request.args.get('filename'))
|||
|
|||
@app.route('/test/<filename>', methods=['GET'])
def test(filename):
    """Echo the <filename> path segment back — a routing smoke check."""
    return filename
|||
if __name__ == '__main__':
    # Dev-server entry point: binds all interfaces on port 80 (requires
    # elevated privileges on most systems). Flask's built-in server is not
    # intended for production use.
    app.run(host="0.0.0.0",port=80)