You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
283 lines
13 KiB
283 lines
13 KiB
5 months ago
|
import re
|
||
|
import os
|
||
|
import docx
|
||
|
from docx.document import Document
|
||
|
from docx.text.paragraph import Paragraph
|
||
|
from docx.parts.image import ImagePart
|
||
|
from qwen_agent.agents import Assistant
|
||
|
|
||
|
from docx.oxml.table import CT_Tbl
|
||
|
from docx.oxml.text.paragraph import CT_P
|
||
|
|
||
|
import shutil
|
||
|
import re
|
||
|
import json_repair
|
||
|
import uuid
|
||
|
|
||
|
# 记录程序开始的时间戳
|
||
|
def getOutlineLevel(inputXml):
    """Extract the outline level from the first ``<w:outlineLvl ...>`` tag.

    Args:
        inputXml: XML text of a paragraph or style element.

    Returns:
        The first run of digits found inside the first ``<w:outlineLvl`` tag,
        as a string (e.g. ``"0"``), or ``None`` when the tag is absent, is not
        terminated by ``>``, or contains no digits.
    """
    start_index = inputXml.find('<w:outlineLvl')
    if start_index < 0:
        return None

    end_index = inputXml.find('>', start_index)
    if end_index < 0:
        return None

    # Only look at the tag itself so digits elsewhere in the XML are ignored.
    tag = inputXml[start_index:end_index + 1]
    found = re.search(r"\d+", tag)
    return found.group() if found else None
|
||
|
|
||
|
|
||
|
def isTitle(paragraph):
    """Return the outline level configured for a paragraph, if any.

    The paragraph's own XML is checked first; if it carries no outline level,
    the paragraph style and then each ``base_style`` in turn are checked.

    Args:
        paragraph: object exposing ``text``, ``_p.xml`` and ``style``.

    Returns:
        ``None`` for a blank paragraph or when no ``<w:outlineLvl`` is found
        anywhere; otherwise the result of ``getOutlineLevel`` for the first
        XML that contains the tag (a digit string: ``"0"`` is a first-level
        heading, ``"1"`` a second-level heading, ``"2"`` a third-level one).
    """
    marker = '<w:outlineLvl'

    # Blank lines are never headings.
    if not paragraph.text.strip():
        return None

    # Outline level set directly on the paragraph.
    own_xml = paragraph._p.xml
    if marker in own_xml:
        return getOutlineLevel(own_xml)

    # Outline level inherited from the style or one of its ancestors.
    style = paragraph.style
    while style is not None:
        style_xml = style.element.xml
        if marker in style_xml:
            return getOutlineLevel(style_xml)
        style = style.base_style

    # Neither the paragraph nor its style chain declares an outline level.
    return None
|
||
|
|
||
|
|
||
|
# 该行只能有一个图片
|
||
|
def is_image(graph: Paragraph, doc: Document):
    """Tell whether a paragraph embeds at least one picture backed by an ImagePart.

    Args:
        graph: paragraph whose XML element is searched for ``pic:pic`` nodes.
        doc: document whose ``part.related_parts`` maps embed ids to parts.

    Returns:
        ``True`` as soon as one ``a:blip/@r:embed`` id resolves to an
        ``ImagePart``, ``False`` when none does.
    """
    # The generator is lazy, so related parts are looked up one id at a time
    # and the scan stops at the first image part.
    return any(
        isinstance(doc.part.related_parts[embed_id], ImagePart)
        for picture in graph._element.xpath('.//pic:pic')
        for embed_id in picture.xpath('.//a:blip/@r:embed')
    )
|
||
|
|
||
|
|
||
|
# 获取图片(该行只能有一个图片)
|
||
|
def get_ImagePart(graph: Paragraph, doc: Document):
    """Return the first ImagePart embedded in a paragraph.

    Args:
        graph: paragraph whose XML element is searched for ``pic:pic`` nodes.
        doc: document whose ``part.related_parts`` maps embed ids to parts.

    Returns:
        The first related part that is an ``ImagePart``, or ``None`` when the
        paragraph embeds no such part.
    """
    # Lazily resolve every embed id of every picture, in document order.
    related = (
        doc.part.related_parts[embed_id]
        for picture in graph._element.xpath('.//pic:pic')
        for embed_id in picture.xpath('.//a:blip/@r:embed')
    )
    return next((part for part in related if isinstance(part, ImagePart)), None)
|
||
|
#寻找标题名称
|
||
|
def _match_heading(name, headings):
    """Map the model's answer onto the exact text of a collected heading.

    ``saveImage`` compares the chapter name with ``==`` against the raw
    paragraph text, so an answer that only differs by surrounding whitespace
    or by the "一级标题:" label used in the prompt is mapped back to the real
    heading. An answer that matches nothing is returned unchanged.
    """
    if name in headings:
        return name
    label = "一级标题:"
    wanted = name.strip()
    if wanted.startswith(label):
        wanted = wanted[len(label):].strip()
    for heading in headings:
        if heading.strip() == wanted:
            return heading
    return name


def findTitleName(docxPath):
    """Ask the LLM which first-level heading holds the (detailed) design plan.

    Generator. First-level headings (outline level ``"0"``) are collected from
    the document, headings containing "附件" are skipped, and the resulting
    outline is sent to the model together with the selection prompt.

    Args:
        docxPath: path of the .docx file to inspect.

    Yields:
        First a progress message, then exactly one more item: the chosen
        heading text, or the "not found" status message when the document has
        no usable first-level heading, the model returns nothing, or the reply
        does not name an existing chapter.
    """
    not_found = "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"
    yield '文档图片信息检查----检查是否存在详细设计方案'

    document = docx.Document(docxPath)

    # Collect the text of every first-level heading, skipping attachments.
    headings = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():
            continue
        if isTitle(paragraph) == "0" and "附件" not in text:
            headings.append(text)

    # Without any heading there is nothing for the model to choose from.
    if not headings:
        yield not_found
        return

    findTitleName_llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        # Read the key from the environment instead of embedding the secret in
        # source. NOTE: the key previously committed here should be treated as
        # exposed and rotated.
        'api_key': os.environ.get('DASHSCOPE_API_KEY', ''),
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, name='Assistant')

    prompt = (
        '\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容\n'
        '类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择\n'
        '类似设计方案,服务方案,建设方案为次相关,次级选择\n'
        '类似方案是最后选择\n'
        '按照这样的顺序选择最合适的\n'
        '你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}'
        '或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答\n'
    )
    outline = "\n".join("一级标题:" + heading for heading in headings)
    messages = [{'role': 'user', 'content': outline + prompt}]

    # Only the final response is used; earlier ones are discarded.
    last_response = None
    for last_response in findTitleName_bot.run(messages):
        pass
    if not last_response:
        yield not_found
        return

    data = last_response[0]["content"]
    # Backticks are stripped so a fenced code block does not confuse parsing.
    parsed_data = json_repair.loads(str(data).replace('`', ''))
    print(parsed_data)

    # The model may answer with a list or with plain text; normalise to a dict.
    if isinstance(parsed_data, list):
        parsed_data = next(
            (item for item in parsed_data if isinstance(item, dict)), {})
    if not isinstance(parsed_data, dict):
        parsed_data = {}

    name = parsed_data.get("name")
    if (parsed_data.get("answer") == "存在"
            and isinstance(name, str) and name.strip()):
        name = _match_heading(name, headings)
        print("存在", name)
        yield name
    else:
        print("不存在", name)
        yield not_found
|
||
|
def saveImage(fileName, titleName, imagePath):
    """Save the images found under one first-level chapter of a .docx file.

    Paragraphs are walked in order. Paragraphs with text update the current
    first-level chapter and the label of the most recent heading; paragraphs
    without text are checked for an embedded image, which is written to
    ``imagePath`` when the current chapter equals ``titleName``.

    Args:
        fileName: path of the .docx file to read.
        titleName: exact text of the first-level heading whose images are kept.
        imagePath: existing directory that receives the image files.

    Each file is named ``<level>级标题-<heading text>_<image part base name>``,
    with characters that are unsafe in file names replaced by ``_``.
    """
    current_chapter = ""
    # Initialised so an image that precedes every heading cannot hit an
    # unbound name.
    levelText = ""
    doc = docx.Document(fileName)

    for paragraph in doc.paragraphs:
        text = paragraph.text
        if text.strip():
            level = isTitle(paragraph)
            if level == "0":
                current_chapter = text
                print(text)
            if level:
                levelText = f"{int(level) + 1}级标题-" + text
            continue

        # No text: the paragraph may hold a picture (or belong to a table).
        if current_chapter != titleName:
            continue
        part = get_ImagePart(paragraph, doc)
        if part is None:
            continue

        raw_name = levelText + "_" + os.path.basename(part.partname)
        # Heading text may contain path separators or other characters that
        # are invalid in file names; replace them so open() cannot fail or
        # write outside imagePath.
        img_name = re.sub(r'[\\/:*?"<>|\r\n\t]+', "_", raw_name)
        with open(os.path.join(imagePath, img_name), "wb") as f:
            f.write(part.blob)
|
||
|
#保存完成后,上传大模型进行分析
|
||
|
def _last_reply_content(responses):
    """Return the content of the first message of the final response, or ''."""
    last = None
    for last in responses:
        pass
    if not last:
        return ""
    return last[0]["content"]


def _unrelated_texts(parsed):
    """Return the 'text' of every item the model judged unrelated ("无关").

    Tolerates replies that are a single dict, or not a list at all, instead of
    the requested list of ``{"text": ..., "answer": ...}`` objects.
    """
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []
    return [
        str(item.get("text", ""))
        for item in parsed
        if isinstance(item, dict) and item.get("answer") == "无关"
    ]


def checkImageText(filename):
    """Check that the content of the design-plan images is described in the document.

    Generator. The chapter is chosen by ``findTitleName``; its images are
    exported by ``saveImage`` into a temporary ``Image<uuid>`` directory, each
    image is described by the vision model, and the text model is asked
    whether each extracted item relates to the document.

    Args:
        filename: path of the .docx file to check.

    Yields:
        Every message produced by ``findTitleName`` (status messages and the
        chapter title), then, only if a chapter was found, a parsing-status
        message, one progress message per image, and finally either an HTML
        report of the unrelated items per image or the "图文符合要求" message.
    """
    status_prefix = "文档图片信息检查----"

    # Relay everything findTitleName yields. Status messages all start with
    # the prefix above; only a message without it is a chapter title, so the
    # initial progress message is not processed as if it were a chapter.
    titleName = None
    for message in findTitleName(filename):
        yield message
        if not message.startswith(status_prefix):
            titleName = message
    if titleName is None:
        return

    # Read the key from the environment instead of embedding the secret in
    # source. NOTE: the key previously committed here should be treated as
    # exposed and rotated.
    api_key = os.environ.get('DASHSCOPE_API_KEY', '')
    llm_cfg_vl = {
        'model': "qwen-vl-max",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': api_key,
    }
    botImage = Assistant(llm=llm_cfg_vl, name='Assistant')
    llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': api_key,
    }
    bot = Assistant(llm=llm_cfg, name='Assistant')

    extract_prompt = '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'
    judge_prompt = (
        '\n依次上述内容是否与文档有关,你只能在[无关,有关]选项中选择答案,\n'
        '按照这样的格式回答[{“text”:“内容”,"answer":"答案"},{“text”:“内容”,"answer":"答案"}]'
        '不做过多的解释,严格按回答格式作答\n'
    )

    yield "文档图片信息检查----文档内容解析中"
    imagePath = "Image" + str(uuid.uuid4())
    os.mkdir(imagePath)
    try:
        saveImage(filename, titleName, imagePath)
        imagePathList = os.listdir(imagePath)
        resMap = {}  # image file name -> comma-joined unrelated texts

        for count, image in enumerate(imagePathList, start=1):
            yield f"文档图片信息检查---当前处理进度{count}/{len(imagePathList)}"
            # Use the real temporary directory, not the literal "imagePath".
            outpath = os.path.join(imagePath, image)
            print(outpath)

            # Step 1: let the vision model transcribe the image.
            messagesImage = [{
                'role': 'user',
                "content": [{"image": outpath}, {"text": extract_prompt}],
            }]
            data = _last_reply_content(botImage.run(messagesImage))
            print(str(data))
            if not data:
                # Nothing was extracted, so there is nothing to verify.
                continue

            # Step 2: ask the text model whether each item relates to the file.
            messages = [{
                'role': 'user',
                'content': [{'text': str(data) + judge_prompt},
                            {"file": filename}],
            }]
            textdata = _last_reply_content(bot.run(messages))
            print(textdata)
            parsed_data = json_repair.loads(str(textdata))
            print(parsed_data)

            unrelated = _unrelated_texts(parsed_data)
            for text in unrelated:
                print("无关", text)
            if unrelated:
                resMap[image] = ",".join(unrelated)

        if resMap:
            yield "".join(
                f"在{key}图片中,{value}以上内容在文档中未出现相关描述<br>"
                for key, value in resMap.items()
            )
        else:
            yield "文档图片信息检查----图文符合要求"
    finally:
        # Always remove the temporary image directory, even when a model call
        # fails or the consumer stops iterating early.
        shutil.rmtree(imagePath, ignore_errors=True)
|
||
|
if __name__ == "__main__":
    # Run the check on the sample document only when executed as a script, so
    # importing this module no longer triggers the whole pipeline.
    for i in checkImageText("1.docx"):
        print(i)
|
||
|
# import docx
|
||
|
# doc = docx.Document('1.docx')
|
||
|
# dict_rel = doc.part._rels # rels其实是个目录
|
||
|
# for rel in dict_rel:
|
||
|
# rel = dict_rel[rel]
|
||
|
# print("rel", rel.target_ref)
|
||
|
# if "image" in rel.target_ref:
|
||
|
# # create_dir(desc_path)
|
||
|
# img_name = re.findall("/(.*)", rel.target_ref)[0] # windos:/
|
||
|
# print("img_name", img_name)
|
||
|
# word_name = os.path.splitext("1.docx")[0]
|
||
|
# print("word_name", word_name)
|
||
|
# #检查文件路径分隔符(os.sep),并根据不同的操作系统(Windows或Unix/Linux)处理文件名。
|
||
|
# if os.sep in word_name:
|
||
|
# new_name = word_name.split('\\')[-1]
|
||
|
# else:
|
||
|
# new_name = word_name.split('/')[-1]
|
||
|
# img_name = f'{new_name}_{img_name}'
|
||
|
# print(img_name)
|
||
|
# desc_path='workspace'
|
||
|
# with open(f'{desc_path}/{img_name}', "wb") as f:
|
||
|
# f.write(rel.target_part.blob)
|
||
|
# #
|
||
|
# # # prompt='''
|
||
|
# # # .根据上述文本判断,是否为非泛化的公司或组织名称,你可以使用工具利用互联网查询,你只能在[非泛化的公司或组织名称,公益组织,统称,泛化名称,政府单位,机关单位,学校,委员单位]选项中选择答案,回答格式[{“placeName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
|
||
|
# # # '''
|
||
|
# llm_cfg_vl = {
|
||
|
# #'model': 'qwen1.5-72b-chat',qwen2-72b-instruct
|
||
|
# 'model':"qwen-vl-max",
|
||
|
# 'model_server': 'DashScope', # base_url, also known as api_base
|
||
|
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
|
||
|
# }
|
||
|
# botvl = Assistant(llm=llm_cfg_vl,
|
||
|
# name='Assistant',
|
||
|
# # system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"1_image4
|
||
|
# )
|
||
|
# messages = [{'role': 'user', "content": [{"image": "workspace/1.png"},{"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
|
||
|
# runList = []
|
||
|
# for rsp in botvl.run(messages):
|
||
|
# runList.append(rsp)
|
||
|
# print(rsp)
|
||
|
# data = runList[len(runList) - 1][0]["content"]
|
||
|
# print(str(data))
|
||
|
|