You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

282 lines
13 KiB

import re
import os
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.parts.image import ImagePart
from qwen_agent.agents import Assistant
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
import shutil
import re
import json_repair
import uuid
def getOutlineLevel(inputXml):
    """Extract the outline level number from a serialized XML fragment.

    The first ``<w:outlineLvl ...>`` tag in ``inputXml`` is located and the
    first run of digits inside that tag is returned.

    Args:
        inputXml: XML text (for example of a paragraph or a style element).

    Returns:
        The level as a string of digits (e.g. ``"0"``), or ``None`` when the
        text holds no ``<w:outlineLvl>`` tag or the tag carries no number.
    """
    # Restrict the digit search to the tag itself so numbers elsewhere in the
    # XML (spacing, ids, text runs) can never be mistaken for the level.
    tag = re.search(r'<w:outlineLvl[^>]*>', inputXml)
    if tag is None:
        return None
    digits = re.search(r'\d+', tag.group())
    return digits.group() if digits else None
def isTitle(paragraph):
    """Report the outline level configured for a paragraph, if any.

    The paragraph's own XML is checked first; when it carries no
    ``<w:outlineLvl`` tag, the paragraph's style and then each base style in
    turn are checked.

    Args:
        paragraph: Paragraph object exposing ``text``, ``_p.xml`` and ``style``.

    Returns:
        ``None`` for blank paragraphs and for paragraphs without any outline
        level; otherwise whatever ``getOutlineLevel`` extracts from the first
        XML that contains the tag. Callers in this file treat ``"0"`` as a
        level-1 heading, ``"1"`` as level-2 and ``"2"`` as level-3.
    """
    marker = '<w:outlineLvl'

    # Blank lines are never headings.
    if not paragraph.text.strip():
        return None

    # Outline level set directly on the paragraph.
    own_xml = paragraph._p.xml
    if marker in own_xml:
        return getOutlineLevel(own_xml)

    # Outline level inherited through the style chain.
    style = paragraph.style
    while style is not None:
        if marker in style.element.xml:
            return getOutlineLevel(style.element.xml)
        style = style.base_style

    # Neither the paragraph nor any of its styles defines a level.
    return None
def is_image(graph: Paragraph, doc: Document):
    """Tell whether a paragraph embeds at least one picture.

    Every ``r:embed`` id found on an ``a:blip`` inside a ``pic:pic`` element
    of the paragraph is resolved through ``doc.part.related_parts``; the
    answer is ``True`` as soon as one of them is an ``ImagePart``.

    NOTE(review): the original author's comment says a line is expected to
    hold only one picture.

    Args:
        graph: Paragraph to inspect.
        doc: Document that owns the paragraph.

    Returns:
        ``True`` when an embedded image part is found, otherwise ``False``.
    """
    embed_ids = (
        rel_id
        for picture in graph._element.xpath('.//pic:pic')
        for rel_id in picture.xpath('.//a:blip/@r:embed')
    )
    return any(
        isinstance(doc.part.related_parts[rel_id], ImagePart)
        for rel_id in embed_ids
    )
def get_ImagePart(graph: Paragraph, doc: Document):
    """Fetch the first embedded image part referenced by a paragraph.

    The ``r:embed`` ids of the paragraph's pictures are resolved through
    ``doc.part.related_parts`` in document order.

    NOTE(review): the original author's comment says a line is expected to
    hold only one picture; any further pictures are ignored here.

    Args:
        graph: Paragraph to inspect.
        doc: Document that owns the paragraph.

    Returns:
        The first related part that is an ``ImagePart``, or ``None`` when the
        paragraph references none.
    """
    embed_ids = (
        rel_id
        for picture in graph._element.xpath('.//pic:pic')
        for rel_id in picture.xpath('.//a:blip/@r:embed')
    )
    for rel_id in embed_ids:
        candidate = doc.part.related_parts[rel_id]
        if isinstance(candidate, ImagePart):
            return candidate
    return None
def findTitleName(docxPath):
    """Ask the LLM which level-1 heading holds the (detailed) design plan.

    This is a generator. It first yields a progress message, then exactly one
    result: either the heading name chosen by the model or the fixed
    "not found" message.

    Args:
        docxPath: Path of the .docx file to analyse.

    Yields:
        str: the progress message, followed by the selected level-1 heading
        text or the "not found" message.
    """
    not_found = "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"

    yield '文档图片信息检查----检查是否存在详细设计方案'
    document = docx.Document(docxPath)

    # Build the outline from level-1 headings only; headings mentioning
    # "附件" (attachments) are left out.
    titleWords = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():
            continue
        if isTitle(paragraph) != "0":
            continue
        if "附件" in text:
            continue
        titleWords.append("一级标题:" + text)

    # Without any level-1 heading there is nothing the model could pick.
    if not titleWords:
        yield not_found
        return

    findTitleName_llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        # SECURITY: prefer the DASHSCOPE_API_KEY environment variable. The
        # literal is kept only as a backward-compatible fallback; it is a
        # committed secret and should be revoked and removed.
        'api_key': os.environ.get('DASHSCOPE_API_KEY',
                                  'sk-ea89cf04431645b185990b8af8c9bb13'),
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, name='Assistant')
    prompt = '''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择
类似设计方案,服务方案,建设方案为次相关,次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答
'''
    messages = [{'role': 'user', 'content': "\n".join(titleWords) + prompt}]

    # The agent streams partial responses; only the final one is needed.
    last_response = None
    for last_response in findTitleName_bot.run(messages):
        pass
    if not last_response:
        yield not_found
        return

    data = last_response[0]["content"]
    # Strip Markdown code-fence backticks before repairing/parsing the JSON.
    parsed_data = json_repair.loads(data.replace('`', ''))
    print(parsed_data)

    # The model may answer with something that is not the requested object;
    # treat every malformed or nameless answer as "not found".
    name = parsed_data.get("name") if isinstance(parsed_data, dict) else None
    if name and parsed_data.get("answer") == "存在":
        print("存在", name)
        yield name
    else:
        print("不存在", name)
        yield not_found
def saveImage(fileName, titleName, imagePath):
    """Write the pictures found under one level-1 heading into a directory.

    The document is walked paragraph by paragraph while tracking the current
    level-1 heading and the most recent heading of any level. For each
    non-heading paragraph located under the level-1 heading whose text equals
    ``titleName``, the first embedded image is written to ``imagePath`` as
    ``"<N>级标题-<heading text>_<image file name>"``.

    Args:
        fileName: Path of the .docx file to read.
        titleName: Text of the level-1 heading whose images are exported.
        imagePath: Existing directory that receives the image files.
    """
    fristName = ""
    # Initialised so a picture that precedes every heading cannot hit an
    # unbound variable (possible when titleName is an empty string).
    levelText = ""
    doc = docx.Document(fileName)
    for paragraph in doc.paragraphs:
        text = paragraph.text
        # isTitle returns None for blank paragraphs, so image-only paragraphs
        # fall through to the picture check below instead of being skipped.
        level = isTitle(paragraph)
        if level is not None:
            if level == "0":
                fristName = text
                print(text)
            levelText = f"{int(level) + 1}级标题-" + text
            continue

        if fristName != titleName:
            continue
        part = get_ImagePart(paragraph, doc)
        if part is None:
            continue

        img_name = levelText + "_" + os.path.basename(part.partname)
        # Heading text may hold characters that are illegal in file names or
        # that would be read as path separators.
        img_name = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", img_name)
        with open(os.path.join(imagePath, img_name), "wb") as f:
            f.write(part.blob)
def checkImageText(filename):
    """Check that information shown in a document's pictures is described in its text.

    This is a generator of status/result strings. It relays the messages of
    ``findTitleName``; when a design-plan heading was found it exports that
    chapter's images to a temporary directory, asks a vision model to extract
    the information in each image, asks a text model (with the document
    attached) whether each extracted item relates to the document, and
    finally yields either a summary of the unrelated items per image or a
    message that everything matches. The temporary directory is always
    removed.

    Args:
        filename: Path of the .docx file to check.

    Yields:
        str: progress messages followed by the final result message.
    """
    not_found = "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"
    # SECURITY: prefer the DASHSCOPE_API_KEY environment variable. The literal
    # is kept only as a backward-compatible fallback; it is a committed secret
    # and should be revoked and removed.
    api_key = os.environ.get('DASHSCOPE_API_KEY',
                             'sk-ea89cf04431645b185990b8af8c9bb13')
    llm_cfg_vl = {
        'model': "qwen-vl-max",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': api_key,
    }
    botImage = Assistant(llm=llm_cfg_vl, name='Assistant')
    llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': api_key,
    }
    bot = Assistant(llm=llm_cfg, name='Assistant')

    # findTitleName yields a progress message first and its result last, so
    # only the LAST value may be interpreted as the heading name.
    titleName = not_found
    for titleName in findTitleName(filename):
        yield titleName
    if titleName == not_found:
        return

    yield "文档图片信息检查----文档内容解析中"
    extract_prompt = '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'
    prompt = '''
依次上述内容是否与文档有关,你只能在[无关,有关]选项中选择答案,
按照这样的格式回答[{“text”:“内容”,"answer":"答案"},{“text”:“内容”,"answer":"答案"}]不做过多的解释,严格按回答格式作答
'''
    imagePath = "Image" + str(uuid.uuid4())
    os.mkdir(imagePath)
    try:
        saveImage(filename, titleName, imagePath)
        imagePathList = os.listdir(imagePath)
        resMap = {}
        for count, image in enumerate(imagePathList, start=1):
            yield f"文档图片信息检查---当前处理进度{count}/{len(imagePathList)}"
            # Use the generated directory, not a literal "imagePath" folder.
            outpath = os.path.join(imagePath, image)
            print(outpath)

            # Step 1: let the vision model extract the image's information.
            messagesImage = [{'role': 'user',
                              'content': [{"image": outpath}, {"text": extract_prompt}]}]
            last_image_rsp = None
            for last_image_rsp in botImage.run(messagesImage):
                pass
            if not last_image_rsp:
                continue
            data = last_image_rsp[0]["content"]
            print(str(data))

            # Step 2: let the text model judge each item against the document.
            messages = [{'role': 'user',
                         'content': [{'text': str(data) + prompt}, {"file": filename}]}]
            last_rsp = None
            for last_rsp in bot.run(messages):
                pass
            if not last_rsp:
                continue
            textdata = last_rsp[0]["content"]
            print(textdata)
            parsed_data = json_repair.loads(textdata)
            print(parsed_data)

            # The model may return a single object, or text that is not JSON.
            if isinstance(parsed_data, dict):
                parsed_data = [parsed_data]
            if not isinstance(parsed_data, list):
                continue
            for res in parsed_data:
                if not isinstance(res, dict) or res.get("answer") != "无关":
                    continue
                unrelated = str(res.get("text", ""))
                print("无关", unrelated)
                # Accumulate all unrelated items of the same image.
                resMap[image] = resMap.get(image, "") + unrelated

        if resMap:
            yield "".join(
                f"{key}图片中,{value}以上内容在文档中未出现相关描述<br>"
                for key, value in resMap.items()
            )
        else:
            yield "文档图片信息检查----图文符合要求"
    finally:
        # Remove the temporary images even when a model call or parse fails.
        shutil.rmtree(imagePath, ignore_errors=True)
# Script driver: runs the image/text check on "1.docx" and prints every
# message the generator produces. NOTE: this executes at import time as well.
for status_message in checkImageText("1.docx"):
    print(status_message)
# import docx
# doc = docx.Document('1.docx')
# dict_rel = doc.part._rels # rels其实是个目录
# for rel in dict_rel:
# rel = dict_rel[rel]
# print("rel", rel.target_ref)
# if "image" in rel.target_ref:
# # create_dir(desc_path)
# img_name = re.findall("/(.*)", rel.target_ref)[0] # windos:/
# print("img_name", img_name)
# word_name = os.path.splitext("1.docx")[0]
# print("word_name", word_name)
# #检查文件路径分隔符(os.sep),并根据不同的操作系统(Windows或Unix/Linux)处理文件名。
# if os.sep in word_name:
# new_name = word_name.split('\\')[-1]
# else:
# new_name = word_name.split('/')[-1]
# img_name = f'{new_name}_{img_name}'
# print(img_name)
# desc_path='workspace'
# with open(f'{desc_path}/{img_name}', "wb") as f:
# f.write(rel.target_part.blob)
# #
# # # prompt='''
# # # .根据上述文本判断,是否为非泛化的公司或组织名称,你可以使用工具利用互联网查询,你只能在[非泛化的公司或组织名称,公益组织,统称,泛化名称,政府单位,机关单位,学校,委员单位]选项中选择答案,回答格式[{“placeName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
# # # '''
# llm_cfg_vl = {
# #'model': 'qwen1.5-72b-chat',qwen2-72b-instruct
# 'model':"qwen-vl-max",
# 'model_server': 'DashScope', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
# }
# botvl = Assistant(llm=llm_cfg_vl,
# name='Assistant',
# # system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"1_image4
# )
# messages = [{'role': 'user', "content": [{"image": "workspace/1.png"},{"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
# runList = []
# for rsp in botvl.run(messages):
# runList.append(rsp)
# print(rsp)
# data = runList[len(runList) - 1][0]["content"]
# print(str(data))