You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

283 lines
13 KiB

5 months ago
import re
import os
import docx
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.parts.image import ImagePart
from qwen_agent.agents import Assistant
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
import shutil
import re
import json_repair
import uuid
# 记录程序开始的时间戳
def getOutlineLevel(inputXml):
    """Extract the outline level from the first ``<w:outlineLvl ...>`` tag.

    Args:
        inputXml: Serialized XML text (for example the XML of a paragraph or
            of a style) that may contain a ``<w:outlineLvl w:val="N"/>`` tag.

    Returns:
        The first run of digits found inside the first ``<w:outlineLvl`` tag,
        as a string (e.g. ``"0"``), or ``None`` when the tag is absent or
        carries no digits.
    """
    start_index = inputXml.find('<w:outlineLvl')
    if start_index < 0:
        # No tag at all: report "no level" instead of slicing from index -1.
        return None
    end_index = inputXml.find('>', start_index)
    # Only look inside the tag itself so digits in the following text are ignored;
    # an unterminated tag is searched up to the end of the input.
    tag = inputXml[start_index:] if end_index < 0 else inputXml[start_index:end_index + 1]
    match = re.search(r"\d+", tag)
    return match.group() if match else None
def isTitle(paragraph):
    """Return the outline level of a paragraph, or ``None`` for body text.

    The level is looked up first on the paragraph itself and then on its
    style and that style's chain of base styles; the first place that
    declares ``<w:outlineLvl`` decides the result.

    Args:
        paragraph: A paragraph object exposing ``text``, ``_p.xml`` and
            ``style`` (with ``element.xml`` and ``base_style``).

    Returns:
        The outline level as a string (``"0"`` = first-level heading,
        ``"1"`` = second-level, ``"2"`` = third-level, ...), or ``None`` when
        the paragraph is blank, declares no outline level anywhere, or
        declares level 9.
    """
    # Blank paragraphs are never headings.
    if not paragraph.text.strip():
        return None

    # Find the XML that declares the level: the paragraph first, then styles.
    declaring_xml = None
    paragraph_xml = paragraph._p.xml
    if '<w:outlineLvl' in paragraph_xml:
        declaring_xml = paragraph_xml
    else:
        style = paragraph.style
        while style is not None:
            style_xml = style.element.xml
            if '<w:outlineLvl' in style_xml:
                declaring_xml = style_xml
                break
            style = style.base_style

    # Neither the paragraph nor any style in the chain sets an outline level.
    if declaring_xml is None:
        return None

    level = getOutlineLevel(declaring_xml)
    # In WordprocessingML, w:val="9" means "no outline level" (body text),
    # so it must not be reported as a heading.
    if level is None or level == "9":
        return None
    return level
# A paragraph is expected to hold at most one picture.
def is_image(graph: Paragraph, doc: Document):
    """Report whether a paragraph embeds a picture backed by an image part.

    Every ``pic:pic`` element under the paragraph is inspected; each
    ``r:embed`` relationship id on its ``a:blip`` is resolved through
    ``doc.part.related_parts``.

    Args:
        graph: The paragraph to inspect.
        doc: The document that owns the paragraph.

    Returns:
        ``True`` as soon as one resolved part is an ``ImagePart``,
        otherwise ``False``.
    """
    return any(
        isinstance(doc.part.related_parts[rel_id], ImagePart)
        for picture in graph._element.xpath('.//pic:pic')
        for rel_id in picture.xpath('.//a:blip/@r:embed')
    )
# Fetch the picture of a paragraph (a paragraph is expected to hold at most one).
def get_ImagePart(graph: Paragraph, doc: Document):
    """Return the first image part embedded in a paragraph.

    Every ``pic:pic`` element under the paragraph is inspected; each
    ``r:embed`` relationship id on its ``a:blip`` is resolved through
    ``doc.part.related_parts``.

    Args:
        graph: The paragraph to inspect.
        doc: The document that owns the paragraph.

    Returns:
        The first resolved part that is an ``ImagePart``, or ``None`` when
        the paragraph embeds no such part.
    """
    resolved_parts = (
        doc.part.related_parts[rel_id]
        for picture in graph._element.xpath('.//pic:pic')
        for rel_id in picture.xpath('.//a:blip/@r:embed')
    )
    return next(
        (candidate for candidate in resolved_parts if isinstance(candidate, ImagePart)),
        None,
    )
# Find the first-level heading whose chapter holds the design/service plan.
def findTitleName(docxPath):
    """Ask the LLM which first-level heading contains the plan chapter.

    This is a generator. It first yields a progress message, then yields
    exactly one more value: the heading name chosen by the model, or the
    fixed "not found" message when nothing suitable was identified.

    Args:
        docxPath: Path of the .docx file to inspect.

    Yields:
        str: the progress message, followed by either the chosen first-level
        heading name or the "not found" message.
    """
    yield '文档图片信息检查----检查是否存在详细设计方案'
    not_found = "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"

    document = docx.Document(docxPath)
    # Collect the first-level headings that are offered to the model.
    titleWords = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        # isTitle() reports the outline level as a string; "0" is first level.
        if not text.strip() or isTitle(paragraph) != "0":
            continue
        # Headings mentioning an appendix are never offered as candidates.
        if "附件" in text:
            continue
        titleWords.append("一级标题:" + text)

    # Without any candidate heading there is nothing to ask the model about.
    if not titleWords:
        yield not_found
        return

    findTitleName_llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        # SECURITY: a secret is committed in source. The DASHSCOPE_API_KEY
        # environment variable takes precedence; the literal is kept only so
        # existing deployments keep working and should be rotated and removed.
        'api_key': os.environ.get('DASHSCOPE_API_KEY') or 'sk-ea89cf04431645b185990b8af8c9bb13',
    }
    findTitleName_bot = Assistant(llm=findTitleName_llm_cfg, name='Assistant')
    prompt = '''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案详细建设方案为最相关的优先选择
类似设计方案服务方案建设方案为次相关次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个{"name":"一级标题名称","answer":"存在"}{"name":"","answer":"不存在"}不做过多的解释,严格按回答格式作答
'''
    messages = [{'role': 'user', 'content': "\n".join(titleWords) + prompt}]

    # run() streams responses; only the last one is used as the answer.
    last_response = None
    for last_response in findTitleName_bot.run(messages):
        pass
    if not last_response:
        yield not_found
        return

    data = last_response[0]["content"]
    # Backticks are removed so a fenced reply can still be parsed.
    parsed_data = json_repair.loads(data.replace('`', ''))
    print(parsed_data)

    # The reply is only trusted when it is the expected object with a name.
    name = parsed_data.get("name") if isinstance(parsed_data, dict) else None
    found = isinstance(parsed_data, dict) and parsed_data.get("answer") == "存在"
    if found and name:
        print("存在", name)
        yield name
    else:
        print("不存在", name)
        yield not_found
def saveImage(fileName, titleName, imagePath):
    """Write the pictures found under one first-level heading to a directory.

    The document is walked paragraph by paragraph. Non-blank paragraphs
    update the current first-level heading and the most recent heading
    label; blank paragraphs are checked for an embedded picture, which is
    saved when the current first-level heading equals ``titleName``.

    Each file is named ``<level>级标题-<heading text>_<image part file name>``
    with characters that are unsafe in file names replaced by ``_``.

    Args:
        fileName: Path of the .docx file to read.
        titleName: Text of the first-level heading whose pictures are wanted.
        imagePath: Existing directory that receives the image files.
    """
    fristName = ""
    # Initialised so a picture that precedes every heading cannot hit an
    # unbound variable.
    levelText = ""
    doc = docx.Document(fileName)
    for paragraph in doc.paragraphs:
        text = paragraph.text
        if text.strip():
            level = isTitle(paragraph)
            if level == "0":
                fristName = text
                print(text)
            if level:
                # Levels are zero-based strings; the label is one-based.
                levelText = f"{int(level) + 1}级标题-" + text
            continue

        # A blank paragraph may carry a picture (or belong to a table).
        if fristName != titleName:
            continue
        part = get_ImagePart(paragraph, doc)
        if part is None:
            continue
        img_name = levelText + "_" + os.path.basename(part.partname)
        # Heading text may contain path separators or other characters that
        # are invalid in file names; neutralise them before opening the file.
        img_name = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", img_name)
        with open(os.path.join(imagePath, img_name), "wb") as f:
            f.write(part.blob)
# Once the pictures are saved, they are sent to the models for analysis.
def _last_reply_content(bot, messages):
    """Run *bot* on *messages* and return the content of its final response.

    Returns ``None`` when the bot produced no response at all.
    """
    last_response = None
    for last_response in bot.run(messages):
        pass
    if not last_response:
        return None
    return last_response[0]["content"]


def _unrelated_texts(parsed_data):
    """Collect the text of every reply item whose answer is "无关".

    Accepts a list of dicts (the requested format) or a single dict;
    anything else yields an empty list.
    """
    if isinstance(parsed_data, dict):
        parsed_data = [parsed_data]
    if not isinstance(parsed_data, list):
        return []
    texts = []
    for item in parsed_data:
        if not isinstance(item, dict) or item.get("answer") != "无关":
            continue
        text = item.get("text")
        if text is None:
            # The prompt does not pin the text key, so fall back to every
            # value except the verdict itself.
            text = "".join(str(value) for key, value in item.items() if key != "answer")
        texts.append(str(text))
    return texts


def checkImageText(filename):
    """Check whether the pictures of the plan chapter are described in the text.

    This is a generator of progress/result messages. It forwards everything
    ``findTitleName`` yields; if a chapter was found it extracts that
    chapter's pictures into a temporary directory, asks the vision model to
    extract each picture's information, asks the text model which pieces are
    unrelated to the document, and finally yields either a report of the
    unrelated pieces per picture or a message that everything matches.
    The temporary directory is removed even if processing fails midway.

    Args:
        filename: Path of the .docx file to check.

    Yields:
        str: progress messages followed by the final result message.
    """
    not_found = "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"
    # SECURITY: a secret is committed in source. The DASHSCOPE_API_KEY
    # environment variable takes precedence; the literal is kept only so
    # existing deployments keep working and should be rotated and removed.
    api_key = os.environ.get('DASHSCOPE_API_KEY') or 'sk-ea89cf04431645b185990b8af8c9bb13'
    llm_cfg_vl = {
        'model': "qwen-vl-max",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': api_key,
    }
    botImage = Assistant(llm=llm_cfg_vl, name='Assistant')
    llm_cfg = {
        'model': "qwen2-72b-instruct",
        'model_server': 'DashScope',  # base_url, also known as api_base
        'api_key': api_key,
    }
    bot = Assistant(llm=llm_cfg, name='Assistant')

    # Forward every message; the last one is the chapter name or "not found".
    titleName = not_found
    for titleName in findTitleName(filename):
        yield titleName
    if titleName == not_found:
        return

    yield "文档图片信息检查----文档内容解析中"
    # Loop-invariant prompt appended to the information extracted from a picture.
    prompt = '''
依次上述内容是否与文档有关你只能在[无关有关]选项中选择答案,
按照这样的格式回答[{text内容,"answer":"答案"},{text内容,"answer":"答案"}]不做过多的解释,严格按回答格式作答
'''
    imagePath = "Image" + str(uuid.uuid4())
    os.mkdir(imagePath)
    try:
        saveImage(filename, titleName, imagePath)
        imagePathList = sorted(os.listdir(imagePath))
        resMap = {}
        for count, image in enumerate(imagePathList, start=1):
            yield f"文档图片信息检查---当前处理进度{count}/{len(imagePathList)}"
            # Must point into the temporary directory created above.
            outpath = os.path.join(imagePath, image)
            print(outpath)
            messagesImage = [{'role': 'user', "content": [{"image": outpath}, {"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
            data = _last_reply_content(botImage, messagesImage)
            print(str(data))
            if data is None:
                # Nothing was extracted from this picture; nothing to compare.
                continue

            messages = [{'role': 'user', 'content': [{'text': str(data) + prompt}, {"file": filename}]}]
            textdata = _last_reply_content(bot, messages)
            print(textdata)
            if textdata is None:
                continue
            parsed_data = json_repair.loads(textdata)
            print(parsed_data)

            unrelated = _unrelated_texts(parsed_data)
            for text in unrelated:
                print("无关", text)
            if unrelated:
                resMap[image] = "".join(unrelated)

        if resMap:
            yield "".join(
                f"{key}图片中,{value}以上内容在文档中未出现相关描述<br>"
                for key, value in resMap.items()
            )
        else:
            yield "文档图片信息检查----图文符合要求"
    finally:
        # Always drop the temporary pictures, even on errors or early close.
        shutil.rmtree(imagePath, ignore_errors=True)
if __name__ == "__main__":
    # Manual run against a local sample file; guarded so that importing this
    # module does not start the model pipeline.
    for message in checkImageText("1.docx"):
        print(message)
# import docx
# doc = docx.Document('1.docx')
# dict_rel = doc.part._rels # rels其实是个目录
# for rel in dict_rel:
# rel = dict_rel[rel]
# print("rel", rel.target_ref)
# if "image" in rel.target_ref:
# # create_dir(desc_path)
# img_name = re.findall("/(.*)", rel.target_ref)[0] # windos:/
# print("img_name", img_name)
# word_name = os.path.splitext("1.docx")[0]
# print("word_name", word_name)
# #检查文件路径分隔符(os.sep),并根据不同的操作系统(Windows或Unix/Linux)处理文件名。
# if os.sep in word_name:
# new_name = word_name.split('\\')[-1]
# else:
# new_name = word_name.split('/')[-1]
# img_name = f'{new_name}_{img_name}'
# print(img_name)
# desc_path='workspace'
# with open(f'{desc_path}/{img_name}', "wb") as f:
# f.write(rel.target_part.blob)
# #
# # # prompt='''
# # # .根据上述文本判断,是否为非泛化的公司或组织名称,你可以使用工具利用互联网查询,你只能在[非泛化的公司或组织名称,公益组织,统称,泛化名称,政府单位,机关单位,学校,委员单位]选项中选择答案,回答格式[{“placeName”:“名称”,"回答":"答案"}],不做过多的解释,严格按回答格式作答;
# # # '''
# llm_cfg_vl = {
# #'model': 'qwen1.5-72b-chat',qwen2-72b-instruct
# 'model':"qwen-vl-max",
# 'model_server': 'DashScope', # base_url, also known as api_base
# 'api_key': 'sk-ea89cf04431645b185990b8af8c9bb13',
# }
# botvl = Assistant(llm=llm_cfg_vl,
# name='Assistant',
# # system_message="你是一个地理专家,可以准确的判断地理位置,如果你不确定,可以使用工具"1_image4
# )
# messages = [{'role': 'user', "content": [{"image": "workspace/1.png"},{"text": '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'}]}]
# runList = []
# for rsp in botvl.run(messages):
# runList.append(rsp)
# print(rsp)
# data = runList[len(runList) - 1][0]["content"]
# print(str(data))