"""Check that the images in a .docx design chapter are described by the document.

Pipeline:
1. Collect the level-1 headings of the document and ask a text LLM which
   chapter contains the detailed design/solution description.
2. Save every image found in that chapter into a temporary directory.
3. Ask a vision LLM to extract the information shown in each image, then ask a
   text LLM whether each extracted item is related to the document.
4. Report the items that the document does not describe.
"""
import os
import re
import shutil
import uuid

import docx
import json_repair
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.parts.image import ImagePart
from docx.text.paragraph import Paragraph
from qwen_agent.agents import Assistant

# XML tag that carries a paragraph's/style's outline level.
_OUTLINE_TAG = '<w:outlineLvl'
_OUTLINE_VALUE_RE = re.compile(r"\d+")

# Characters that must not appear in a file name built from heading text.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

# Status message yielded when no suitable chapter could be identified.
_NO_SCHEME_MESSAGE = "文档图片信息检查----未找到与详细设计方案相关内容,无法进行图文检查"

# SECURITY(review): the literal fallback below is the key that was hard-coded
# in this file; it is kept only so existing deployments keep working. Set the
# DASHSCOPE_API_KEY environment variable, rotate the key, and drop the fallback.
_DASHSCOPE_API_KEY = os.environ.get(
    "DASHSCOPE_API_KEY", 'sk-ea89cf04431645b185990b8af8c9bb13'
)

_TITLE_PROMPT = '''\n是文档的大纲,一级标题组成,哪一章存在与方案相关的内容
类似详细设计方案,详细服务方案,详细建设方案为最相关的,优先选择
类似设计方案,服务方案,建设方案为次相关,次级选择
类似方案是最后选择
按照这样的顺序选择最合适的
你只能从这两个答案中选择一个:{"name":"一级标题名称","answer":"存在"}或{"name":"","answer":"不存在"},不做过多的解释,严格按回答格式作答
'''

_IMAGE_PROMPT = '提取图片中的信息,每个信息进行自动分类,不要出现与图中无关的信息,不要删减,不要修改,不要总结内容,不做过多的解释,严格按要求作答'

_RELEVANCE_PROMPT = '''
依次上述内容是否与文档有关,你只能在[无关,有关]选项中选择答案,
按照这样的格式回答[{“text”:“内容”,"answer":"答案"},{“text”:“内容”,"answer":"答案"}]不做过多的解释,严格按回答格式作答
'''


def _llm_config(model):
    """Return the Qwen-Agent LLM configuration for the given DashScope model."""
    return {
        'model': model,
        'model_server': 'DashScope',
        'api_key': _DASHSCOPE_API_KEY,
    }


def _final_reply(bot, messages):
    """Run *bot* and return the content of the first message of its last response.

    ``bot.run`` streams successive responses; only the last one is complete.
    Returns an empty string when the bot produced no response at all.
    """
    last = None
    for last in bot.run(messages):
        pass
    if not last:
        return ""
    return last[0]["content"]


def getOutlineLevel(inputXml):
    """Extract the number from the ``<w:outlineLvl w:val="N"/>`` tag of an XML string.

    Args:
        inputXml: XML text of a paragraph or style element.

    Returns:
        The outline level as a string of digits (e.g. ``"0"``), or ``None``
        when the tag is missing or carries no number.
    """
    start_index = inputXml.find(_OUTLINE_TAG)
    if start_index < 0:
        return None
    end_index = inputXml.find('>', start_index)
    if end_index < 0:
        end_index = len(inputXml) - 1
    # Search only after the tag name so digits can only come from attributes.
    tag_attrs = inputXml[start_index + len(_OUTLINE_TAG):end_index + 1]
    match = _OUTLINE_VALUE_RE.search(tag_attrs)
    return match.group() if match else None


def isTitle(paragraph):
    """Return the outline level configured for a paragraph.

    Args:
        paragraph: the python-docx paragraph to inspect.

    Returns:
        ``None`` for blank paragraphs and body text without an outline level;
        otherwise the level as a string: ``"0"`` for a level-1 heading,
        ``"1"`` for level 2, ``"2"`` for level 3, and so on.
    """
    # Blank lines are never headings.
    if paragraph.text.strip() == '':
        return None

    # Outline level set directly on the paragraph.
    paragraphXml = paragraph._p.xml
    if _OUTLINE_TAG in paragraphXml:
        return getOutlineLevel(paragraphXml)

    # Outline level inherited from the style: walk the style and its parents.
    targetStyle = paragraph.style
    while targetStyle is not None:
        styleXml = targetStyle.element.xml
        if _OUTLINE_TAG in styleXml:
            return getOutlineLevel(styleXml)
        targetStyle = targetStyle.base_style

    # Neither the paragraph nor any of its styles defines an outline level.
    return None


def get_ImagePart(graph: Paragraph, doc: Document):
    """Return the first embedded image part of a paragraph, or ``None``.

    The paragraph is expected to hold at most one picture; only the first
    embedded image found is returned.
    """
    for picture in graph._element.xpath('.//pic:pic'):
        for img_id in picture.xpath('.//a:blip/@r:embed'):
            # Relationship ids that do not resolve to a part are skipped.
            part = doc.part.related_parts.get(img_id)
            if isinstance(part, ImagePart):
                return part
    return None


def is_image(graph: Paragraph, doc: Document):
    """Return ``True`` when the paragraph contains an embedded image."""
    return get_ImagePart(graph, doc) is not None


def findTitleName(docxPath):
    """Find the level-1 chapter that holds the detailed design description.

    This is a generator. It first yields a status message, then yields either
    the chosen chapter title or the "not found" status message; callers should
    treat the LAST yielded value as the result.

    Args:
        docxPath: path of the .docx file to analyse.
    """
    yield '文档图片信息检查----检查是否存在详细设计方案'
    document = docx.Document(docxPath)

    # Collect level-1 headings, skipping appendix chapters.
    titleWords = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():
            continue
        if isTitle(paragraph) != "0":
            continue
        if "附件" in text:
            continue
        titleWords.append("一级标题:" + text)

    # Without any heading there is nothing for the model to choose from.
    if not titleWords:
        yield _NO_SCHEME_MESSAGE
        return

    findTitleName_bot = Assistant(
        llm=_llm_config("qwen2-72b-instruct"),
        name='Assistant',
    )
    messages = [{'role': 'user', 'content': "\n".join(titleWords) + _TITLE_PROMPT}]
    data = _final_reply(findTitleName_bot, messages)

    # Backticks are stripped so a fenced ```json reply can still be parsed.
    parsed_data = json_repair.loads(str(data).replace('`', ''))
    print(parsed_data)

    name = parsed_data.get("name") if isinstance(parsed_data, dict) else None
    found = isinstance(parsed_data, dict) and parsed_data.get("answer") == "存在"
    if found and name:
        print("存在", name)
        yield name
    else:
        print("不存在", name)
        yield _NO_SCHEME_MESSAGE


def saveImage(fileName, titleName, imagePath):
    """Save every image located in the chapter ``titleName`` into ``imagePath``.

    An image is taken from a paragraph without text. Each file is named
    ``<nearest heading label>_<image part name>``.

    Args:
        fileName: path of the .docx file.
        titleName: text of the level-1 heading whose images are wanted.
        imagePath: existing directory that receives the image files.
    """
    doc = docx.Document(fileName)
    wanted_chapter = titleName.strip()
    current_chapter = ""
    # Label of the most recent heading; empty until the first heading is seen.
    heading_label = ""

    for paragraph in doc.paragraphs:
        text = paragraph.text
        if text.strip():
            level = isTitle(paragraph)
            if level == "0":
                current_chapter = text.strip()
                print(text)
            if level:
                heading_label = f"{int(level) + 1}级标题-" + text
            continue

        # A paragraph without text is either a table placeholder or an image.
        if current_chapter != wanted_chapter:
            continue
        part = get_ImagePart(paragraph, doc)
        if part is None:
            continue

        raw_name = heading_label + "_" + os.path.basename(part.partname)
        # Heading text may contain path separators or other illegal characters.
        img_name = _UNSAFE_FILENAME_CHARS.sub("_", raw_name)
        with open(os.path.join(imagePath, img_name), "wb") as f:
            f.write(part.blob)


def checkImageText(filename):
    """Check that the images of the design chapter are described in the document.

    This is a generator yielding human-readable progress/status strings. The
    last yielded value is the verdict: either the list of image contents that
    the document does not describe, or a message that everything matches.

    Args:
        filename: path of the .docx file to check.
    """
    # findTitleName yields status messages; its last value is the result.
    titleName = None
    for titleName in findTitleName(filename):
        yield titleName
    if titleName is None or titleName == _NO_SCHEME_MESSAGE:
        return

    yield "文档图片信息检查----文档内容解析中"

    botImage = Assistant(llm=_llm_config("qwen-vl-max"), name='Assistant')
    bot = Assistant(llm=_llm_config("qwen2-72b-instruct"), name='Assistant')

    imagePath = "Image" + str(uuid.uuid4())
    os.mkdir(imagePath)
    try:
        saveImage(filename, titleName, imagePath)
        imagePathList = os.listdir(imagePath)
        total = len(imagePathList)
        # image file name -> list of extracted texts unrelated to the document
        resMap = {}

        for count, image in enumerate(imagePathList, start=1):
            yield f"文档图片信息检查---当前处理进度{count}/{total}"
            outpath = os.path.join(imagePath, image)
            print(outpath)

            # Step 1: let the vision model extract the information in the image.
            messagesImage = [{
                'role': 'user',
                "content": [{"image": outpath}, {"text": _IMAGE_PROMPT}],
            }]
            data = _final_reply(botImage, messagesImage)
            print(str(data))

            # Step 2: ask the text model which items relate to the document.
            messages = [{
                'role': 'user',
                'content': [{'text': str(data) + _RELEVANCE_PROMPT}, {"file": filename}],
            }]
            textdata = _final_reply(bot, messages)
            print(textdata)
            parsed_data = json_repair.loads(textdata)
            print(parsed_data)

            # The model may answer with a single object instead of a list.
            if isinstance(parsed_data, dict):
                parsed_data = [parsed_data]
            if not isinstance(parsed_data, list):
                continue
            for res in parsed_data:
                if not isinstance(res, dict) or res.get("answer") != "无关":
                    continue
                unrelated_text = str(res.get("text", ""))
                print("无关", unrelated_text)
                resMap.setdefault(image, []).append(unrelated_text)

        if resMap:
            yield "".join(
                f"在{key}图片中,{','.join(values)}以上内容在文档中未出现相关描述\n"
                for key, values in resMap.items()
            )
        else:
            yield "文档图片信息检查----图文符合要求"
    finally:
        # Always remove the temporary image directory, even on errors or when
        # the consumer stops iterating early.
        shutil.rmtree(imagePath, ignore_errors=True)


if __name__ == "__main__":
    for i in checkImageText("1.docx"):
        print(i)