import re
import time
from pprint import pprint

from docx import Document

# Optional semantic-similarity backend. It is disabled together with the
# template-comparison experiment near the bottom of this file.
# from paddlenlp import Taskflow
#
# similarity = Taskflow("text_similarity", truncation=True, max_length=102400)

# Opening of the WordprocessingML element that carries an outline level,
# e.g. <w:outlineLvl w:val="0"/>.
# NOTE(review): the markup inside the original string literals had been
# stripped from this file; the marker is reconstructed from the surrounding
# logic (locate an element, slice up to its closing '>', read the digits).
# Confirm against a known-good copy.
_OUTLINE_TAG = "<w:outlineLvl"
# First run of digits that appears inside the outline-level tag itself.
_OUTLINE_LEVEL_RE = re.compile(r"<w:outlineLvl[^>]*?(\d+)")


def getOutlineLevel(inputXml):
    """Extract the outline level number from an XML fragment.

    Finds the first ``<w:outlineLvl ...>`` tag in *inputXml* and returns the
    first run of digits located inside that tag.

    Args:
        inputXml: XML text of a paragraph or of a style definition.

    Returns:
        The level as a string of digits (``"0"``, ``"1"``, ...), or ``None``
        when the tag is absent or contains no digits.
    """
    match = _OUTLINE_LEVEL_RE.search(inputXml)
    return match.group(1) if match else None


def isTitle(paragraph):
    """Return the outline level that applies to *paragraph*, if any.

    The level is looked up first in the paragraph's own XML and then in its
    style, walking up the chain of base styles until one defines a level.

    Args:
        paragraph: a python-docx paragraph object.

    Returns:
        ``None`` for blank paragraphs and for body text without an outline
        level; otherwise the level as a string: ``"0"`` for a first-level
        heading, ``"1"`` for second-level, ``"2"`` for third-level, and so on.
    """
    # Blank lines are never headings.
    if not paragraph.text.strip():
        return None

    # Outline level set directly on the paragraph.
    paragraphXml = paragraph._p.xml
    if _OUTLINE_TAG in paragraphXml:
        return getOutlineLevel(paragraphXml)

    # Outline level inherited from the style or one of its ancestors.
    targetStyle = paragraph.style
    while targetStyle is not None:
        styleXml = targetStyle.element.xml
        if _OUTLINE_TAG in styleXml:
            return getOutlineLevel(styleXml)
        targetStyle = targetStyle.base_style

    # Neither the paragraph nor any of its styles defines an outline level.
    return None


def getDocxToText12biaoti(name, output_path="ce1.txt"):
    """Collect the numbered level 1-3 headings of a .docx document.

    Every non-blank paragraph whose outline level is ``"0"``, ``"1"`` or
    ``"2"`` is prefixed with a running section number (``"1:"``, ``"1.2"``,
    ``"1.2.3"``) and collected. The result is also written to *output_path*,
    one heading per line, and the number of headings is printed.

    Args:
        name: path (or file-like object) accepted by ``docx.Document``.
        output_path: UTF-8 text file that receives the outline.

    Returns:
        The list of numbered heading strings, in document order.

    Raises:
        ValueError: if the document contains no level 1-3 headings.
    """
    document = Document(name)
    words = []
    firstTitle = 0
    secondTitle = 0
    sanjiTitle = 0

    for paragraph in document.paragraphs:
        text = paragraph.text
        if not text.strip():
            continue

        level = isTitle(paragraph)
        if level == "0":
            firstTitle += 1
            secondTitle = 0
            # Reset the third-level counter too, so a level-3 heading placed
            # directly under a new level-1 heading does not inherit the count
            # from the previous chapter.
            sanjiTitle = 0
            # Headings mentioning "附件" are left out of the outline; they
            # still consume a first-level number, as before.
            if text.find("附件") >= 0:
                continue
            words.append("{}:".format(firstTitle) + text)
        elif level == "1":
            secondTitle += 1
            sanjiTitle = 0
            words.append("{}.{}".format(firstTitle, secondTitle) + text)
        elif level == "2":
            sanjiTitle += 1
            words.append(
                "{}.{}.{}".format(firstTitle, secondTitle, sanjiTitle) + text
            )

    print(len(words))
    if not words:
        raise ValueError(
            "no level 1-3 headings with an outline level found in {!r}".format(name)
        )

    # Join all collected headings into one newline-separated string.
    text = '\n'.join(words)
    with open(output_path, 'w', encoding="utf-8") as txt_file:
        txt_file.write(text)
    return words


mobanList = []
dangqianList = []
errorList = []

# Disabled experiment: compare the extracted headings with a template outline
# using the similarity model, and record headings scoring below 0.85.
# with open("ce模板.txt", 'r',encoding="utf-8") as txt_file:
#     for i in txt_file:
#         i=re.sub(r'[\t\n]', '', i)
#         mobanList.append(i)
# pprint(mobanList)
# dangqianList=getDocxToText12biaoti("1.docx")
# if len(dangqianList)!=len(mobanList):
#     print("标题数量与模板不一致")
# for num in range(len(mobanList)):
#     moban = mobanList[num]
#     dangqian= dangqianList[num]
#     fenshu=similarity([[dangqian,moban]])
#     pprint(fenshu)
#     if (fenshu[0]["similarity"]<0.85):
#         errorList.append(dangqianList)
# getDocxToText12biaoti("1.docx")
# pprint(errorList)

# Prompt template: first placeholder is the document outline, second is the
# content item whose presence in the outline is being asked about.
prompt = '''{}这是文档大纲,根据大纲分析文档中是否有{}这块内容的描述,若不存在请回答不存在 '''
dagang = "1"
biaozhun = "2"
print(prompt.format(dagang, biaozhun))