python_ai/checkDocumentError.py


								# -*- coding:utf-8 -*-

								from qwen_agent.agents import Assistant

								from docx import Document

								import re

								import json

								import json_repair

								import math

								from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship

								from docx.opc.oxml import parse_xml

								import requests

								# from myLogger import outLog

								import time

								def load_from_xml_v2(baseURI, rels_item_xml):
								    """
								    Build a |_SerializedRelationships| collection from *rels_item_xml*.

								    Relationships whose target reference is ``'../NULL'`` or ``'NULL'`` are
								    left out of the collection. When *rels_item_xml* is |None| the returned
								    collection is empty.
								    """
								    collection = _SerializedRelationships()
								    if rels_item_xml is None:
								        return collection

								    ignored_targets = ('../NULL', 'NULL')
								    root = parse_xml(rels_item_xml)
								    for node in root.Relationship_lst:
								        if node.target_ref not in ignored_targets:
								            collection._srels.append(_SerializedRelationship(baseURI, node))
								    return collection


								# Replace python-docx's relationship loader with load_from_xml_v2, which
								# skips relationships whose target reference is '../NULL' or 'NULL'.
								_SerializedRelationships.load_from_xml = load_from_xml_v2

								# import logging


								# outLog.logger = logging.getLogger("checkDocumentError")

								# Module-level log handle: checkDocumentError() rebinds it on every call and
								# documentErrorTask() logs through it.
								userLog=None

								# Settings of the LLM backend; passed to Assistant(llm=...) below.
								llm_cfg = {

								    # 'model': 'qwen1.5-72b-chat',

								    'model': "qwen2-72b",

								    'model_server': 'http://127.0.0.1:1025/v1',  # base_url, also known as api_base

								    # 'api_key': '<redacted>',  # NOTE(review): a literal key used to be written here; if it was real, rotate it and load keys from configuration

								}

								# Module-level assistant instance; documentErrorTask() sends its questions
								# through bot.run().
								bot = Assistant(llm=llm_cfg,

								                name='Assistant',

								                # description (disabled): 'Answers using RAG retrieval; supported file types: PDF/Word/PPT/TXT/HTML.'

								                # The system message below tells the model it is an expert at analysing typos.
								                system_message="你是一个错别字分析大师"

								                )

								# prompt='''

								# 是否存在错别字，若存在请指出，不做其他方面的校验，你只能在[存在，不存在，未知]选项中选择答案,

								# 回答格式[{“placeName”：“原文”,"改正后":"改正的内容","回答":"答案"},{“placeName”：“原文”,"改正后":"改正的内容","回答":"答案"}]，不做过多的解释,严格按回答格式作答;

								# '''

								# Answer-format instructions appended after the numbered questions that are
								# sent to the LLM. The embedded example has to be valid JSON itself: the reply
								# is parsed and its "placeName", "回答" and "解析" keys are read back, so the
								# example must show "解析" as a key ("解析":"...") and not as a bare value.
								prompt = '''

								请回答以上问题，[是，否]选项中选择答案,原文内容，标点符号保持不变，如果有错请给出详细的解析，没有错则不用给解析

								回答格式请按照以下json格式[{"placeName":"序号值","回答":"答案","解析":"解析内容"},{"placeName":"序号值","回答":"答案","解析":"解析内容"}]，不做过多的解释,严格按回答格式作答;

								'''


								def getDocxToTextAll(name):
								    """
								    Dump the non-blank paragraphs of a .docx file to ``checkDocumentError.txt``.

								    :param name: path of the document, handed to ``Document`` unchanged.

								    The text of every paragraph that is not empty after stripping is kept
								    (unstripped), the kept texts are joined with newlines, and the result is
								    written UTF-8 encoded to ``checkDocumentError.txt`` (relative path; an
								    existing file is overwritten). Nothing is returned.
								    """
								    document = Document(name)

								    # Walk the document paragraph by paragraph and keep only the non-blank ones.
								    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
								    kept = [content for content in paragraph_texts if content.strip()]
								    joined = '\n'.join(kept)

								    with open("checkDocumentError.txt", 'w', encoding='utf-8') as out_file:
								        out_file.write(joined)


								def checkDocumentError(filename, user_id, outLog):
								    """
								    Run the typo check for one document and stream the results.

								    Generator. It yields progress strings while the document is converted to
								    text and analysed by ``documentErrorTask``, and finally either a report
								    (entries separated by ``<br>``, each suggestion wrapped in ``**``) or the
								    "no typos found" message.

								    :param filename: path of the .docx file, passed to ``getDocxToTextAll``.
								    :param user_id: identifier forwarded to ``outLog.get_queue`` and
								        ``outLog.mark_done``.
								    :param outLog: object providing ``get_queue(user_id, name)``, whose result
								        is used as the logger, and ``mark_done(user_id, name)``.
								    :return: generator of ``str`` messages.

								    Once the log queue has been obtained, ``outLog.mark_done`` is always called
								    when the generator ends: on normal completion, when the document cannot be
								    opened, when a later step raises, and when the consumer closes the
								    generator early.
								    """
								    # NOTE(review): userLog is module-global and documentErrorTask() reads it,
								    # so overlapping runs for different users would share one handle - confirm
								    # that runs never interleave.
								    global userLog
								    userLog = outLog.get_queue(user_id, "checkDocumentError")
								    try:
								        yield "文档纠错---开始处理文档..."
								        try:
								            getDocxToTextAll(filename)
								        except Exception as e:
								            userLog.warning(e)
								            userLog.warning("文档纠错----文档无法打开，请检查文档内容")
								            yield "文档纠错----文件无法正常打开。可以尝试用WORD或WPS打开文件，进行修复并另存，用另存的文件再做一次尝试。"
								            return

								        # getDocxToTextAll() leaves the extracted text in this file.
								        with open("checkDocumentError.txt", "r", encoding='utf-8') as f:
								            gettext = f.read()
								        yield "文档纠错---开始解析文档..."

								        # documentErrorTask() yields str progress messages and, as a non-str
								        # item, the list of confirmed findings.
								        final_list = []
								        for item in documentErrorTask(gettext):
								            if isinstance(item, str):
								                yield item
								            else:
								                final_list = item

								        if final_list:
								            report = ["发现错别字<br>"]
								            for entry in final_list:
								                # str() keeps a non-string value from crashing .replace().
								                yuanwen = str(entry["placeName"]).replace("\n", "")
								                jianyi = str(entry["jianyi"]).replace("\n", "")
								                report.append("原文：" + yuanwen + "<br>建议：**" + jianyi + "**<br>")
								            yield "".join(report)
								        else:
								            yield "**未发现错别字**"
								            userLog.info("文档纠错---未发现错别字")
								    finally:
								        # Runs on every exit path so the task is never left unmarked.
								        outLog.mark_done(user_id, "checkDocumentError")


								def documentErrorTask(text):
								    """
								    Check *text* for typos in batches and stream progress messages.

								    Generator. The text is split on '。' and newlines, blank pieces are
								    dropped, and the remaining sentences are processed 20 at a time. Each
								    batch is posted to the local ``/taskflow/checkDocumentError`` service;
								    the sentences for which the reply lists errors are then put to the LLM
								    (``bot``) as numbered questions, and the answers marked "是" are collected.

								    :param text: the full document text.
								    :return: generator yielding ``str`` progress messages and, as its last
								        item, a ``list`` of dicts. Each dict is one answer object from the
								        model with "placeName" replaced by the original sentence and "jianyi"
								        set to the text of its "解析" field.

								    A batch whose service reply or model answer cannot be used is logged
								    through the module-level ``userLog`` and skipped, so it does not abort
								    the remaining batches. Errors raised by ``bot.run`` itself propagate.
								    """
								    yield "文档纠错---文档解析中...."
								    userLog.info("文档纠错---任务开始")

								    batchNum = 20  # sentences per batch
								    sentences = [piece.strip() for piece in re.split(r'[。\n]', text) if piece.strip()]
								    total_sentences = len(sentences)
								    num_chunks = math.ceil(total_sentences / batchNum)
								    chunks = [sentences[i:i + batchNum] for i in range(0, total_sentences, batchNum)]

								    url = "http://127.0.0.1:5001/taskflow/checkDocumentError"
								    headers = {"Content-Type": "application/json"}
								    err = []
								    for i, chunk in enumerate(chunks):
								        yield f"文档纠错---文档解析进度:{i + 1}/{num_chunks}"
								        try:
								            # NOTE(review): no timeout is passed, so a hung service blocks this
								            # generator indefinitely - consider adding one.
								            payload = {"data": {"text": chunk}}
								            r = requests.post(url=url, headers=headers, data=json.dumps(payload))
								            res = json.loads(r.text)
								            # Reading the reply stays inside the try: a reply without the
								            # expected "data" / "errors" / "source" fields is handled like any
								            # other failed batch instead of aborting the whole document.
								            lines_with_greeting = [place for place in res["data"] if place['errors']]
								            keyword_list = [place['source'] for place in lines_with_greeting]
								        except Exception as e:
								            userLog.warning(chunk)
								            userLog.warning("文档纠错--错别字识别出错\n")
								            userLog.warning(e)
								            continue
								        userLog.debug(lines_with_greeting)
								        if not lines_with_greeting:
								            continue

								        # One numbered question per flagged sentence; the number is what the
								        # model is asked to echo back as "placeName".
								        wenti = [
								            "{}：原文是{}。问题：当前原文是否存在错别字,只检查错别字，其他不做分析".format(num, keyword)
								            for num, keyword in enumerate(keyword_list)
								        ]
								        words = "\n".join(wenti)
								        userLog.debug(words)
								        messages = [{'role': 'user', 'content': [{'text': words + prompt}]}]
								        yield "文档纠错---内容解析中..."

								        # Only the last streamed response is used, so earlier ones are not kept.
								        last_rsp = None
								        cishu = 0
								        for rsp in bot.run(messages):
								            last_rsp = rsp
								            if cishu > 3:
								                cishu = 0
								            yield "文档纠错---内容解析中" + '.' * cishu
								            cishu += 1

								        try:
								            answer = last_rsp[0]["content"]
								            parsed_data = json_repair.loads(answer.replace("\\", "").replace('`', ''))
								        except Exception as e:
								            # Covers an empty stream (last_rsp is None) and non-text content.
								            userLog.warning(last_rsp)
								            userLog.warning("文档纠错--错别字提取出错\n")
								            userLog.warning(e)
								            continue
								        userLog.debug(parsed_data)

								        if isinstance(parsed_data, dict):
								            # A single answer object instead of a list of them.
								            parsed_data = [parsed_data]
								        if not isinstance(parsed_data, list):
								            userLog.warning(parsed_data)
								            userLog.warning("文档纠错--错别字提取出错\n")
								            continue

								        for place in parsed_data:
								            try:
								                if place['回答'] != '是':
								                    continue
								                index = int(place["placeName"])
								                # Reject negative numbers too: they would silently index from
								                # the end of the batch and pair the advice with the wrong sentence.
								                if not 0 <= index < len(keyword_list):
								                    raise IndexError("placeName {} is outside this batch".format(index))
								                # Resolve every field before recording the entry so a broken
								                # answer is never half-added to the result.
								                jianyi = str(place["解析"])
								                place["placeName"] = keyword_list[index]
								                place["jianyi"] = jianyi
								                err.append(place)
								                userLog.info("文档纠错---原文：" + place["placeName"] + "<br>建议：" + place["jianyi"])
								            except Exception as e:
								                userLog.warning(parsed_data)
								                userLog.warning(place)
								                userLog.warning("文档纠错--错别字提取出错\n")
								                userLog.warning(e)
								                continue

								    yield "文档纠错---文档解析完成"
								    userLog.info("文档纠错---任务结束")
								    yield err