3 changed files with 346 additions and 0 deletions
@@ -0,0 +1,52 @@
# from paddlenlp import Taskflow
# checkPlaceNameServer
# tagTask1 = Taskflow("ner", device_id=2, precision='fp16')  # checkPlaceName
# from flask import Flask, request, jsonify
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import uvicorn
from fastapi.responses import JSONResponse
from pydantic import BaseModel

app = FastAPI()

import sys
sys.path.append("..")
from ner_model import NERModel
# model = NERModel("bert", "shibing624/bert4ner-base-chinese")
model = NERModel("bertspan", "shibing624/bertspan4ner-base-chinese", cuda_device=5)
class RequestData(BaseModel):
    data: dict

@app.post("/taskflow/checkPlaceNameServer")
async def process_request(request: RequestData):
    global model
    # Extract the text data from the request payload
    text_data = request.data.get('text')
    # Process the text data: the actual place-name checking logic (NER prediction) goes here
    predictions, raw_outputs, entities = model.predict(text_data)

    # Return the recognized entities
    return JSONResponse(content={"status": "success", "data": entities}, status_code=200)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8191)
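
# Illustrative client sketch (an assumption for documentation, not part of the service):
# the endpoint expects a JSON body of the form {"data": {"text": ...}}, where "text" is
# whatever NERModel.predict accepts (typically a list of sentences). Assumes the server
# above is reachable at localhost:8191 and that the requests library is installed.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8191/taskflow/checkPlaceNameServer",
#       json={"data": {"text": ["待检查的文本"]}},
#   )
#   print(resp.json())  # {"status": "success", "data": [...recognized entities...]}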

# Earlier Flask-based implementation, kept commented out for reference:
# # Create a lock object
# import threading
# app = Flask(__name__)
# lock = threading.Lock()
# # Multithreaded, but requests are handled one at a time; extra requests queue up
# @app.route('/taskflow/checkPlaceNameServer', methods=['POST'])
# def process_request():
#     with lock:
#         data = request.get_json()
#         # print("data", data)
#         # Extract the text data
#         text_data = data.get('data', {}).get('text')
#         # Process the text data: the actual checking logic goes here
#         predictions, raw_outputs, entities = model.predict(text_data)
#         # Example: simply print the received text
#         # Return the response
#         return jsonify({"status": "success", "data": entities}), 200

# if __name__ == '__main__':
#     app.run(threaded=True, port=8191)
@@ -0,0 +1,71 @@
# from paddlenlp import Taskflow
# similarity1 = Taskflow("text_similarity", device_id=3, precision='fp16')  # checkRepeatText
# from flask import Flask, request, jsonify
# import threading
# app = Flask(__name__)

# # Create a lock object
# lock = threading.Lock()
# @app.route('/taskflow/checkRepeatText', methods=['POST'])
# def process_request():
#     with lock:
#         data = request.get_json()
#         # print("data", data)
#         # Extract the text data
#         text_data = data.get('data', {}).get('text')
#         # Process the text data: compute similarity over the text pairs
#         res = similarity1(text_data)
#         # Example: simply print the received text
#         # Return the response
#         return jsonify({"status": "success", "data": res}), 200

# if __name__ == '__main__':
#     app.run(threaded=True, port=8192)
from sentence_transformers import SentenceTransformer, util
import itertools
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import uvicorn
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch

app = FastAPI()
model = SentenceTransformer("shibing624/text2vec-base-chinese", device="npu:5")

class RequestData(BaseModel):
    data: dict

@app.post("/taskflow/checkRepeatText")
async def check_repeat_text(request: RequestData):
    global model
    # Extract the text data: the payload carries a single [text1, text2] pair
    text_data = request.data.get('text')
    a = text_data[0][0]
    b = text_data[0][1]
    # Embed both texts and compute their cosine similarity
    emb_a = model.encode(a)
    emb_b = model.encode(b)
    cos_sim = util.cos_sim(emb_a, emb_b)
    results = []
    results.append({"text1": a, "text2": b, "similarity": cos_sim.item()})
    # Return the response
    return JSONResponse(content={"status": "success", "data": results}, status_code=200)
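
# Illustrative request sketch (an assumption, not part of the service): the handler above
# reads text_data[0][0] and text_data[0][1], so "text" must be a list containing one
# [text1, text2] pair. Assumes the server is reachable at localhost:8192.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8192/taskflow/checkRepeatText",
#       json={"data": {"text": [["第一段文本", "第二段文本"]]}},
#   )
#   print(resp.json())  # {"status": "success", "data": [{"text1": ..., "text2": ..., "similarity": ...}]}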

@app.post("/taskflow/getRepeatText")
async def get_repeat_text(request: RequestData):
    global model
    # Extract the text data
    text_data = request.data.get('text')
    allcorpus = text_data[0]  # all document texts (the corpus)
    query = text_data[1]      # the document text to query
    # Embed the corpus and the query, then rank the corpus by cosine similarity
    corpus_embeddings = model.encode(allcorpus, convert_to_tensor=True)
    top_k = min(4, len(allcorpus))
    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    results = []
    for score, idx in zip(top_results[0], top_results[1]):
        print(allcorpus[idx], "(Score: {:.4f})".format(score.item()))
        results.append({"text1": allcorpus[idx], "text2": query, "similarity": score.item()})
    # Return the response
    return JSONResponse(content={"status": "success", "data": results}, status_code=200)
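
# Illustrative request sketch (an assumption): for /taskflow/getRepeatText, "text" is
# expected to be [corpus, query], where corpus is a list of candidate texts and query is
# a single text; the top-4 most similar corpus entries are returned.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8192/taskflow/getRepeatText",
#       json={"data": {"text": [["候选文本一", "候选文本二", "候选文本三"], "查询文本"]}},
#   )
#   print(resp.json())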

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8192)
@@ -0,0 +1,223 @@
import threading
# from checkPlaceName import checkPlaceName
# from checkRepeatText import checkRepeatText
# from checkCompanyName import checkCompanyName
# from checkDocumentError import checkDocumentError
# from checkTitleName import checkTitleName
# from myLogger import outLog
# import time
# def run_check_company_name(filename, user_id):
#     for i in checkCompanyName(filename, user_id):
#         pass

# def run_get_document_error(filename, user_id):
#     for i in checkDocumentError(filename, user_id):
#         pass
# def runcheckTitleName(filename, user_id):
#     for i in checkTitleName(filename, user_id):
#         pass
# def runcheckRepeatText(filename, user_id):
#     for i in checkRepeatText(filename, user_id):
#         pass
# def runcheckPlaceName(filename, user_id):
#     for i in checkPlaceName(filename, user_id):
#         pass

# def get(user_id):
#     time.sleep(5)
#     while True:
#         if outLog.is_done(user_id):
#             break
#         q = outLog.get_queueData(user_id)
#         if q:
#             text = q.pop(0)
#             print(text)
#     print("Finished printing")

# filename = "17.docx"

# # Create the worker threads
# thread1 = threading.Thread(target=run_check_company_name, args=(filename, "1"))
# thread2 = threading.Thread(target=run_get_document_error, args=(filename, "1"))
# thread3 = threading.Thread(target=runcheckTitleName, args=(filename, "1"))
# thread4 = threading.Thread(target=runcheckRepeatText, args=(filename, "1"))
# thread5 = threading.Thread(target=runcheckPlaceName, args=(filename, "1"))
# thread6 = threading.Thread(target=get, args=("1",))
# # Alternative: run every worker through getapp instead
# thread1 = threading.Thread(target=getapp, args=(filename,))
# thread2 = threading.Thread(target=getapp, args=(filename,))
# thread3 = threading.Thread(target=getapp, args=(filename,))
# thread4 = threading.Thread(target=getapp, args=(filename,))
# thread5 = threading.Thread(target=getapp, args=(filename,))
# thread6 = threading.Thread(target=getapp, args=("1",))
# # Start the threads
# thread1.start()
# thread2.start()
# thread3.start()
# thread4.start()
# thread5.start()
# thread6.start()
# # Wait for the threads to finish
# thread1.join()
# thread2.join()
# thread3.join()
# thread4.join()
# thread5.join()
# thread6.join()
# print("Both tasks completed.")
# from pycorrector.macbert.macbert_corrector import MacBertCorrector

# m = MacBertCorrector("models")
# for i in range(10):
#     i = m.correct("行政捡查是行政机关覆行政府职能、管理经济社会事务的重要方式,开展计划统筹是行政检查控总量、提质效的重要措施和手段,直接影响改革或得感和社会满意度")
#     print(i)
# import re
# import json
# import json_repair
# import math
# import os
# import platform
# import torch
# import torch_npu
# import operator
# from torch_npu.contrib import transfer_to_npu
# torch_device = "npu:4"  # 0~7
# torch.npu.set_device(torch.device(torch_device))
# torch.npu.set_compile_mode(jit_compile=False)
# from transformers import BertTokenizerFast, BertForMaskedLM
# # option = {}
# # option["NPU_FUZZY_COMPILE_BLACKLIST"] = "Tril"
# # torch.npu.set_option(option)
# print("torch && torch_npu import successfully")

# DEFAULT_CKPT_PATH = 'macbert4csc'
# # models = macbert4csc-base-chinese
# model = BertForMaskedLM.from_pretrained(
#     DEFAULT_CKPT_PATH,
#     torch_dtype=torch.float16,
#     device_map=torch_device
# ).npu().eval()
# tokenizer = BertTokenizerFast.from_pretrained(DEFAULT_CKPT_PATH)
# def get_errors(corrected_text, origin_text):
#     sub_details = []
#     for i, ori_char in enumerate(origin_text):
#         if ori_char in [' ', '“', '”', '‘', '’', '琊', '\n', '…', '—', '擤']:
#             # add unk word
#             corrected_text = corrected_text[:i] + ori_char + corrected_text[i:]
#             continue
#         if i >= len(corrected_text):
#             continue
#         if ori_char != corrected_text[i]:
#             if ori_char.lower() == corrected_text[i]:
#                 # pass english upper char
#                 corrected_text = corrected_text[:i] + ori_char + corrected_text[i + 1:]
#                 continue
#             sub_details.append((ori_char, corrected_text[i], i, i + 1))
#     sub_details = sorted(sub_details, key=operator.itemgetter(2))
#     return corrected_text, sub_details
# result = []
# def getapp(gettext):
#     result = []
#     batchNum = 20
#     sentences = re.split(r'[。\n]', gettext)
#     # Drop empty strings
#     sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
#     # Total number of sentences
#     total_chars = len(sentences)

#     # How many chunks that makes
#     num_chunks = math.ceil(total_chars / batchNum)

#     # Process batchNum sentences per chunk
#     chunks = [sentences[i:i + batchNum] for i in range(0, total_chars, batchNum)]
#     # Correct and print each chunk
#     err = []
#     for i, chunk in enumerate(chunks):
#         inputs = tokenizer(chunk, padding=True, return_tensors='pt').to(torch_device)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         for id, (logit_tensor, sentence) in enumerate(zip(outputs.logits, chunk)):
#             decode_tokens_new = tokenizer.decode(
#                 torch.argmax(logit_tensor, dim=-1), skip_special_tokens=True).split(' ')
#             decode_tokens_new = decode_tokens_new[:len(sentence)]
#             if len(decode_tokens_new) == len(sentence):
#                 probs = torch.max(torch.softmax(logit_tensor, dim=-1), dim=-1)[0].cpu().numpy()
#                 decode_str = ''
#                 for j in range(len(sentence)):
#                     # keep the model's token only when it is confident (prob >= 0.7)
#                     if probs[j + 1] >= 0.7:
#                         decode_str += decode_tokens_new[j]
#                     else:
#                         decode_str += sentence[j]
#                 corrected_text = decode_str
#             else:
#                 corrected_text = sentence
#             print(corrected_text)
#         # outputs = model(**tokenizer(chunk, padding=True, return_tensors='pt').to(torch_device))
#         # for ids, text in zip(outputs.logits, chunk):
#         #     _text = tokenizer.decode(torch.argmax(ids, dim=-1), skip_special_tokens=True).replace(' ', '')
#         #     corrected_text = _text[:len(text)]
#         #     corrected_text, details = get_errors(corrected_text, text)
#         #     print(text, ' => ', corrected_text, details)
#         #     result.append((corrected_text, details))
#         # for i, sent in enumerate(chunk):
#         #     decode_tokens = tokenizer.decode(outputs[i], skip_special_tokens=True).replace(' ', '')
#         #     corrected_sent = decode_tokens[:len(sent)]
#         #     print(corrected_sent)
#         #     corrected_sents.append(corrected_sent)
# from flask import Flask, request, jsonify
# import threading
# import time
# import re
# import math
# from macbert_corrector import MacBertCorrector
# m = MacBertCorrector("macbert4csc")
# app = Flask(__name__)

# # Create a lock object
# lock = threading.Lock()
# # Multithreaded, but requests are handled one at a time; extra requests queue up
# @app.route('/taskflow/checkDocumentError', methods=['POST'])
# def process_request():
#     with lock:
#         data = request.get_json()
#         # print("data", data)
#         # Extract the text data
#         text_data = data.get('data', {}).get('text', [])
#         # print(text_data)
#         # Process the text data: run batch correction; actual checking logic goes here
#         res = m.correct_batch(text_data)
#         # Example: simply print the received text
#         # Return the response
#         return jsonify({"status": "success", "data": res}), 200

# if __name__ == '__main__':
#     app.run(threaded=True, port=5001)
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import uvicorn
from fastapi.responses import JSONResponse
from pydantic import BaseModel
app = FastAPI()
from macbert_corrector import MacBertCorrector
m = MacBertCorrector("macbert4csc")

class RequestData(BaseModel):
    data: dict

@app.post("/taskflow/checkDocumentError")
async def process_request(request: RequestData):
    global m
    # Extract the text data
    text_data = request.data.get('text')
    # print(text_data)
    # Process the text data: run batch spelling correction over it
    res = m.correct_batch(text_data)

    # Return the response
    return JSONResponse(content={"status": "success", "data": res}, status_code=200)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=5001)
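
# Illustrative request sketch (an assumption, not part of the service): correct_batch
# appears to take a list of sentences, so "text" should be a list of strings; corrected
# results come back in "data". Assumes the server above is reachable at localhost:5001.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:5001/taskflow/checkDocumentError",
#       json={"data": {"text": ["待纠错的文本"]}},
#   )
#   print(resp.json())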