diff --git a/ruoyi-admin/pom.xml b/ruoyi-admin/pom.xml
index f3592cb..753e96a 100644
--- a/ruoyi-admin/pom.xml
+++ b/ruoyi-admin/pom.xml
@@ -198,12 +198,6 @@
-
- org.apache.pdfbox
- pdfbox
- 2.0.27
- test
-
diff --git a/ruoyi-admin/src/main/resources/application-dev.yml b/ruoyi-admin/src/main/resources/application-dev.yml
index 0638d2d..00042d9 100644
--- a/ruoyi-admin/src/main/resources/application-dev.yml
+++ b/ruoyi-admin/src/main/resources/application-dev.yml
@@ -52,7 +52,7 @@ spring:
# url: jdbc:mysql://localhost:3306/zaojia?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true
# username: root
# password: root
- url: jdbc:mysql://10.1.21.250:3306/sjjtable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true
+ url: jdbc:mysql://10.1.21.250:3306/aitable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true
# url: jdbc:mysql://218.0.1.42:53306/sjjtable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true
username: root
password: 'HXj-6nR|D8xy*h#!I&:('
diff --git a/ruoyi-admin/src/main/resources/application-test.yml b/ruoyi-admin/src/main/resources/application-test.yml
index 18cce78..1c2ff48 100644
--- a/ruoyi-admin/src/main/resources/application-test.yml
+++ b/ruoyi-admin/src/main/resources/application-test.yml
@@ -283,5 +283,5 @@ justauth:
chat:
# 聊天机器人配置
filePath: /guoYanXinXi/data/software/sjjapp/minio/data/sjj/
- tempfilePath: /guoYanXinXi/data/software/sjjapp/app/tempfile/
+ tempfilePath: /guoYanXinXi/data/software/sjjapp/app/tempfile
chatUrl: http://127.0.0.1:8081
diff --git a/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java b/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
deleted file mode 100644
index cccd676..0000000
--- a/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
+++ /dev/null
@@ -1,332 +0,0 @@
-package org.dromara.test;
-
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.text.TextPosition;
-import org.junit.jupiter.api.DisplayName;
-import org.junit.jupiter.api.Test;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.*;
-import java.util.logging.Logger;
-import java.util.stream.Collectors;
-
-/**
- * PDF段落提取测试
- */
-@DisplayName("PDF段落提取测试")
-public class PdfExtractorTest {
-
- private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName());
-
- // 段落最小字数阈值
- private static final int MIN_PARAGRAPH_LENGTH = 20;
- // 最大缩进值
- private static final float MAX_INDENT_X = 100f;
- // 容差范围
- private static final float TOLERANCE = 2f;
-
- @Test
- @DisplayName("测试PDF段落提取")
- public void testExtractParagraphs() {
- String pdfPath = "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf"; // 替换为实际PDF路径
- List paragraphs = extractParagraphsFromPdf(pdfPath);
-
- System.out.println("提取段落总数: " + paragraphs.size());
- for (int i = 0; i < paragraphs.size(); i++) {
- if(paragraphs.get(i).length() >= MIN_PARAGRAPH_LENGTH){
- System.out.println("段落" + (i + 1) + ": " + paragraphs.get(i).trim());
- }
- }
- }
-
- /**
- * 从PDF文件中提取段落,基于x坐标统计来判断段落
- */
- public List extractParagraphsFromPdf(String filePath) {
- List paragraphs = new ArrayList<>();
- File pdfFile = new File(filePath);
-
- try {
- // 打开PDF文档
- PDDocument document = PDDocument.load(pdfFile);
- int totalPages = document.getNumberOfPages();
-
- // 第一步:收集所有x坐标和重复文本
- List xCoordinates = new ArrayList<>();
- Map textFrequency = new HashMap<>(); // 记录文本出现频率
-
- // 遍历每一页收集X坐标
- for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
- final int currentPage = pageIndex; // 用于匿名类中引用
-
- // 为每页创建文本提取器
- TextBlockStripper stripper = new TextBlockStripper();
- stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
- stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
- stripper.setStartPage(pageIndex + 1);
- stripper.setEndPage(pageIndex + 1);
- stripper.setSortByPosition(true);
- stripper.getText(document);
-
- for (TextBlock block : stripper.getTextBlocks()) {
- String text = block.getText().trim();
- if (text.length() > 0) {
- // 统计X坐标
- if (block.getX() < MAX_INDENT_X) {
- xCoordinates.add(block.getX());
- }
-
- // 统计文本频率
- if (text.length() >= MIN_PARAGRAPH_LENGTH) {
- textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1);
- }
- }
- }
- }
-
- if (xCoordinates.isEmpty()) {
- document.close();
- return paragraphs;
- }
-
- // 找出频率超过页面数一半的文本
- int frequencyThreshold = totalPages / 2;
- Set frequentTexts = textFrequency.entrySet().stream()
- .filter(entry -> entry.getValue() > frequencyThreshold)
- .map(Map.Entry::getKey)
- .collect(Collectors.toSet());
-
- System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)");
- frequentTexts.forEach(text -> System.out.println("高频文本: " +
- (text.length() > 50 ? text.substring(0, 47) + "..." : text) +
- " 出现次数: " + textFrequency.get(text)));
-
- // 第二步:统计x坐标频率并找出前两名
- Map xCounter = xCoordinates.stream()
- .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
-
- List> mostCommonX = xCounter.entrySet().stream()
- .sorted(Map.Entry.comparingByValue().reversed())
- .limit(2)
- .collect(Collectors.toList());
-
- if (mostCommonX.size() < 2) {
- document.close();
- return paragraphs;
- }
-
- // 确保x_indent > x_normal
- float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
- float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
-
- System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE);
-
- // 第三步:根据基准x坐标提取段落
- List currentParagraph = new ArrayList<>();
- int num=311;
-
- // 逐页处理文本块
- for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
- List pageTextBlocks = new ArrayList<>();
-
- // 为每页创建文本提取器
- TextBlockStripper stripper = new TextBlockStripper();
- stripper.setSortByPosition(true);
- stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
- stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
- stripper.setStartPage(pageIndex + 1);
- stripper.setEndPage(pageIndex + 1);
- stripper.getText(document);
-
- // 获取当前页的文本块并排序
- pageTextBlocks.addAll(stripper.getTextBlocks());
- pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
-
- if(pageIndex==num){
- System.out.println(pageTextBlocks);
- }
-
- // 处理当前页的文本块
- for (TextBlock block : pageTextBlocks) {
- String lineText = block.getText().trim().replace('\n', ' ').trim();
- if (lineText.isEmpty()) {
- continue;
- }
-
- // 过滤高频文本
- if (frequentTexts.contains(lineText)) {
- if (pageIndex == num) {
- System.out.println("过滤高频文本: " +
- (lineText.length() > 30 ? lineText.substring(0, 27) + "..." : lineText));
- }
- continue;
- }
-
- float currentX = block.getX();
-
- // 判断当前x坐标属于哪种类型
- boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE;
- boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE;
-
- // 如果是缩进位置,说明是新段落的开始
- if (isIndent) {
- if (!currentParagraph.isEmpty()) {
- String paragraphText = String.join("", currentParagraph);
- if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
- paragraphs.add(paragraphText);
- }
- currentParagraph.clear();
- }
- if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
- currentParagraph.add(lineText);
- }
- }
- // 如果是正常位置,追加到当前段落
- else if (isNormal) {
- if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
- currentParagraph.add(lineText);
- } else {
- currentParagraph.add(lineText);
- }
- }
- // 如果既不是缩进也不是正常位置,作为独立段落
- else {
- // 如果独立段落字数满足要求进行统计,不满足要求跳过
- if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
- if (!currentParagraph.isEmpty()) {
- String paragraphText = String.join("", currentParagraph);
- if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
- paragraphs.add(paragraphText);
- }
- currentParagraph.clear();
- }
- paragraphs.add(lineText);
- }
- }
- }
- }
-
- // 处理最后一个段落
- if (!currentParagraph.isEmpty()) {
- String paragraphText = String.join("", currentParagraph);
- if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
- paragraphs.add(paragraphText);
- }
- }
-
- document.close();
-
- } catch (IOException e) {
- logger.severe("提取PDF段落失败: " + e.getMessage());
- e.printStackTrace();
- }
-
- return paragraphs;
- }
-
- /**
- * 用于提取文本块的PDFTextStripper
- */
- private static class TextBlockStripper extends PDFTextStripper {
- private final List textBlocks = new ArrayList<>();
- private float lastY = -1;
- private String currentLine = "";
- private float currentX = 0;
-
- public TextBlockStripper() throws IOException {
- super();
- // 初始化
- textBlocks.clear();
- lastY = -1;
- currentLine = "";
- currentX = 0;
- }
-
- @Override
- protected void processTextPosition(TextPosition text) {
- float textX = text.getXDirAdj();
- float textY = text.getYDirAdj();
- float endX = text.getEndX();
- float endY = text.getEndY();
- // 如果Y坐标变化超过一定阈值,认为是新行
- if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
- // 保存上一行
- if (!currentLine.trim().isEmpty()) {
- textBlocks.add(new TextBlock(currentX, lastY, currentLine));
- }
-
- // 开始新行
- currentLine = text.getUnicode();
- currentX = textX;
- lastY = textY;
- } else {
- // 在同一行,追加文本
- currentLine += text.getUnicode();
- }
-
- super.processTextPosition(text);
- }
-
- @Override
- protected void startPage(PDPage page) throws IOException {
- // 清空textBlocks列表,避免累积所有页面的内容
- textBlocks.clear();
- lastY = -1;
- currentLine = "";
- currentX = 0;
- super.startPage(page);
- }
-
- @Override
- public void endDocument(PDDocument document) throws IOException {
- // 保存最后一行
- if (!currentLine.trim().isEmpty()) {
- textBlocks.add(new TextBlock(currentX, lastY, currentLine));
- }
- super.endDocument(document);
- }
-
- public List getTextBlocks() {
- return textBlocks;
- }
- }
-
- /**
- * 用于存储文本块信息的类
- */
- private static class TextBlock {
- private final float x;
- private final float y;
- private final String text;
-
- public TextBlock(float x, float y, String text) {
- this.x = x;
- this.y = y;
- this.text = text;
- }
-
- public float getX() {
- return x;
- }
-
- public float getY() {
- return y;
- }
-
- public String getText() {
- return text;
- }
-
- @Override
- public String toString() {
- return "TextBlock{" +
- "x=" + x +
- ", y=" + y +
- ", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\'' +
- '}';
- }
- }
-}
diff --git a/zaojiaManagement/zaojia-productManagement/pom.xml b/zaojiaManagement/zaojia-productManagement/pom.xml
index c270600..dc0d034 100644
--- a/zaojiaManagement/zaojia-productManagement/pom.xml
+++ b/zaojiaManagement/zaojia-productManagement/pom.xml
@@ -109,13 +109,22 @@
flexmark-all
0.64.8
-
+
+ net.lingala.zip4j
+ zip4j
+ 2.11.5
+
org.xhtmlrenderer
flying-saucer-pdf
9.1.22
+
+ org.apache.pdfbox
+ pdfbox
+ 2.0.27
+
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java
index 1d012db..69482c5 100644
--- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java
@@ -1,5 +1,6 @@
package org.dromara.productManagement.controller;
+import java.io.IOException;
import java.util.List;
import lombok.RequiredArgsConstructor;
@@ -75,7 +76,7 @@ public class SjjDocumentTasksController extends BaseController {
@Log(title = "审计局标书任务", businessType = BusinessType.INSERT)
@RepeatSubmit()
@PostMapping()
- public R add(@Validated(AddGroup.class) @RequestBody SjjDocumentTasksBo bo) {
+ public R add(@Validated(AddGroup.class) @RequestBody SjjDocumentTasksBo bo) throws IOException {
return toAjax(sjjDocumentTasksService.insertByBo(bo));
}
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java
index 1b7983a..0b957d3 100644
--- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java
@@ -82,7 +82,7 @@ public class SjjDocumentTasks extends TenantEntity {
/**
* 投标文件对象存储ID
*/
- private String bidDocOssId;
+ private String bidDocZipOssId;
private String deleteFlag;
}
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java
index 2ae93ab..0385e81 100644
--- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java
@@ -36,7 +36,7 @@ public class SjjDocumentTasksBo extends BaseEntity {
* 投标文件名称
*/
private String bidDocumentName;
- private String bidDocOssId;
+ private String bidDocZipOssId;
private String tenderDocOssId;
/**
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java
index abb039f..fbd9a16 100644
--- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java
@@ -69,5 +69,5 @@ public class SjjDocumentTasksVo implements Serializable {
/**
* 投标文件对象存储ID
*/
- private String bidDocOssId;
+ private String bidDocZipOssId;
}
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java
index df9758c..af4816d 100644
--- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java
@@ -5,6 +5,7 @@ import org.dromara.productManagement.domain.bo.SjjDocumentTasksBo;
import org.dromara.common.mybatis.core.page.TableDataInfo;
import org.dromara.common.mybatis.core.page.PageQuery;
+import java.io.IOException;
import java.util.Collection;
import java.util.List;
@@ -47,7 +48,7 @@ public interface ISjjDocumentTasksService {
* @param bo 审计局标书任务
* @return 是否新增成功
*/
- Boolean insertByBo(SjjDocumentTasksBo bo);
+ Boolean insertByBo(SjjDocumentTasksBo bo) throws IOException;
/**
* 修改审计局标书任务
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java
index a4972b9..be88978 100644
--- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java
@@ -1,6 +1,8 @@
package org.dromara.productManagement.service.impl;
import cn.dev33.satoken.stp.StpUtil;
+import net.lingala.zip4j.ZipFile;
+import net.lingala.zip4j.model.FileHeader;
import okhttp3.*;
import org.dromara.common.core.domain.model.LoginUser;
import org.dromara.common.core.utils.MapstructUtils;
@@ -23,12 +25,18 @@ import org.dromara.productManagement.domain.vo.SjjDocumentTasksVo;
import org.dromara.productManagement.domain.SjjDocumentTasks;
import org.dromara.productManagement.mapper.SjjDocumentTasksMapper;
import org.dromara.productManagement.service.ISjjDocumentTasksService;
+import org.dromara.productManagement.utils.PdfParserUtils;
-import java.io.IOException;
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Collection;
+import java.util.zip.ZipException;
/**
* 审计局标书任务Service业务层处理
@@ -110,18 +118,69 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
* @return 是否新增成功
*/
@Override
- public Boolean insertByBo(SjjDocumentTasksBo bo) {
+ public Boolean insertByBo(SjjDocumentTasksBo bo) throws IOException {
SjjDocumentTasks add = MapstructUtils.convert(bo, SjjDocumentTasks.class);
- String bidDocOssId = add.getBidDocOssId();
+ String bidDocZipOssId = add.getBidDocZipOssId();
+ String tenderDocPath="";
+ String tenderDocName="";
+ SysOssVo bidZipFileInfo = ossService.getById(Long.valueOf(bidDocZipOssId));
+ String bidZipName = bidZipFileInfo.getOriginalName();
+ String bidZipNameWithoutExt = bidZipName;
+ if (bidZipName.lastIndexOf(".") > 0) {
+ bidZipNameWithoutExt = bidZipName.substring(0, bidZipName.lastIndexOf("."));
+ }
+ String bidZipPath = fileRootPath + bidZipFileInfo.getFileName();
String tenderDocOssId = add.getTenderDocOssId();
- SysOssVo bidFileInfo = ossService.getById(Long.valueOf(bidDocOssId));
- String bidDocName = bidFileInfo.getOriginalName();
- String bidDocPath = fileRootPath + bidFileInfo.getFileName();
- SysOssVo tenderFileInfo = ossService.getById(Long.valueOf(tenderDocOssId));
- String tenderDocName = tenderFileInfo.getOriginalName();
- String tenderDocPath = fileRootPath + tenderFileInfo.getFileName();
- add.setBidDocumentName(bidDocName);
- add.setTenderDocumentName(tenderDocName);
+ if(StringUtils.isNotBlank(tenderDocOssId)){
+ SysOssVo tenderFileInfo = ossService.getById(Long.valueOf(tenderDocOssId));
+ tenderDocName = tenderFileInfo.getOriginalName();
+ tenderDocPath = fileRootPath + tenderFileInfo.getFileName();
+ add.setTenderDocumentName(tenderDocName);
+ }
+ add.setBidDocumentName(bidZipName);
+ // 创建唯一文件夹
+ String uniqueFolderName = "task_" + System.currentTimeMillis() + "_" + Math.abs(bidZipName.hashCode());
+ String taskFolder = tempfilePath + File.separator + uniqueFolderName;
+ File taskFolderDir = new File(taskFolder);
+ if (!taskFolderDir.exists()) {
+ taskFolderDir.mkdirs();
+ }
+
+ // 创建四个子文件夹
+ String bidOriginalDir = taskFolder + File.separator + "bid_original"; // 投标文件解压后的原始文件
+ String bidTxtDir = taskFolder + File.separator + "bid_txt"; // 投标文件解析后的TXT文件
+ String tenderOriginalDir = taskFolder + File.separator + "tender_original"; // 招标文件原始文件
+ String tenderTxtDir = taskFolder + File.separator + "tender_txt"; // 招标文件解析后的TXT文件
+ bidOriginalDir =bidOriginalDir+ File.separator + bidZipNameWithoutExt;
+
+ // 创建子文件夹
+ new File(bidOriginalDir).mkdirs();
+ new File(bidTxtDir).mkdirs();
+ new File(tenderOriginalDir).mkdirs();
+ new File(tenderTxtDir).mkdirs();
+ // 处理投标文件压缩包
+ processZipFile(bidZipPath, bidOriginalDir, bidTxtDir);
+
+ // 复制招标文件到任务文件夹
+ File tenderDoc = new File(tenderDocPath);
+ if (tenderDoc.exists()) {
+ // 复制招标文件到招标文件原始目录
+ File tenderDocCopy = new File(tenderOriginalDir, tenderDocName);
+ try (FileInputStream fis = new FileInputStream(tenderDoc);
+ FileOutputStream fos = new FileOutputStream(tenderDocCopy)) {
+ byte[] buffer = new byte[1024];
+ int length;
+ while ((length = fis.read(buffer)) > 0) {
+ fos.write(buffer, 0, length);
+ }
+ }
+
+ // 如果是PDF文件,解析其内容到招标文件TXT目录
+ if (tenderDocName.toLowerCase().endsWith(".pdf") && PdfParserUtils.isValidPdf(tenderDocCopy.getAbsolutePath())) {
+ processAndSavePdfContent(tenderDocCopy, tenderTxtDir, getSystemCharset());
+ }
+ }
+
add.setProgressStatus("PENDING");
validEntityBeforeSave(add);
boolean flag = baseMapper.insert(add) > 0;
@@ -137,13 +196,10 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
throw new IllegalArgumentException("无效的任务名称: " + add.getTaskName());
}
-// Request request = new Request.Builder()
-// .url(url+"?userId="+ LoginHelper.getUserId()+"&taskId="+taskId+"&filename="+filename+"&taskName="+taskName+"&priority="+priority)
-// .build();
HttpUrl.Builder urlBuilder = HttpUrl.parse(chatUrl +"/back/taskStart").newBuilder();
urlBuilder.addQueryParameter("userId", String.valueOf(LoginHelper.getUserId()));
urlBuilder.addQueryParameter("taskId", String.valueOf(add.getId()));
- urlBuilder.addQueryParameter("filename", bidDocPath+"\n"+tenderDocPath);
+ urlBuilder.addQueryParameter("filename", bidOriginalDir+"\n"+tenderOriginalDir);
urlBuilder.addQueryParameter("taskName", add.getTaskName());
urlBuilder.addQueryParameter("priority", "1");
Request request = new Request.Builder()
@@ -166,6 +222,303 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
return flag;
}
+    /**
+     * Extracts a ZIP archive into {@code originalDir}, then converts every PDF found there to TXT.
+     *
+     * <p>Entries are first extracted one by one under the detected charset; if that pass fails as a
+     * whole, the archive is extracted in one go with the platform charset instead.
+     *
+     * @param zipFilePath path of the ZIP archive
+     * @param originalDir directory receiving the extracted files
+     * @param txtDir      directory receiving the TXT files produced from parsed PDFs
+     * @throws IOException  if the whole-archive fallback fails as well
+     * @throws ZipException declared for callers; this body itself only throws IOException
+     */
+    private void processZipFile(String zipFilePath, String originalDir, String txtDir) throws IOException, ZipException {
+        // Make sure the extraction target exists.
+        File extractDirFile = new File(originalDir);
+        if (!extractDirFile.exists()) {
+            extractDirFile.mkdirs();
+        }
+
+        // Pick the charset under which the entry names read best.
+        Charset bestCharset = detectBestCharset(zipFilePath);
+
+        // try-with-resources: the archive handle is released even when extraction fails.
+        try (ZipFile zipFile = new ZipFile(zipFilePath)) {
+            zipFile.setCharset(bestCharset);
+
+            // Extract entry by entry so that one bad entry does not abort the rest.
+            List<FileHeader> fileHeaders = zipFile.getFileHeaders();
+            for (FileHeader fileHeader : fileHeaders) {
+                // Directory entries carry no data of their own.
+                if (fileHeader.isDirectory()) {
+                    continue;
+                }
+
+                try {
+                    // Flatten the entry: only its last path segment becomes the on-disk name.
+                    String fileName = new File(fileHeader.getFileName()).getName();
+                    zipFile.extractFile(fileHeader, originalDir, fileName);
+                } catch (Exception e) {
+                    // Retry this entry with the platform charset on a separate, self-closing handle.
+                    try (ZipFile fallbackZipFile = new ZipFile(zipFilePath)) {
+                        fallbackZipFile.setCharset(getSystemCharset());
+                        fallbackZipFile.extractFile(fileHeader.getFileName(), originalDir);
+                    } catch (Exception fallbackEx) {
+                        System.err.println("解压文件失败: " + fileHeader.getFileName() + ", 错误: " + fallbackEx.getMessage());
+                    }
+                }
+            }
+
+            // Walk the extracted tree and convert every PDF.
+            processAllPdfFiles(extractDirFile, txtDir, bestCharset);
+        } catch (Exception e) {
+            // The per-entry pass failed as a whole: extract everything with the platform charset.
+            try (ZipFile zipFile = new ZipFile(zipFilePath)) {
+                zipFile.setCharset(getSystemCharset());
+                zipFile.extractAll(originalDir);
+                processAllPdfFiles(extractDirFile, txtDir, getSystemCharset());
+            } catch (Exception e2) {
+                // Report and propagate, keeping the original failure as the cause.
+                System.err.println("解压失败: " + e2.getMessage());
+                throw new IOException("解压失败", e2);
+            }
+        }
+    }
+
+    /**
+     * Picks the charset under which the entry names of a ZIP archive score highest.
+     * Candidates are tried in a fixed order and an earlier candidate wins a tie.
+     */
+    private Charset detectBestCharset(String zipFilePath) {
+        // Candidate order matters because only a strictly higher score replaces the current pick.
+        Charset[] candidates = {
+            Charset.forName("GB18030"),
+            Charset.forName("GBK"),
+            StandardCharsets.UTF_8,
+            getSystemCharset()
+        };
+
+        // The platform charset stays selected when no candidate scores above -1.
+        Charset winner = getSystemCharset();
+        int topScore = -1;
+        try {
+            for (int idx = 0; idx < candidates.length; idx++) {
+                int candidateScore = evaluateCharsetForZip(zipFilePath, candidates[idx]);
+                if (candidateScore <= topScore) {
+                    continue;
+                }
+                topScore = candidateScore;
+                winner = candidates[idx];
+            }
+        } catch (Exception e) {
+            System.err.println("检测字符集时出错: " + e.getMessage());
+        }
+
+        System.out.println("为ZIP文件选择的最佳字符集: " + winner.name());
+        return winner;
+    }
+
+    /**
+     * Scores how well a charset decodes the entry names of a ZIP archive.
+     *
+     * @return the summed per-name score, or -1 when the archive cannot be read with this charset
+     */
+    private int evaluateCharsetForZip(String zipFilePath, Charset charset) {
+        int score = 0;
+
+        // try-with-resources: this method runs once per candidate charset, so each handle
+        // must be released instead of being left for the garbage collector.
+        try (ZipFile zipFile = new ZipFile(zipFilePath)) {
+            zipFile.setCharset(charset);
+            List<FileHeader> fileHeaders = zipFile.getFileHeaders();
+            for (FileHeader fileHeader : fileHeaders) {
+                if (fileHeader.isDirectory()) {
+                    continue;
+                }
+                score += evaluateString(fileHeader.getFileName(), charset);
+            }
+        } catch (Exception e) {
+            return -1;
+        }
+
+        return score;
+    }
+
+    /**
+     * Scores how readable a decoded name looks.
+     * Rewards a lossless round trip and Chinese characters; penalises replacement characters,
+     * surrogate code units and a high share of special characters.
+     */
+    private int evaluateString(String str, Charset charset) {
+        int score = 0;
+
+        try {
+            // Encode and decode again; an unchanged result means nothing was lost.
+            String roundTripped = new String(str.getBytes(charset), charset);
+            if (roundTripped.equals(str)) {
+                score += 10;
+            }
+
+            // One point off per U+FFFD, two per surrogate code unit.
+            int replacementChars = countCharactersInRange(str, 0xFFFD, 0xFFFD);
+            int surrogateUnits = countCharactersInRange(str, 0xD800, 0xDFFF);
+            score = score - replacementChars - surrogateUnits * 2;
+
+            // Five points off when more than a third of the characters are special.
+            boolean mostlySpecial = countSpecialCharacters(str) > str.length() / 3;
+            if (mostlySpecial) {
+                score -= 5;
+            }
+
+            // Five points on when at least one Chinese character is present.
+            if (countChineseCharacters(str) > 0) {
+                score += 5;
+            }
+        } catch (Exception e) {
+            // Any failure along the way costs ten points on top of what was tallied so far.
+            score -= 10;
+        }
+
+        return score;
+    }
+
+    /**
+     * Counts the UTF-16 code units of {@code str} whose value lies in {@code [start, end]}.
+     */
+    private int countCharactersInRange(String str, int start, int end) {
+        int matches = 0;
+        for (char unit : str.toCharArray()) {
+            boolean outside = unit < start || unit > end;
+            if (!outside) {
+                matches++;
+            }
+        }
+        return matches;
+    }
+
+    /**
+     * Counts the characters that are neither a letter, a digit nor common punctuation.
+     */
+    private int countSpecialCharacters(String str) {
+        int special = 0;
+        for (char ch : str.toCharArray()) {
+            if (Character.isLetterOrDigit(ch) || isCommonPunctuation(ch)) {
+                continue;
+            }
+            special++;
+        }
+        return special;
+    }
+
+    /**
+     * Tells whether {@code c} is one of the punctuation, separator or space characters
+     * listed in the accepted set below.
+     */
+    private boolean isCommonPunctuation(char c) {
+        // Accepted set: . , ; : ! ? ( ) [ ] { } _ - space, slash and backslash.
+        String accepted = ".,;:!?()[]{}_- /\\";
+        return accepted.indexOf(c) >= 0;
+    }
+
+    /**
+     * Counts the Chinese characters in a string.
+     *
+     * @param str text to inspect
+     * @return number of characters for which {@code isChinese} returns true
+     */
+    private int countChineseCharacters(String str) {
+        int total = 0;
+        for (char ch : str.toCharArray()) {
+            total += isChinese(ch) ? 1 : 0;
+        }
+        return total;
+    }
+
+    /**
+     * Tells whether {@code c} is a CJK unified (incl. Extension A) or compatibility ideograph.
+     */
+    private boolean isChinese(char c) {
+        Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
+        boolean unified = block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
+            || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A;
+        return unified || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS;
+    }
+
+    /**
+     * Walks a directory tree and converts every valid PDF in it to a TXT file.
+     *
+     * @param directory    root of the tree to walk; anything that is not a directory is ignored
+     * @param txtOutputDir directory receiving the TXT files
+     * @param charset      charset passed on for writing the TXT files
+     */
+    private void processAllPdfFiles(File directory, String txtOutputDir, Charset charset) {
+        // A null listing (not a directory, or listFiles() returned null) ends the walk here.
+        File[] children = directory.isDirectory() ? directory.listFiles() : null;
+        if (children == null) {
+            return;
+        }
+
+        for (File child : children) {
+            if (child.isDirectory()) {
+                // Descend into sub-directories.
+                processAllPdfFiles(child, txtOutputDir, charset);
+                continue;
+            }
+
+            boolean namedPdf = child.getName().toLowerCase().endsWith(".pdf");
+            if (namedPdf && PdfParserUtils.isValidPdf(child.getAbsolutePath())) {
+                processAndSavePdfContent(child, txtOutputDir, charset);
+            }
+        }
+    }
+
+    /** Returns the charset assumed for the host operating system. */
+    private Charset getSystemCharset() {
+        // GBK when the lower-cased OS name contains "win", UTF-8 for everything else.
+        boolean onWindows = System.getProperty("os.name").toLowerCase().contains("win");
+        if (onWindows) {
+            return Charset.forName("GBK");
+        }
+        return StandardCharsets.UTF_8;
+    }
+
+    /**
+     * Extracts the paragraphs of a PDF and writes them to a TXT file named after it.
+     * Best effort: a failure is reported on stderr and the file is skipped.
+     *
+     * @param pdfFile   PDF file to parse
+     * @param outputDir directory receiving the TXT file
+     * @param charset   charset used to encode the TXT file
+     */
+    private void processAndSavePdfContent(File pdfFile, String outputDir, Charset charset) {
+        try {
+            // Paragraph extraction is delegated to PdfParserUtils.
+            List<String> paragraphs = PdfParserUtils.extractParagraphs(pdfFile.getAbsolutePath());
+            if (paragraphs.isEmpty()) {
+                return;
+            }
+
+            // (?i): callers accept the extension in any case, so ".PDF" must be replaced too.
+            String txtFileName = pdfFile.getName().replaceAll("(?i)\\.pdf$", ".txt");
+            File txtFile = new File(outputDir, txtFileName);
+
+            // Each paragraph is followed by an empty line.
+            try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+                new FileOutputStream(txtFile), charset))) {
+                for (String paragraph : paragraphs) {
+                    writer.write(paragraph);
+                    writer.newLine();
+                    writer.newLine();
+                }
+            }
+
+        } catch (Exception e) {
+            // Keep the batch going, but leave a trace instead of failing silently.
+            System.err.println("解析PDF失败: " + pdfFile + ", 错误: " + e.getMessage());
+        }
+    }
/**
* 修改审计局标书任务
*
@@ -205,8 +558,14 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
public Boolean ossRemoveById(List ids, Boolean b) {
List sjjDocumentTasksVos = baseMapper.selectVoByIds(ids);
for (SjjDocumentTasksVo sjjDocumentTasksVo : sjjDocumentTasksVos) {
- ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getTenderDocOssId())), true);
- ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getBidDocOssId())), true);
+ if (sjjDocumentTasksVo.getTenderDocOssId() != null ) {
+ ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getTenderDocOssId())), true);
+
+ }
+ if (sjjDocumentTasksVo.getBidDocZipOssId() != null ) {
+ ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getBidDocZipOssId())), true);
+
+ }
SjjDocumentTasks convert = MapstructUtils.convert(sjjDocumentTasksVo, SjjDocumentTasks.class);
convert.setDeleteFlag("Y");
baseMapper.updateById(convert);
@@ -214,3 +573,4 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
return true;
}
}
+
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
index 274b7fa..47184b1 100644
--- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
@@ -11,7 +11,10 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
+import java.util.stream.IntStream;
/**
* PDF解析工具类
@@ -29,6 +32,9 @@ import java.util.stream.Collectors;
public class PdfParserUtils {
private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class);
+
+ // 是否打印警告日志
+ private static boolean enableWarningLogs = false;
// 默认段落最小字数阈值
private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20;
@@ -36,7 +42,18 @@ public class PdfParserUtils {
private static final float DEFAULT_MAX_INDENT_X = 100f;
// 默认容差范围
private static final float DEFAULT_TOLERANCE = 2f;
+ // 默认线程池大小
+ private static final int DEFAULT_THREAD_POOL_SIZE = 32;
+    /**
+     * Sets the flag that controls whether warning logs are printed.
+     *
+     * @param enable {@code true} to turn the flag on, {@code false} to turn it off
+     */
+    public static void setEnableWarningLogs(boolean enable) {
+        PdfParserUtils.enableWarningLogs = enable;
+    }
+
/**
* 从PDF文件中提取段落
*
@@ -57,6 +74,20 @@ public class PdfParserUtils {
* @return 提取的段落列表
*/
public static List extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) {
+ return extractParagraphs(filePath, minParagraphLength, maxIndentX, tolerance, DEFAULT_THREAD_POOL_SIZE);
+ }
+
+ /**
+ * 从PDF文件中提取段落,支持自定义参数和线程池大小
+ *
+ * @param filePath PDF文件路径
+ * @param minParagraphLength 最小段落长度
+ * @param maxIndentX 最大缩进值
+ * @param tolerance 容差范围
+ * @param threadPoolSize 线程池大小
+ * @return 提取的段落列表
+ */
+ public static List extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance, int threadPoolSize) {
List paragraphs = new ArrayList<>();
File pdfFile = new File(filePath);
@@ -65,73 +96,102 @@ public class PdfParserUtils {
return paragraphs;
}
+ ExecutorService executor = null;
try {
// 设置PDFBox选项,抑制字体警告
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
-
+ // 关闭PDFBox内部的警告日志
+ java.util.logging.Logger.getLogger("org.apache.pdfbox").setLevel(java.util.logging.Level.SEVERE);
+ // 关闭FontBox相关的警告
+ java.util.logging.Logger.getLogger("org.apache.fontbox").setLevel(java.util.logging.Level.SEVERE);
+ // 禁用PDFBox字体警告的另一种方式
+ System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
+ // 设置PDF处理相关配置
+ System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
+ // 忽略字体缺失警告
+ System.setProperty("org.apache.pdfbox.fontcache", "none");
+
// 打开PDF文档
PDDocument document = PDDocument.load(pdfFile);
-
+
// 设置PDFBox参数,提高对中文字体的兼容性
document.setResourceCache(null); // 禁用资源缓存,可能减少某些字体问题
-
+
int totalPages = document.getNumberOfPages();
+ // Create the worker pool with at least one thread: newFixedThreadPool rejects a size <= 0 (empty PDF or bad argument)
+ executor = Executors.newFixedThreadPool(Math.max(1, Math.min(threadPoolSize, totalPages)));
+
// 第一步:收集所有x坐标和重复文本
- List xCoordinates = new ArrayList<>();
- Map textFrequency = new HashMap<>(); // 记录文本出现频率
-
- log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages);
-
- // 遍历每一页收集X坐标
- for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
- try {
- // 为每页创建文本提取器
- TextBlockStripper stripper = new TextBlockStripper();
- stripper.setSortByPosition(true);
- stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
- stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
- stripper.setStartPage(pageIndex + 1);
- stripper.setEndPage(pageIndex + 1);
- stripper.getText(document);
-
- for (TextBlock block : stripper.getTextBlocks()) {
- String text = block.getText().trim();
- if (text.length() > 0) {
- // 统计X坐标
- if (block.getX() < maxIndentX) {
- xCoordinates.add(block.getX());
- }
-
- // 统计文本频率
- if (text.length() >= minParagraphLength) {
- textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1);
+ ConcurrentLinkedQueue xCoordinates = new ConcurrentLinkedQueue<>();
+ ConcurrentLinkedQueue endXCoordinates = new ConcurrentLinkedQueue<>();
+ ConcurrentHashMap textFrequency = new ConcurrentHashMap<>(); // 记录文本出现频率
+
+ log.info("开始多线程解析PDF文件: {}, 总页数: {}, 线程池大小: {}", filePath, totalPages, threadPoolSize);
+
+ // 创建任务列表,每个任务处理一页
+ List> tasks = IntStream.range(0, totalPages)
+ .mapToObj(pageIndex -> (Callable) () -> {
+ try {
+ // 为每页创建文本提取器
+ TextBlockStripper stripper = new TextBlockStripper();
+ stripper.setSortByPosition(true);
+ stripper.setSuppressDuplicateOverlappingText(false);
+ stripper.setAddMoreFormatting(false);
+ stripper.setStartPage(pageIndex + 1);
+ stripper.setEndPage(pageIndex + 1);
+ stripper.getText(document);
+
+ for (TextBlock block : stripper.getTextBlocks()) {
+ String text = block.getText().trim();
+ if (text.length() > 0) {
+ // 统计X坐标
+ if (block.getX() < maxIndentX) {
+ xCoordinates.add(block.getX());
+ }
+ // 统计终点X坐标
+ endXCoordinates.add(block.getEndX());
+
+ // 统计文本频率(线程安全方式)
+ if (text.length() >= minParagraphLength) {
+ textFrequency.computeIfAbsent(text, k -> new AtomicInteger(0)).incrementAndGet();
+ }
}
}
+ } catch (Exception e) {
+ if (enableWarningLogs) {
+ log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage());
+ }
}
- } catch (Exception e) {
- log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage());
- // 继续处理下一页,而不是中断整个过程
- }
- }
+ return null;
+ })
+ .collect(Collectors.toList());
- if (xCoordinates.isEmpty()) {
- log.warn("未找到有效的X坐标,无法提取段落");
+ // 提交所有任务并等待完成
+ executor.invokeAll(tasks);
+
+ if (xCoordinates.isEmpty() || endXCoordinates.isEmpty()) {
+ if (enableWarningLogs) {
+ log.warn("未找到有效的X坐标,无法提取段落");
+ }
document.close();
+ if (executor != null) {
+ executor.shutdown();
+ }
return paragraphs;
}
// 找出频率超过页面数一半的文本(通常是页眉页脚等重复内容)
int frequencyThreshold = totalPages / 2;
Set frequentTexts = textFrequency.entrySet().stream()
- .filter(entry -> entry.getValue() > frequencyThreshold)
+ .filter(entry -> entry.getValue().get() > frequencyThreshold)
.map(Map.Entry::getKey)
.collect(Collectors.toSet());
log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold);
// 统计x坐标频率并找出前两名(通常是正常段落和首行缩进)
- Map xCounter = xCoordinates.stream()
+ Map xCounter = new ArrayList<>(xCoordinates).stream()
.collect(Collectors.groupingBy(x -> x, Collectors.counting()));
List> mostCommonX = xCounter.entrySet().stream()
@@ -139,120 +199,206 @@ public class PdfParserUtils {
.limit(2)
.collect(Collectors.toList());
- if (mostCommonX.size() < 2) {
- log.warn("未找到足够的X坐标特征,无法区分段落缩进");
+ // 统计终点x坐标频率并找出最常见的值
+ Map endXCounter = new ArrayList<>(endXCoordinates).stream()
+ .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
+
+ List> mostCommonEndX = endXCounter.entrySet().stream()
+ .sorted(Map.Entry.comparingByValue().reversed())
+ .limit(1)
+ .collect(Collectors.toList());
+
+ if (mostCommonX.size() < 2 || mostCommonEndX.isEmpty()) {
+ if (enableWarningLogs) {
+ log.warn("未找到足够的X坐标特征,无法区分段落缩进");
+ }
document.close();
+ if (executor != null) {
+ executor.shutdown();
+ }
return paragraphs;
}
// 确保x_indent > x_normal
float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
+ float commonEndX = mostCommonEndX.get(0).getKey(); // 最常见的终点x坐标
- log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance);
-
- // 根据基准x坐标提取段落
- List currentParagraph = new ArrayList<>();
-
- // 逐页处理文本块
- for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
- try {
- List pageTextBlocks = new ArrayList<>();
-
- // 为每页创建文本提取器
- TextBlockStripper stripper = new TextBlockStripper();
- stripper.setSortByPosition(true);
- stripper.setSuppressDuplicateOverlappingText(false);
- stripper.setAddMoreFormatting(false);
- stripper.setStartPage(pageIndex + 1);
- stripper.setEndPage(pageIndex + 1);
- stripper.getText(document);
-
- // 获取当前页的文本块并排序
- pageTextBlocks.addAll(stripper.getTextBlocks());
- pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
-
- // 处理当前页的文本块
- for (TextBlock block : pageTextBlocks) {
- String lineText = block.getText().trim().replace('\n', ' ').trim();
- if (lineText.isEmpty()) {
- continue;
- }
-
- // 过滤高频文本
- if (frequentTexts.contains(lineText)) {
- continue;
- }
-
- float currentX = block.getX();
-
- // 判断当前x坐标属于哪种类型
- boolean isIndent = Math.abs(currentX - xIndent) <= tolerance;
- boolean isNormal = Math.abs(currentX - xNormal) <= tolerance;
-
- // 如果是缩进位置,说明是新段落的开始
- if (isIndent) {
- if (!currentParagraph.isEmpty()) {
- String paragraphText = String.join("", currentParagraph);
- if (paragraphText.length() >= minParagraphLength) {
- paragraphs.add(paragraphText.trim());
- }
- currentParagraph.clear();
+ log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}, commonEndX={}", xNormal, xIndent, tolerance, commonEndX);
+
+ // 使用ConcurrentMap存储每页的段落
+ ConcurrentHashMap> pageParagraphs = new ConcurrentHashMap<>();
+
+ // 创建段落提取任务
+ List> paragraphTasks = IntStream.range(0, totalPages)
+ .mapToObj(pageIndex -> (Callable) () -> {
+ try {
+ List pageResult = new ArrayList<>();
+ List pageTextBlocks = new ArrayList<>();
+
+ // 为每页创建文本提取器
+ TextBlockStripper stripper = new TextBlockStripper();
+ stripper.setSortByPosition(true);
+ stripper.setSuppressDuplicateOverlappingText(false);
+ stripper.setAddMoreFormatting(false);
+ stripper.setStartPage(pageIndex + 1);
+ stripper.setEndPage(pageIndex + 1);
+ stripper.getText(document);
+
+ // 获取当前页的文本块并排序
+ pageTextBlocks.addAll(stripper.getTextBlocks());
+ pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
+
+ // 每页独立处理段落
+ List currentParagraph = new ArrayList<>();
+ boolean isInParagraph = false;
+
+ // 处理当前页的文本块
+ for (int i = 0; i < pageTextBlocks.size(); i++) {
+ TextBlock block = pageTextBlocks.get(i);
+ String lineText = block.getText().trim().replace('\n', ' ').trim();
+ if (lineText.isEmpty()) {
+ continue;
}
- if (lineText.length() >= minParagraphLength) {
- currentParagraph.add(lineText);
+
+ // 过滤高频文本
+ if (frequentTexts.contains(lineText)) {
+ continue;
}
- }
- // 如果是正常位置,追加到当前段落
- else if (isNormal) {
- if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
- currentParagraph.add(lineText);
- } else {
- currentParagraph.add(lineText);
+
+ float currentX = block.getX();
+ float currentEndX = block.getEndX();
+
+ // 判断当前x坐标属于哪种类型
+ boolean isIndent = Math.abs(currentX - xIndent) <= tolerance;
+ boolean isNormal = Math.abs(currentX - xNormal) <= tolerance;
+
+ // 如果是缩进位置,说明是新段落的开始
+ if (isIndent) {
+ if (!currentParagraph.isEmpty()) {
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= minParagraphLength) {
+ pageResult.add(paragraphText.trim());
+ }
+ currentParagraph.clear();
+ }
+ if (lineText.length() >= minParagraphLength) {
+ currentParagraph.add(lineText);
+ isInParagraph = true;
+ }
}
- }
- // 如果既不是缩进也不是正常位置,作为独立段落
- else {
- // 如果独立段落字数满足要求进行统计,不满足要求跳过
- if (lineText.length() >= minParagraphLength) {
+ // 如果是正常位置
+ else if (isNormal) {
+ if (!isInParagraph) {
+ // 检查是否是段落的开始(终点x在最大值范围内)
+ if (currentEndX <= commonEndX * 0.95) { // 使用95%作为阈值
+ isInParagraph = true;
+ currentParagraph.add(lineText);
+ }
+ } else {
+ // 检查是否应该结束当前段落
+ if (currentEndX > commonEndX * 0.95) {
+ // 当前行结束,检查下一行
+ if (i + 1 < pageTextBlocks.size()) {
+ TextBlock nextBlock = pageTextBlocks.get(i + 1);
+ float nextX = nextBlock.getX();
+ boolean nextIsNormal = Math.abs(nextX - xNormal) <= tolerance;
+
+ if (!nextIsNormal) { // the next line leaves the normal column, so the paragraph ends here
+ currentParagraph.add(lineText); // the current line is the paragraph's last line: store it before closing
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= minParagraphLength) {
+ pageResult.add(paragraphText.trim());
+ }
+ currentParagraph.clear();
+ isInParagraph = false;
+ } else {
+ // 下一行是正常位置,继续当前段落
+ currentParagraph.add(lineText);
+ }
+ } else {
+ // 已经是最后一行,结束当前段落
+ currentParagraph.add(lineText);
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= minParagraphLength) {
+ pageResult.add(paragraphText.trim());
+ }
+ currentParagraph.clear();
+ isInParagraph = false;
+ }
+ } else {
+ // 继续当前段落
+ currentParagraph.add(lineText);
+ }
+ }
+ }
+ // 如果既不是缩进也不是正常位置,作为独立段落
+ else {
if (!currentParagraph.isEmpty()) {
String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= minParagraphLength) {
- paragraphs.add(paragraphText.trim());
+ pageResult.add(paragraphText.trim());
}
currentParagraph.clear();
+ isInParagraph = false;
+ }
+ if (lineText.length() >= minParagraphLength) {
+ pageResult.add(lineText.trim());
}
- paragraphs.add(lineText.trim());
}
}
+
+ // 处理最后一个段落
+ if (!currentParagraph.isEmpty()) {
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= minParagraphLength) {
+ pageResult.add(paragraphText.trim());
+ }
+ }
+
+ // 保存当前页的结果
+ pageParagraphs.put(pageIndex, pageResult);
+
+ } catch (Exception e) {
+ if (enableWarningLogs) {
+ log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage());
+ }
}
- } catch (Exception e) {
- log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage());
- // 继续处理下一页
- }
- }
-
- // 处理最后一个段落
- if (!currentParagraph.isEmpty()) {
- String paragraphText = String.join("", currentParagraph);
- if (paragraphText.length() >= minParagraphLength) {
- paragraphs.add(paragraphText.trim());
+ return null;
+ })
+ .collect(Collectors.toList());
+
+ // 提交所有段落提取任务并等待完成
+ executor.invokeAll(paragraphTasks);
+
+ // 按页码顺序合并所有段落
+ for (int i = 0; i < totalPages; i++) {
+ List pageParagraphList = pageParagraphs.get(i);
+ if (pageParagraphList != null) {
+ paragraphs.addAll(pageParagraphList);
}
}
-
+
document.close();
- log.info("PDF解析完成,提取段落数: {}", paragraphs.size());
-
+ log.info("PDF多线程解析完成,提取段落数: {}", paragraphs.size());
+
} catch (IOException e) {
log.error("提取PDF段落失败: {}", e.getMessage(), e);
+ } catch (InterruptedException e) {
+ log.error("多线程处理被中断: {}", e.getMessage(), e);
+ Thread.currentThread().interrupt();
+ } finally {
+ if (executor != null) {
+ executor.shutdown();
+ }
}
-
+
return paragraphs;
}
/**
* 检查PDF文件是否可被有效解析
- *
+ *
* @param filePath PDF文件路径
* @return 是否可解析
*/
@@ -261,14 +407,16 @@ public class PdfParserUtils {
if (!pdfFile.exists() || !pdfFile.isFile()) {
return false;
}
-
+
try {
PDDocument document = PDDocument.load(pdfFile);
int pageCount = document.getNumberOfPages();
document.close();
return pageCount > 0;
} catch (Exception e) {
- log.error("检查PDF有效性时出错: {}", e.getMessage());
+ if (enableWarningLogs) {
+ log.error("检查PDF有效性时出错: {}", e.getMessage());
+ }
return false;
}
}
@@ -281,7 +429,9 @@ public class PdfParserUtils {
private float lastY = -1;
private String currentLine = "";
private float currentX = 0;
+ // Right edge (x + width) of the most recently processed glyph, i.e. the end X of the line being built
+ private float currentEndX = 0;
-
+
public TextBlockStripper() throws IOException {
super();
// 初始化
@@ -290,20 +440,20 @@ public class PdfParserUtils {
currentLine = "";
currentX = 0;
}
-
+
@Override
protected void processTextPosition(TextPosition text) {
try {
float textX = text.getXDirAdj();
float textY = text.getYDirAdj();
-
+
// 如果Y坐标变化超过一定阈值,认为是新行
if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
// 保存上一行
if (!currentLine.trim().isEmpty()) {
- textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+ textBlocks.add(new TextBlock(currentX, currentEndX, lastY, currentLine));
}
-
+
// 开始新行
currentLine = text.getUnicode();
currentX = textX;
@@ -314,12 +464,16 @@ public class PdfParserUtils {
}
+ // Remember where this glyph ends so the finished line is stored with its own end X, not the next line's start
+ currentEndX = textX + text.getWidthDirAdj();
} catch (Exception e) {
// 忽略单个字符处理错误,继续处理其他字符
- log.debug("处理文本位置时出错: {}", e.getMessage());
+ if (enableWarningLogs) {
+ log.debug("处理文本位置时出错: {}", e.getMessage());
+ }
}
-
+
super.processTextPosition(text);
}
-
+
@Override
protected void startPage(PDPage page) throws IOException {
// 清空textBlocks列表,避免累积所有页面的内容
@@ -329,16 +483,16 @@ public class PdfParserUtils {
currentX = 0;
super.startPage(page);
}
-
+
@Override
public void endDocument(PDDocument document) throws IOException {
// 保存最后一行
if (!currentLine.trim().isEmpty()) {
- textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+ textBlocks.add(new TextBlock(currentX, currentEndX, lastY, currentLine));
}
super.endDocument(document);
}
-
+
public List getTextBlocks() {
return textBlocks;
}
@@ -349,23 +503,29 @@ public class PdfParserUtils {
*/
private static class TextBlock {
private final float x;
+ private final float endX;
private final float y;
private final String text;
-
- public TextBlock(float x, float y, String text) {
+
+ public TextBlock(float x, float endX, float y, String text) {
this.x = x;
+ this.endX = endX;
this.y = y;
this.text = text;
}
-
+
public float getX() {
return x;
}
-
+
+ public float getEndX() {
+ return endX;
+ }
+
public float getY() {
return y;
}
-
+
public String getText() {
return text;
}