Browse Source

sjj 功能优化,java 解析pdf

sjj_dev
zhouhaibin 2 days ago
parent
commit
e7f940fba4
  1. 6
      ruoyi-admin/pom.xml
  2. 2
      ruoyi-admin/src/main/resources/application-dev.yml
  3. 2
      ruoyi-admin/src/main/resources/application-test.yml
  4. 332
      ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
  5. 11
      zaojiaManagement/zaojia-productManagement/pom.xml
  6. 3
      zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java
  7. 2
      zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java
  8. 2
      zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java
  9. 2
      zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java
  10. 3
      zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java
  11. 388
      zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java
  12. 230
      zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java

6
ruoyi-admin/pom.xml

@ -198,12 +198,6 @@
<!-- </dependency>--> <!-- </dependency>-->
<!-- PDFBox 依赖 --> <!-- PDFBox 依赖 -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.27</version>
<scope>test</scope>
</dependency>
</dependencies> </dependencies>

2
ruoyi-admin/src/main/resources/application-dev.yml

@ -52,7 +52,7 @@ spring:
# url: jdbc:mysql://localhost:3306/zaojia?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true # url: jdbc:mysql://localhost:3306/zaojia?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true
# username: root # username: root
# password: root # password: root
url: jdbc:mysql://10.1.21.250:3306/sjjtable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true url: jdbc:mysql://10.1.21.250:3306/aitable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true
# url: jdbc:mysql://218.0.1.42:53306/sjjtable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true # url: jdbc:mysql://218.0.1.42:53306/sjjtable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true
username: root username: root
password: 'HXj-6nR|D8xy*h#!I&:(' password: 'HXj-6nR|D8xy*h#!I&:('

2
ruoyi-admin/src/main/resources/application-test.yml

@ -283,5 +283,5 @@ justauth:
chat: chat:
# 聊天机器人配置 # 聊天机器人配置
filePath: /guoYanXinXi/data/software/sjjapp/minio/data/sjj/ filePath: /guoYanXinXi/data/software/sjjapp/minio/data/sjj/
tempfilePath: /guoYanXinXi/data/software/sjjapp/app/tempfile/ tempfilePath: /guoYanXinXi/data/software/sjjapp/app/tempfile
chatUrl: http://127.0.0.1:8081 chatUrl: http://127.0.0.1:8081

332
ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java

@ -1,332 +0,0 @@
package org.dromara.test;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.logging.Logger;
import java.util.stream.Collectors;
/**
 * Exploratory test for paragraph extraction from a PDF file.
 *
 * <p>The extraction is a heuristic based on the x coordinate at which each text line starts:
 * the two most frequent line-start x values (below {@link #MAX_INDENT_X}) are taken as the
 * "normal" left margin and the "indented" first-line margin. A line starting at the indented
 * margin opens a new paragraph, a line starting at the normal margin continues the current one,
 * and any other sufficiently long line is emitted as a paragraph of its own. Lines whose text
 * repeats on more than half of the pages are dropped (presumably headers/footers).
 */
@DisplayName("PDF段落提取测试")
public class PdfExtractorTest {
    private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName());

    /** Minimum number of characters a paragraph must have to be kept. */
    private static final int MIN_PARAGRAPH_LENGTH = 20;

    /** Line starts at or beyond this x value are ignored when detecting the margins. */
    private static final float MAX_INDENT_X = 100f;

    /** Allowed deviation when comparing a line start with a detected margin. */
    private static final float TOLERANCE = 2f;

    /** Zero-based index of the page whose blocks are dumped to stdout for debugging. */
    private static final int DEBUG_PAGE_INDEX = 311;

    /** System property that overrides the PDF used by {@link #testExtractParagraphs()}. */
    private static final String PDF_PATH_PROPERTY = "pdf.test.path";

    /** PDF used when {@link #PDF_PATH_PROPERTY} is not set (developer-local sample file). */
    private static final String DEFAULT_PDF_PATH = "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf";

    /**
     * Extracts the paragraphs of the configured PDF and prints them.
     *
     * <p>If the file cannot be read the extraction returns an empty list, so this test only
     * prints a count of zero in that case.
     */
    @Test
    @DisplayName("测试PDF段落提取")
    public void testExtractParagraphs() {
        String pdfPath = System.getProperty(PDF_PATH_PROPERTY, DEFAULT_PDF_PATH);
        List<String> paragraphs = extractParagraphsFromPdf(pdfPath);
        System.out.println("提取段落总数: " + paragraphs.size());
        for (int i = 0; i < paragraphs.size(); i++) {
            String paragraph = paragraphs.get(i);
            if (paragraph.length() >= MIN_PARAGRAPH_LENGTH) {
                System.out.println("段落" + (i + 1) + ": " + paragraph.trim());
            }
        }
    }

    /**
     * Extracts paragraphs from a PDF file using line-start x coordinate statistics.
     *
     * @param filePath path of the PDF file
     * @return the extracted paragraphs; empty when the file cannot be read, when no line starts
     *         left of {@link #MAX_INDENT_X}, or when fewer than two distinct margins are found
     */
    public List<String> extractParagraphsFromPdf(String filePath) {
        List<String> paragraphs = new ArrayList<>();
        File pdfFile = new File(filePath);
        // try-with-resources closes the document on every exit path, including exceptions.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            int totalPages = document.getNumberOfPages();

            // Pass 1: collect line-start x coordinates and count repeated line texts.
            List<Float> xCoordinates = new ArrayList<>();
            Map<String, Integer> textFrequency = new HashMap<>();
            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
                for (TextBlock block : readPageBlocks(document, pageIndex)) {
                    String text = block.getText().trim();
                    if (text.isEmpty()) {
                        continue;
                    }
                    if (block.getX() < MAX_INDENT_X) {
                        xCoordinates.add(block.getX());
                    }
                    if (text.length() >= MIN_PARAGRAPH_LENGTH) {
                        textFrequency.merge(text, 1, Integer::sum);
                    }
                }
            }
            if (xCoordinates.isEmpty()) {
                return paragraphs;
            }

            // Texts that occur on more than half of the pages are filtered out later.
            int frequencyThreshold = totalPages / 2;
            Set<String> frequentTexts = textFrequency.entrySet().stream()
                .filter(entry -> entry.getValue() > frequencyThreshold)
                .map(Map.Entry::getKey)
                .collect(Collectors.toSet());
            System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)");
            frequentTexts.forEach(text -> System.out.println("高频文本: " +
                abbreviate(text, 50) +
                " 出现次数: " + textFrequency.get(text)));

            // Pass 2 preparation: the two most frequent x values are the two margins.
            Map<Float, Long> xCounter = xCoordinates.stream()
                .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
            List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream()
                .sorted(Map.Entry.<Float, Long>comparingByValue().reversed())
                .limit(2)
                .collect(Collectors.toList());
            if (mostCommonX.size() < 2) {
                return paragraphs;
            }
            // The smaller x is the normal margin, the larger one the first-line indent.
            float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey());
            float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey());
            System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE);

            // Pass 2: assemble paragraphs page by page; a paragraph may span pages.
            List<String> currentParagraph = new ArrayList<>();
            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
                List<TextBlock> pageTextBlocks = new ArrayList<>(readPageBlocks(document, pageIndex));
                pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
                boolean debugPage = pageIndex == DEBUG_PAGE_INDEX;
                if (debugPage) {
                    System.out.println(pageTextBlocks);
                }
                for (TextBlock block : pageTextBlocks) {
                    String lineText = block.getText().trim().replace('\n', ' ').trim();
                    if (lineText.isEmpty()) {
                        continue;
                    }
                    if (frequentTexts.contains(lineText)) {
                        if (debugPage) {
                            System.out.println("过滤高频文本: " + abbreviate(lineText, 30));
                        }
                        continue;
                    }
                    float currentX = block.getX();
                    boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE;
                    boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE;
                    if (isIndent) {
                        // An indented line starts a new paragraph; short first lines are dropped.
                        flushParagraph(currentParagraph, paragraphs);
                        if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
                            currentParagraph.add(lineText);
                        }
                    } else if (isNormal) {
                        // A line at the normal margin continues (or implicitly opens) a paragraph.
                        currentParagraph.add(lineText);
                    } else if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
                        // Any other long enough line is a stand-alone paragraph; shorter ones are skipped.
                        flushParagraph(currentParagraph, paragraphs);
                        paragraphs.add(lineText);
                    }
                }
            }
            flushParagraph(currentParagraph, paragraphs);
        } catch (IOException e) {
            logger.log(java.util.logging.Level.SEVERE, "提取PDF段落失败: " + e.getMessage(), e);
        }
        return paragraphs;
    }

    /**
     * Runs a fresh {@link TextBlockStripper} over a single page and returns its line blocks.
     *
     * @param document  open PDF document
     * @param pageIndex zero-based page index
     * @return the line blocks collected for that page
     * @throws IOException if PDFBox fails to process the page
     */
    private static List<TextBlock> readPageBlocks(PDDocument document, int pageIndex) throws IOException {
        TextBlockStripper stripper = new TextBlockStripper();
        stripper.setSortByPosition(true);
        // Duplicate-overlap suppression is disabled because it may drop Chinese text.
        stripper.setSuppressDuplicateOverlappingText(false);
        stripper.setAddMoreFormatting(false);
        stripper.setStartPage(pageIndex + 1);
        stripper.setEndPage(pageIndex + 1);
        stripper.getText(document);
        return stripper.getTextBlocks();
    }

    /**
     * Joins the pending lines into one paragraph, stores it when long enough, and clears the buffer.
     *
     * @param pendingLines lines collected for the current paragraph; cleared by this call
     * @param target       list receiving the finished paragraph
     */
    private static void flushParagraph(List<String> pendingLines, List<String> target) {
        if (pendingLines.isEmpty()) {
            return;
        }
        String paragraphText = String.join("", pendingLines);
        if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
            target.add(paragraphText);
        }
        pendingLines.clear();
    }

    /**
     * Shortens a text to {@code maxLength - 3} characters followed by "..." when it exceeds
     * {@code maxLength}; otherwise returns it unchanged.
     */
    private static String abbreviate(String text, int maxLength) {
        return text.length() > maxLength ? text.substring(0, maxLength - 3) + "..." : text;
    }

    /**
     * {@link PDFTextStripper} that groups glyphs into lines and records where each line starts.
     */
    private static class TextBlockStripper extends PDFTextStripper {
        private final List<TextBlock> textBlocks = new ArrayList<>();
        /** Y coordinate of the line being built; -1 means no line has been started yet. */
        private float lastY = -1;
        private String currentLine = "";
        private float currentX = 0;

        public TextBlockStripper() throws IOException {
            super();
        }

        @Override
        protected void processTextPosition(TextPosition text) {
            float textX = text.getXDirAdj();
            float textY = text.getYDirAdj();
            // A vertical jump of more than half the font size is treated as a new line.
            if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
                if (!currentLine.trim().isEmpty()) {
                    textBlocks.add(new TextBlock(currentX, lastY, currentLine));
                }
                currentLine = text.getUnicode();
                currentX = textX;
                lastY = textY;
            } else {
                currentLine += text.getUnicode();
            }
            super.processTextPosition(text);
        }

        @Override
        protected void startPage(PDPage page) throws IOException {
            // Reset the state so blocks from earlier pages do not accumulate.
            textBlocks.clear();
            lastY = -1;
            currentLine = "";
            currentX = 0;
            super.startPage(page);
        }

        @Override
        public void endDocument(PDDocument document) throws IOException {
            // The last line has no following glyph to trigger the flush, so store it here.
            if (!currentLine.trim().isEmpty()) {
                textBlocks.add(new TextBlock(currentX, lastY, currentLine));
            }
            super.endDocument(document);
        }

        public List<TextBlock> getTextBlocks() {
            return textBlocks;
        }
    }

    /**
     * Immutable holder for one text line: the coordinates of its first glyph and its text.
     */
    private static class TextBlock {
        private final float x;
        private final float y;
        private final String text;

        public TextBlock(float x, float y, String text) {
            this.x = x;
            this.y = y;
            this.text = text;
        }

        public float getX() {
            return x;
        }

        public float getY() {
            return y;
        }

        public String getText() {
            return text;
        }

        @Override
        public String toString() {
            return "TextBlock{" +
                "x=" + x +
                ", y=" + y +
                ", text='" + abbreviate(text, 30) + '\'' +
                '}';
        }
    }
}

11
zaojiaManagement/zaojia-productManagement/pom.xml

@ -109,13 +109,22 @@
<artifactId>flexmark-all</artifactId> <artifactId>flexmark-all</artifactId>
<version>0.64.8</version> <version>0.64.8</version>
</dependency> </dependency>
<dependency>
<groupId>net.lingala.zip4j</groupId>
<artifactId>zip4j</artifactId>
<version>2.11.5</version>
</dependency>
<!-- HTML to PDF --> <!-- HTML to PDF -->
<dependency> <dependency>
<groupId>org.xhtmlrenderer</groupId> <groupId>org.xhtmlrenderer</groupId>
<artifactId>flying-saucer-pdf</artifactId> <artifactId>flying-saucer-pdf</artifactId>
<version>9.1.22</version> <version>9.1.22</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.27</version>
</dependency>
</dependencies> </dependencies>

3
zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java

@ -1,5 +1,6 @@
package org.dromara.productManagement.controller; package org.dromara.productManagement.controller;
import java.io.IOException;
import java.util.List; import java.util.List;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -75,7 +76,7 @@ public class SjjDocumentTasksController extends BaseController {
@Log(title = "审计局标书任务", businessType = BusinessType.INSERT) @Log(title = "审计局标书任务", businessType = BusinessType.INSERT)
@RepeatSubmit() @RepeatSubmit()
@PostMapping() @PostMapping()
public R<Void> add(@Validated(AddGroup.class) @RequestBody SjjDocumentTasksBo bo) { public R<Void> add(@Validated(AddGroup.class) @RequestBody SjjDocumentTasksBo bo) throws IOException {
return toAjax(sjjDocumentTasksService.insertByBo(bo)); return toAjax(sjjDocumentTasksService.insertByBo(bo));
} }

2
zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java

@ -82,7 +82,7 @@ public class SjjDocumentTasks extends TenantEntity {
/** /**
* 投标文件对象存储ID * 投标文件对象存储ID
*/ */
private String bidDocOssId; private String bidDocZipOssId;
private String deleteFlag; private String deleteFlag;
} }

2
zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java

@ -36,7 +36,7 @@ public class SjjDocumentTasksBo extends BaseEntity {
* 投标文件名称 * 投标文件名称
*/ */
private String bidDocumentName; private String bidDocumentName;
private String bidDocOssId; private String bidDocZipOssId;
private String tenderDocOssId; private String tenderDocOssId;
/** /**

2
zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java

@ -69,5 +69,5 @@ public class SjjDocumentTasksVo implements Serializable {
/** /**
* 投标文件对象存储ID * 投标文件对象存储ID
*/ */
private String bidDocOssId; private String bidDocZipOssId;
} }

3
zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java

@ -5,6 +5,7 @@ import org.dromara.productManagement.domain.bo.SjjDocumentTasksBo;
import org.dromara.common.mybatis.core.page.TableDataInfo; import org.dromara.common.mybatis.core.page.TableDataInfo;
import org.dromara.common.mybatis.core.page.PageQuery; import org.dromara.common.mybatis.core.page.PageQuery;
import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
@ -47,7 +48,7 @@ public interface ISjjDocumentTasksService {
* @param bo 审计局标书任务 * @param bo 审计局标书任务
* @return 是否新增成功 * @return 是否新增成功
*/ */
Boolean insertByBo(SjjDocumentTasksBo bo); Boolean insertByBo(SjjDocumentTasksBo bo) throws IOException;
/** /**
* 修改审计局标书任务 * 修改审计局标书任务

388
zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java

@ -1,6 +1,8 @@
package org.dromara.productManagement.service.impl; package org.dromara.productManagement.service.impl;
import cn.dev33.satoken.stp.StpUtil; import cn.dev33.satoken.stp.StpUtil;
import net.lingala.zip4j.ZipFile;
import net.lingala.zip4j.model.FileHeader;
import okhttp3.*; import okhttp3.*;
import org.dromara.common.core.domain.model.LoginUser; import org.dromara.common.core.domain.model.LoginUser;
import org.dromara.common.core.utils.MapstructUtils; import org.dromara.common.core.utils.MapstructUtils;
@ -23,12 +25,18 @@ import org.dromara.productManagement.domain.vo.SjjDocumentTasksVo;
import org.dromara.productManagement.domain.SjjDocumentTasks; import org.dromara.productManagement.domain.SjjDocumentTasks;
import org.dromara.productManagement.mapper.SjjDocumentTasksMapper; import org.dromara.productManagement.mapper.SjjDocumentTasksMapper;
import org.dromara.productManagement.service.ISjjDocumentTasksService; import org.dromara.productManagement.service.ISjjDocumentTasksService;
import org.dromara.productManagement.utils.PdfParserUtils;
import java.io.IOException; import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Collection; import java.util.Collection;
import java.util.zip.ZipException;
/** /**
* 审计局标书任务Service业务层处理 * 审计局标书任务Service业务层处理
@ -110,18 +118,69 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
* @return 是否新增成功 * @return 是否新增成功
*/ */
@Override @Override
public Boolean insertByBo(SjjDocumentTasksBo bo) { public Boolean insertByBo(SjjDocumentTasksBo bo) throws IOException {
SjjDocumentTasks add = MapstructUtils.convert(bo, SjjDocumentTasks.class); SjjDocumentTasks add = MapstructUtils.convert(bo, SjjDocumentTasks.class);
String bidDocOssId = add.getBidDocOssId(); String bidDocZipOssId = add.getBidDocZipOssId();
String tenderDocPath="";
String tenderDocName="";
SysOssVo bidZipFileInfo = ossService.getById(Long.valueOf(bidDocZipOssId));
String bidZipName = bidZipFileInfo.getOriginalName();
String bidZipNameWithoutExt = bidZipName;
if (bidZipName.lastIndexOf(".") > 0) {
bidZipNameWithoutExt = bidZipName.substring(0, bidZipName.lastIndexOf("."));
}
String bidZipPath = fileRootPath + bidZipFileInfo.getFileName();
String tenderDocOssId = add.getTenderDocOssId(); String tenderDocOssId = add.getTenderDocOssId();
SysOssVo bidFileInfo = ossService.getById(Long.valueOf(bidDocOssId)); if(StringUtils.isNotBlank(tenderDocOssId)){
String bidDocName = bidFileInfo.getOriginalName();
String bidDocPath = fileRootPath + bidFileInfo.getFileName();
SysOssVo tenderFileInfo = ossService.getById(Long.valueOf(tenderDocOssId)); SysOssVo tenderFileInfo = ossService.getById(Long.valueOf(tenderDocOssId));
String tenderDocName = tenderFileInfo.getOriginalName(); tenderDocName = tenderFileInfo.getOriginalName();
String tenderDocPath = fileRootPath + tenderFileInfo.getFileName(); tenderDocPath = fileRootPath + tenderFileInfo.getFileName();
add.setBidDocumentName(bidDocName);
add.setTenderDocumentName(tenderDocName); add.setTenderDocumentName(tenderDocName);
}
add.setBidDocumentName(bidZipName);
// 创建唯一文件夹
String uniqueFolderName = "task_" + System.currentTimeMillis() + "_" + Math.abs(bidZipName.hashCode());
String taskFolder = tempfilePath + File.separator + uniqueFolderName;
File taskFolderDir = new File(taskFolder);
if (!taskFolderDir.exists()) {
taskFolderDir.mkdirs();
}
// 创建四个子文件夹
String bidOriginalDir = taskFolder + File.separator + "bid_original"; // 投标文件解压后的原始文件
String bidTxtDir = taskFolder + File.separator + "bid_txt"; // 投标文件解析后的TXT文件
String tenderOriginalDir = taskFolder + File.separator + "tender_original"; // 招标文件原始文件
String tenderTxtDir = taskFolder + File.separator + "tender_txt"; // 招标文件解析后的TXT文件
bidOriginalDir =bidOriginalDir+ File.separator + bidZipNameWithoutExt;
// 创建子文件夹
new File(bidOriginalDir).mkdirs();
new File(bidTxtDir).mkdirs();
new File(tenderOriginalDir).mkdirs();
new File(tenderTxtDir).mkdirs();
// 处理投标文件压缩包
processZipFile(bidZipPath, bidOriginalDir, bidTxtDir);
// 复制招标文件到任务文件夹
File tenderDoc = new File(tenderDocPath);
if (tenderDoc.exists()) {
// 复制招标文件到招标文件原始目录
File tenderDocCopy = new File(tenderOriginalDir, tenderDocName);
try (FileInputStream fis = new FileInputStream(tenderDoc);
FileOutputStream fos = new FileOutputStream(tenderDocCopy)) {
byte[] buffer = new byte[1024];
int length;
while ((length = fis.read(buffer)) > 0) {
fos.write(buffer, 0, length);
}
}
// 如果是PDF文件,解析其内容到招标文件TXT目录
if (tenderDocName.toLowerCase().endsWith(".pdf") && PdfParserUtils.isValidPdf(tenderDocCopy.getAbsolutePath())) {
processAndSavePdfContent(tenderDocCopy, tenderTxtDir, getSystemCharset());
}
}
add.setProgressStatus("PENDING"); add.setProgressStatus("PENDING");
validEntityBeforeSave(add); validEntityBeforeSave(add);
boolean flag = baseMapper.insert(add) > 0; boolean flag = baseMapper.insert(add) > 0;
@ -137,13 +196,10 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
throw new IllegalArgumentException("无效的任务名称: " + add.getTaskName()); throw new IllegalArgumentException("无效的任务名称: " + add.getTaskName());
} }
// Request request = new Request.Builder()
// .url(url+"?userId="+ LoginHelper.getUserId()+"&taskId="+taskId+"&filename="+filename+"&taskName="+taskName+"&priority="+priority)
// .build();
HttpUrl.Builder urlBuilder = HttpUrl.parse(chatUrl +"/back/taskStart").newBuilder(); HttpUrl.Builder urlBuilder = HttpUrl.parse(chatUrl +"/back/taskStart").newBuilder();
urlBuilder.addQueryParameter("userId", String.valueOf(LoginHelper.getUserId())); urlBuilder.addQueryParameter("userId", String.valueOf(LoginHelper.getUserId()));
urlBuilder.addQueryParameter("taskId", String.valueOf(add.getId())); urlBuilder.addQueryParameter("taskId", String.valueOf(add.getId()));
urlBuilder.addQueryParameter("filename", bidDocPath+"\n"+tenderDocPath); urlBuilder.addQueryParameter("filename", bidOriginalDir+"\n"+tenderOriginalDir);
urlBuilder.addQueryParameter("taskName", add.getTaskName()); urlBuilder.addQueryParameter("taskName", add.getTaskName());
urlBuilder.addQueryParameter("priority", "1"); urlBuilder.addQueryParameter("priority", "1");
Request request = new Request.Builder() Request request = new Request.Builder()
@ -166,6 +222,303 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
return flag; return flag;
} }
/**
 * Extracts a ZIP archive and converts the extracted PDF files to text.
 *
 * <p>Each non-directory entry is first extracted into {@code originalDir} under its bare file
 * name (the entry's directory part is dropped), with entry names decoded using the charset
 * chosen by {@code detectBestCharset}. If that fails for an entry, the entry is retried by
 * name with the charset from {@code getSystemCharset()}; a failed retry is only reported on
 * stderr. If the per-entry pass as a whole fails, the archive is extracted in one go with the
 * fallback charset. After extraction, the PDFs under {@code originalDir} are processed.
 *
 * @param zipFilePath path of the ZIP file
 * @param originalDir directory receiving the extracted files; created if missing
 * @param txtDir      directory receiving the TXT files produced from the PDFs
 * @throws IOException  if the whole-archive fallback extraction fails as well
 * @throws ZipException declared for callers; not thrown directly by this method
 */
private void processZipFile(String zipFilePath, String originalDir, String txtDir) throws IOException, ZipException {
    File extractDirFile = new File(originalDir);
    if (!extractDirFile.exists()) {
        extractDirFile.mkdirs();
    }
    Charset bestCharset = detectBestCharset(zipFilePath);
    // Every ZipFile is opened in try-with-resources so it is closed on all paths.
    try (ZipFile zipFile = new ZipFile(zipFilePath)) {
        zipFile.setCharset(bestCharset);
        List<FileHeader> fileHeaders = zipFile.getFileHeaders();
        for (FileHeader fileHeader : fileHeaders) {
            if (fileHeader.isDirectory()) {
                continue;
            }
            try {
                // Keep only the file name so the entry lands directly in originalDir.
                String fileName = new File(fileHeader.getFileName()).getName();
                zipFile.extractFile(fileHeader, originalDir, fileName);
            } catch (Exception e) {
                // Retry this entry with the fallback charset.
                try (ZipFile fallbackZipFile = new ZipFile(zipFilePath)) {
                    fallbackZipFile.setCharset(getSystemCharset());
                    fallbackZipFile.extractFile(fileHeader.getFileName(), originalDir);
                } catch (Exception fallbackEx) {
                    System.err.println("解压文件失败: " + fileHeader.getFileName() + ", 错误: " + fallbackEx.getMessage());
                }
            }
        }
        processAllPdfFiles(extractDirFile, txtDir, bestCharset);
    } catch (Exception e) {
        // The per-entry pass failed; fall back to extracting the whole archive.
        try (ZipFile zipFile = new ZipFile(zipFilePath)) {
            zipFile.setCharset(getSystemCharset());
            zipFile.extractAll(originalDir);
            processAllPdfFiles(extractDirFile, txtDir, getSystemCharset());
        } catch (Exception e2) {
            System.err.println("解压失败: " + e2.getMessage());
            // Keep the first failure attached so the root cause is not lost.
            e2.addSuppressed(e);
            throw new IOException("解压失败", e2);
        }
    }
}
/**
 * Picks the charset that decodes the ZIP entry names best.
 *
 * <p>Each candidate (GB18030, GBK, UTF-8 and the charset from {@code getSystemCharset()}) is
 * scored by {@code evaluateCharsetForZip}; the first candidate with the strictly highest
 * score wins. If no candidate scores above -1, the charset from {@code getSystemCharset()}
 * is returned.
 *
 * @param zipFilePath path of the ZIP file
 * @return the selected charset
 */
private Charset detectBestCharset(String zipFilePath) {
    // Candidates in priority order: earlier ones win ties.
    Charset[] candidates = {
        Charset.forName("GB18030"),
        Charset.forName("GBK"),
        StandardCharsets.UTF_8,
        getSystemCharset()
    };
    Charset winner = getSystemCharset();
    int topScore = -1;
    try {
        for (int i = 0; i < candidates.length; i++) {
            Charset candidate = candidates[i];
            int candidateScore = evaluateCharsetForZip(zipFilePath, candidate);
            if (candidateScore <= topScore) {
                continue;
            }
            topScore = candidateScore;
            winner = candidate;
        }
    } catch (Exception e) {
        System.err.println("检测字符集时出错: " + e.getMessage());
    }
    System.out.println("为ZIP文件选择的最佳字符集: " + winner.name());
    return winner;
}
/**
 * Scores how well a charset decodes the entry names of a ZIP file.
 *
 * <p>The score is the sum of {@code evaluateString} over the names of all non-directory
 * entries; a higher score means a better fit.
 *
 * @param zipFilePath path of the ZIP file
 * @param charset     charset used to decode the entry names
 * @return the accumulated score, or -1 if the archive cannot be read with this charset
 */
private int evaluateCharsetForZip(String zipFilePath, Charset charset) {
    int score = 0;
    // try-with-resources makes sure the archive handle is released after scoring.
    try (ZipFile zipFile = new ZipFile(zipFilePath)) {
        zipFile.setCharset(charset);
        for (FileHeader fileHeader : zipFile.getFileHeaders()) {
            if (fileHeader.isDirectory()) {
                continue;
            }
            score += evaluateString(fileHeader.getFileName(), charset);
        }
    } catch (Exception e) {
        // Any failure while reading the archive disqualifies this charset.
        return -1;
    }
    return score;
}
/**
 * Scores the readability of a decoded string.
 *
 * <p>Scoring: +10 if the string survives an encode/decode round trip in {@code charset};
 * -1 per U+FFFD replacement character; -2 per char in the surrogate range U+D800..U+DFFF;
 * -5 if more than a third of the chars are "special" (see {@code countSpecialCharacters});
 * +5 if at least one Chinese character is present; -10 if any step throws.
 *
 * @param str     string to evaluate
 * @param charset charset used for the round-trip check
 * @return the readability score
 */
private int evaluateString(String str, Charset charset) {
    int total = 0;
    try {
        // Round trip: encode with the charset and decode again, then compare.
        String roundTripped = new String(str.getBytes(charset), charset);
        if (roundTripped.equals(str)) {
            total += 10;
        }

        // Penalise typical mojibake markers.
        int replacementCount = countCharactersInRange(str, 0xFFFD, 0xFFFD);
        total -= replacementCount;
        int surrogateCount = countCharactersInRange(str, 0xD800, 0xDFFF);
        total -= surrogateCount * 2;

        // Penalise a high share of special characters.
        boolean tooManySpecials = countSpecialCharacters(str) > str.length() / 3;
        if (tooManySpecials) {
            total -= 5;
        }

        // Reward the presence of Chinese characters.
        boolean hasChinese = countChineseCharacters(str) > 0;
        if (hasChinese) {
            total += 5;
        }
    } catch (Exception e) {
        total -= 10;
    }
    return total;
}
/**
 * Counts the UTF-16 chars of a string whose value lies in {@code [start, end]} (inclusive).
 *
 * @param str   string to scan
 * @param start lowest char value counted
 * @param end   highest char value counted
 * @return number of matching chars
 */
private int countCharactersInRange(String str, int start, int end) {
    return (int) str.chars()
        .filter(unit -> start <= unit && unit <= end)
        .count();
}
/**
 * Counts the chars of a string that are neither letters, digits, nor common punctuation
 * (as defined by {@code isCommonPunctuation}).
 *
 * @param str string to scan
 * @return number of special chars
 */
private int countSpecialCharacters(String str) {
    int specials = 0;
    for (char ch : str.toCharArray()) {
        if (Character.isLetterOrDigit(ch) || isCommonPunctuation(ch)) {
            continue;
        }
        specials++;
    }
    return specials;
}
/**
 * Tells whether a char is one of the punctuation marks treated as ordinary:
 * {@code . , ; : ! ? ( ) [ ] { } _ -}, space, {@code /} and backslash.
 *
 * @param c char to test
 * @return {@code true} if {@code c} is in the list above
 */
private boolean isCommonPunctuation(char c) {
    final String commonMarks = ".,;:!?()[]{}_- /\\";
    return commonMarks.indexOf(c) >= 0;
}
/**
 * Counts the chars of a string that {@code isChinese} classifies as Chinese.
 *
 * @param str string to scan
 * @return number of Chinese chars
 */
private int countChineseCharacters(String str) {
    int chinese = 0;
    for (char ch : str.toCharArray()) {
        chinese += isChinese(ch) ? 1 : 0;
    }
    return chinese;
}
/**
 * Tells whether a char belongs to one of the Unicode blocks CJK Unified Ideographs,
 * CJK Compatibility Ideographs, or CJK Unified Ideographs Extension A.
 *
 * @param c char to test
 * @return {@code true} if {@code c} is in one of those blocks
 */
private boolean isChinese(char c) {
    Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
    if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
        return true;
    }
    if (block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS) {
        return true;
    }
    return block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A;
}
/**
 * Walks a directory tree and converts every valid PDF file found to a TXT file.
 *
 * <p>A file is handled when its name ends with ".pdf" (case-insensitive) and
 * {@code PdfParserUtils.isValidPdf} accepts it. All TXT files go into the same output
 * directory, regardless of the sub-directory the PDF was found in.
 *
 * @param directory    directory to scan; anything that is not a directory is ignored
 * @param txtOutputDir directory receiving the TXT files
 * @param charset      charset used to write the TXT files
 */
private void processAllPdfFiles(File directory, String txtOutputDir, Charset charset) {
    File[] children = directory.isDirectory() ? directory.listFiles() : null;
    if (children == null) {
        return;
    }
    for (File child : children) {
        if (child.isDirectory()) {
            // Descend into sub-directories.
            processAllPdfFiles(child, txtOutputDir, charset);
            continue;
        }
        boolean namedPdf = child.getName().toLowerCase().endsWith(".pdf");
        if (namedPdf && PdfParserUtils.isValidPdf(child.getAbsolutePath())) {
            processAndSavePdfContent(child, txtOutputDir, charset);
        }
    }
}
/**
 * Returns the charset assumed for the current operating system.
 *
 * <p>This is a fixed mapping, not the JVM default charset: GBK when the {@code os.name}
 * system property contains "win", UTF-8 otherwise. The OS name is lower-cased with
 * {@code Locale.ROOT} so the check does not depend on the default locale (under a Turkish
 * locale, for example, "WINDOWS" would otherwise lower-case to a string without "win").
 *
 * @return GBK on Windows, UTF-8 on every other OS
 */
private Charset getSystemCharset() {
    String osName = System.getProperty("os.name").toLowerCase(java.util.Locale.ROOT);
    return osName.contains("win") ? Charset.forName("GBK") : StandardCharsets.UTF_8;
}
/**
 * Extracts the paragraphs of a PDF file and writes them to a TXT file.
 *
 * <p>The TXT file is placed in {@code outputDir} and named after the PDF with its ".pdf"
 * extension (matched case-insensitively, so ".PDF" works too) replaced by ".txt". Each
 * paragraph is followed by a blank line. Nothing is written when no paragraph is extracted.
 * Processing is best-effort: a failure is reported on stderr and does not propagate, so one
 * broken PDF does not abort the surrounding batch.
 *
 * @param pdfFile   PDF file to parse
 * @param outputDir directory receiving the TXT file
 * @param charset   charset used to encode the TXT file
 */
private void processAndSavePdfContent(File pdfFile, String outputDir, Charset charset) {
    try {
        List<String> paragraphs = PdfParserUtils.extractParagraphs(pdfFile.getAbsolutePath());
        if (paragraphs.isEmpty()) {
            return;
        }
        // (?i) keeps this consistent with callers, which accept the extension in any case.
        String txtFileName = pdfFile.getName().replaceAll("(?i)\\.pdf$", ".txt");
        File txtFile = new File(outputDir, txtFileName);
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(txtFile), charset))) {
            for (String paragraph : paragraphs) {
                writer.write(paragraph);
                writer.newLine();
                writer.newLine(); // blank line between paragraphs
            }
        }
    } catch (Exception e) {
        // Still best-effort, but no longer silent: report which file failed and why.
        System.err.println("解析PDF失败: " + pdfFile + ", 错误: " + e.getMessage());
    }
}
/** /**
* 修改审计局标书任务 * 修改审计局标书任务
* *
@ -205,8 +558,14 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
public Boolean ossRemoveById(List<String> ids, Boolean b) { public Boolean ossRemoveById(List<String> ids, Boolean b) {
List<SjjDocumentTasksVo> sjjDocumentTasksVos = baseMapper.selectVoByIds(ids); List<SjjDocumentTasksVo> sjjDocumentTasksVos = baseMapper.selectVoByIds(ids);
for (SjjDocumentTasksVo sjjDocumentTasksVo : sjjDocumentTasksVos) { for (SjjDocumentTasksVo sjjDocumentTasksVo : sjjDocumentTasksVos) {
if (sjjDocumentTasksVo.getTenderDocOssId() != null ) {
ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getTenderDocOssId())), true); ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getTenderDocOssId())), true);
ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getBidDocOssId())), true);
}
if (sjjDocumentTasksVo.getBidDocZipOssId() != null ) {
ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getBidDocZipOssId())), true);
}
SjjDocumentTasks convert = MapstructUtils.convert(sjjDocumentTasksVo, SjjDocumentTasks.class); SjjDocumentTasks convert = MapstructUtils.convert(sjjDocumentTasksVo, SjjDocumentTasks.class);
convert.setDeleteFlag("Y"); convert.setDeleteFlag("Y");
baseMapper.updateById(convert); baseMapper.updateById(convert);
@ -214,3 +573,4 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService {
return true; return true;
} }
} }

230
zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java

@ -11,7 +11,10 @@ import org.slf4j.LoggerFactory;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.IntStream;
/** /**
* PDF解析工具类 * PDF解析工具类
@ -30,12 +33,26 @@ public class PdfParserUtils {
private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class); private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class);
// 是否打印警告日志
private static boolean enableWarningLogs = false;
// 默认段落最小字数阈值 // 默认段落最小字数阈值
private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20; private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20;
// 默认最大缩进值 // 默认最大缩进值
private static final float DEFAULT_MAX_INDENT_X = 100f; private static final float DEFAULT_MAX_INDENT_X = 100f;
// 默认容差范围 // 默认容差范围
private static final float DEFAULT_TOLERANCE = 2f; private static final float DEFAULT_TOLERANCE = 2f;
// 默认线程池大小
private static final int DEFAULT_THREAD_POOL_SIZE = 32;
/**
* Sets the class-wide flag that controls whether warning logs are printed.
* <p>
* This only assigns the static {@code enableWarningLogs} field; where the flag is read is
* not visible from this method.
*
* @param enable {@code true} to enable warning logs, {@code false} to disable them
*/
public static void setEnableWarningLogs(boolean enable) {
enableWarningLogs = enable;
}
/** /**
* 从PDF文件中提取段落 * 从PDF文件中提取段落
@ -57,6 +74,20 @@ public class PdfParserUtils {
* @return 提取的段落列表 * @return 提取的段落列表
*/ */
public static List<String> extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) { public static List<String> extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) {
// Delegates to the overload that also takes a thread-pool size, passing DEFAULT_THREAD_POOL_SIZE (32).
return extractParagraphs(filePath, minParagraphLength, maxIndentX, tolerance, DEFAULT_THREAD_POOL_SIZE);
}
/**
* 从PDF文件中提取段落支持自定义参数和线程池大小
*
* @param filePath PDF文件路径
* @param minParagraphLength 最小段落长度
* @param maxIndentX 最大缩进值
* @param tolerance 容差范围
* @param threadPoolSize 线程池大小
* @return 提取的段落列表
*/
public static List<String> extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance, int threadPoolSize) {
List<String> paragraphs = new ArrayList<>(); List<String> paragraphs = new ArrayList<>();
File pdfFile = new File(filePath); File pdfFile = new File(filePath);
@ -65,9 +96,20 @@ public class PdfParserUtils {
return paragraphs; return paragraphs;
} }
ExecutorService executor = null;
try { try {
// 设置PDFBox选项,抑制字体警告 // 设置PDFBox选项,抑制字体警告
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true"); System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
// 关闭PDFBox内部的警告日志
java.util.logging.Logger.getLogger("org.apache.pdfbox").setLevel(java.util.logging.Level.SEVERE);
// 关闭FontBox相关的警告
java.util.logging.Logger.getLogger("org.apache.fontbox").setLevel(java.util.logging.Level.SEVERE);
// 禁用PDFBox字体警告的另一种方式
System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog");
// 设置PDF处理相关配置
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
// 忽略字体缺失警告
System.setProperty("org.apache.pdfbox.fontcache", "none");
// 打开PDF文档 // 打开PDF文档
PDDocument document = PDDocument.load(pdfFile); PDDocument document = PDDocument.load(pdfFile);
@ -77,20 +119,25 @@ public class PdfParserUtils {
int totalPages = document.getNumberOfPages(); int totalPages = document.getNumberOfPages();
// 创建线程池
executor = Executors.newFixedThreadPool(Math.min(threadPoolSize, totalPages));
// 第一步:收集所有x坐标和重复文本 // 第一步:收集所有x坐标和重复文本
List<Float> xCoordinates = new ArrayList<>(); ConcurrentLinkedQueue<Float> xCoordinates = new ConcurrentLinkedQueue<>();
Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率 ConcurrentLinkedQueue<Float> endXCoordinates = new ConcurrentLinkedQueue<>();
ConcurrentHashMap<String, AtomicInteger> textFrequency = new ConcurrentHashMap<>(); // 记录文本出现频率
log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages); log.info("开始多线程解析PDF文件: {}, 总页数: {}, 线程池大小: {}", filePath, totalPages, threadPoolSize);
// 遍历每一页收集X坐标 // 创建任务列表,每个任务处理一页
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { List<Callable<Void>> tasks = IntStream.range(0, totalPages)
.mapToObj(pageIndex -> (Callable<Void>) () -> {
try { try {
// 为每页创建文本提取器 // 为每页创建文本提取器
TextBlockStripper stripper = new TextBlockStripper(); TextBlockStripper stripper = new TextBlockStripper();
stripper.setSortByPosition(true); stripper.setSortByPosition(true);
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失 stripper.setSuppressDuplicateOverlappingText(false);
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性 stripper.setAddMoreFormatting(false);
stripper.setStartPage(pageIndex + 1); stripper.setStartPage(pageIndex + 1);
stripper.setEndPage(pageIndex + 1); stripper.setEndPage(pageIndex + 1);
stripper.getText(document); stripper.getText(document);
@ -102,36 +149,49 @@ public class PdfParserUtils {
if (block.getX() < maxIndentX) { if (block.getX() < maxIndentX) {
xCoordinates.add(block.getX()); xCoordinates.add(block.getX());
} }
// 统计终点X坐标
endXCoordinates.add(block.getEndX());
// 统计文本频率 // 统计文本频率(线程安全方式)
if (text.length() >= minParagraphLength) { if (text.length() >= minParagraphLength) {
textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1); textFrequency.computeIfAbsent(text, k -> new AtomicInteger(0)).incrementAndGet();
} }
} }
} }
} catch (Exception e) { } catch (Exception e) {
if (enableWarningLogs) {
log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage()); log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage());
// 继续处理下一页,而不是中断整个过程
} }
} }
return null;
})
.collect(Collectors.toList());
// 提交所有任务并等待完成
executor.invokeAll(tasks);
if (xCoordinates.isEmpty()) { if (xCoordinates.isEmpty() || endXCoordinates.isEmpty()) {
if (enableWarningLogs) {
log.warn("未找到有效的X坐标,无法提取段落"); log.warn("未找到有效的X坐标,无法提取段落");
}
document.close(); document.close();
if (executor != null) {
executor.shutdown();
}
return paragraphs; return paragraphs;
} }
// 找出频率超过页面数一半的文本(通常是页眉页脚等重复内容) // 找出频率超过页面数一半的文本(通常是页眉页脚等重复内容)
int frequencyThreshold = totalPages / 2; int frequencyThreshold = totalPages / 2;
Set<String> frequentTexts = textFrequency.entrySet().stream() Set<String> frequentTexts = textFrequency.entrySet().stream()
.filter(entry -> entry.getValue() > frequencyThreshold) .filter(entry -> entry.getValue().get() > frequencyThreshold)
.map(Map.Entry::getKey) .map(Map.Entry::getKey)
.collect(Collectors.toSet()); .collect(Collectors.toSet());
log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold); log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold);
// 统计x坐标频率并找出前两名(通常是正常段落和首行缩进) // 统计x坐标频率并找出前两名(通常是正常段落和首行缩进)
Map<Float, Long> xCounter = xCoordinates.stream() Map<Float, Long> xCounter = new ArrayList<>(xCoordinates).stream()
.collect(Collectors.groupingBy(x -> x, Collectors.counting())); .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream() List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream()
@ -139,24 +199,41 @@ public class PdfParserUtils {
.limit(2) .limit(2)
.collect(Collectors.toList()); .collect(Collectors.toList());
if (mostCommonX.size() < 2) { // 统计终点x坐标频率并找出最常见的值
Map<Float, Long> endXCounter = new ArrayList<>(endXCoordinates).stream()
.collect(Collectors.groupingBy(x -> x, Collectors.counting()));
List<Map.Entry<Float, Long>> mostCommonEndX = endXCounter.entrySet().stream()
.sorted(Map.Entry.<Float, Long>comparingByValue().reversed())
.limit(1)
.collect(Collectors.toList());
if (mostCommonX.size() < 2 || mostCommonEndX.isEmpty()) {
if (enableWarningLogs) {
log.warn("未找到足够的X坐标特征,无法区分段落缩进"); log.warn("未找到足够的X坐标特征,无法区分段落缩进");
}
document.close(); document.close();
if (executor != null) {
executor.shutdown();
}
return paragraphs; return paragraphs;
} }
// 确保x_indent > x_normal // 确保x_indent > x_normal
float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标 float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标 float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
float commonEndX = mostCommonEndX.get(0).getKey(); // 最常见的终点x坐标
log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance); log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}, commonEndX={}", xNormal, xIndent, tolerance, commonEndX);
// 根据基准x坐标提取段落 // 使用ConcurrentMap存储每页的段落
List<String> currentParagraph = new ArrayList<>(); ConcurrentHashMap<Integer, List<String>> pageParagraphs = new ConcurrentHashMap<>();
// 逐页处理文本块 // 创建段落提取任务
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { List<Callable<Void>> paragraphTasks = IntStream.range(0, totalPages)
.mapToObj(pageIndex -> (Callable<Void>) () -> {
try { try {
List<String> pageResult = new ArrayList<>();
List<TextBlock> pageTextBlocks = new ArrayList<>(); List<TextBlock> pageTextBlocks = new ArrayList<>();
// 为每页创建文本提取器 // 为每页创建文本提取器
@ -172,8 +249,13 @@ public class PdfParserUtils {
pageTextBlocks.addAll(stripper.getTextBlocks()); pageTextBlocks.addAll(stripper.getTextBlocks());
pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
// 每页独立处理段落
List<String> currentParagraph = new ArrayList<>();
boolean isInParagraph = false;
// 处理当前页的文本块 // 处理当前页的文本块
for (TextBlock block : pageTextBlocks) { for (int i = 0; i < pageTextBlocks.size(); i++) {
TextBlock block = pageTextBlocks.get(i);
String lineText = block.getText().trim().replace('\n', ' ').trim(); String lineText = block.getText().trim().replace('\n', ' ').trim();
if (lineText.isEmpty()) { if (lineText.isEmpty()) {
continue; continue;
@ -185,6 +267,7 @@ public class PdfParserUtils {
} }
float currentX = block.getX(); float currentX = block.getX();
float currentEndX = block.getEndX();
// 判断当前x坐标属于哪种类型 // 判断当前x坐标属于哪种类型
boolean isIndent = Math.abs(currentX - xIndent) <= tolerance; boolean isIndent = Math.abs(currentX - xIndent) <= tolerance;
@ -195,40 +278,73 @@ public class PdfParserUtils {
if (!currentParagraph.isEmpty()) { if (!currentParagraph.isEmpty()) {
String paragraphText = String.join("", currentParagraph); String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= minParagraphLength) { if (paragraphText.length() >= minParagraphLength) {
paragraphs.add(paragraphText.trim()); pageResult.add(paragraphText.trim());
} }
currentParagraph.clear(); currentParagraph.clear();
} }
if (lineText.length() >= minParagraphLength) { if (lineText.length() >= minParagraphLength) {
currentParagraph.add(lineText); currentParagraph.add(lineText);
isInParagraph = true;
} }
} }
// 如果是正常位置,追加到当前段落 // 如果是正常位置
else if (isNormal) { else if (isNormal) {
if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落 if (!isInParagraph) {
// 检查是否是段落的开始(终点x在最大值范围内)
if (currentEndX <= commonEndX * 0.95) { // 使用95%作为阈值
isInParagraph = true;
currentParagraph.add(lineText);
}
} else {
// 检查是否应该结束当前段落
if (currentEndX > commonEndX * 0.95) {
// 当前行结束,检查下一行
if (i + 1 < pageTextBlocks.size()) {
TextBlock nextBlock = pageTextBlocks.get(i + 1);
float nextX = nextBlock.getX();
boolean nextIsNormal = Math.abs(nextX - xNormal) <= tolerance;
if (!nextIsNormal) {
// 下一行不是正常位置,结束当前段落
String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= minParagraphLength) {
pageResult.add(paragraphText.trim());
}
currentParagraph.clear();
isInParagraph = false;
} else {
// 下一行是正常位置,继续当前段落
currentParagraph.add(lineText); currentParagraph.add(lineText);
}
} else {
// 已经是最后一行,结束当前段落
currentParagraph.add(lineText);
String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= minParagraphLength) {
pageResult.add(paragraphText.trim());
}
currentParagraph.clear();
isInParagraph = false;
}
} else { } else {
// 继续当前段落
currentParagraph.add(lineText); currentParagraph.add(lineText);
} }
} }
}
// 如果既不是缩进也不是正常位置,作为独立段落 // 如果既不是缩进也不是正常位置,作为独立段落
else { else {
// 如果独立段落字数满足要求进行统计,不满足要求跳过
if (lineText.length() >= minParagraphLength) {
if (!currentParagraph.isEmpty()) { if (!currentParagraph.isEmpty()) {
String paragraphText = String.join("", currentParagraph); String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= minParagraphLength) { if (paragraphText.length() >= minParagraphLength) {
paragraphs.add(paragraphText.trim()); pageResult.add(paragraphText.trim());
} }
currentParagraph.clear(); currentParagraph.clear();
isInParagraph = false;
} }
paragraphs.add(lineText.trim()); if (lineText.length() >= minParagraphLength) {
} pageResult.add(lineText.trim());
}
} }
} catch (Exception e) {
log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage());
// 继续处理下一页
} }
} }
@ -236,15 +352,45 @@ public class PdfParserUtils {
if (!currentParagraph.isEmpty()) { if (!currentParagraph.isEmpty()) {
String paragraphText = String.join("", currentParagraph); String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= minParagraphLength) { if (paragraphText.length() >= minParagraphLength) {
paragraphs.add(paragraphText.trim()); pageResult.add(paragraphText.trim());
}
}
// 保存当前页的结果
pageParagraphs.put(pageIndex, pageResult);
} catch (Exception e) {
if (enableWarningLogs) {
log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage());
}
}
return null;
})
.collect(Collectors.toList());
// 提交所有段落提取任务并等待完成
executor.invokeAll(paragraphTasks);
// 按页码顺序合并所有段落
for (int i = 0; i < totalPages; i++) {
List<String> pageParagraphList = pageParagraphs.get(i);
if (pageParagraphList != null) {
paragraphs.addAll(pageParagraphList);
} }
} }
document.close(); document.close();
log.info("PDF解析完成,提取段落数: {}", paragraphs.size()); log.info("PDF多线程解析完成,提取段落数: {}", paragraphs.size());
} catch (IOException e) { } catch (IOException e) {
log.error("提取PDF段落失败: {}", e.getMessage(), e); log.error("提取PDF段落失败: {}", e.getMessage(), e);
} catch (InterruptedException e) {
log.error("多线程处理被中断: {}", e.getMessage(), e);
Thread.currentThread().interrupt();
} finally {
if (executor != null) {
executor.shutdown();
}
} }
return paragraphs; return paragraphs;
@ -268,7 +414,9 @@ public class PdfParserUtils {
document.close(); document.close();
return pageCount > 0; return pageCount > 0;
} catch (Exception e) { } catch (Exception e) {
if (enableWarningLogs) {
log.error("检查PDF有效性时出错: {}", e.getMessage()); log.error("检查PDF有效性时出错: {}", e.getMessage());
}
return false; return false;
} }
} }
@ -301,7 +449,7 @@ public class PdfParserUtils {
if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) { if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
// 保存上一行 // 保存上一行
if (!currentLine.trim().isEmpty()) { if (!currentLine.trim().isEmpty()) {
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); textBlocks.add(new TextBlock(currentX, textX, lastY, currentLine));
} }
// 开始新行 // 开始新行
@ -314,8 +462,10 @@ public class PdfParserUtils {
} }
} catch (Exception e) { } catch (Exception e) {
// 忽略单个字符处理错误,继续处理其他字符 // 忽略单个字符处理错误,继续处理其他字符
if (enableWarningLogs) {
log.debug("处理文本位置时出错: {}", e.getMessage()); log.debug("处理文本位置时出错: {}", e.getMessage());
} }
}
super.processTextPosition(text); super.processTextPosition(text);
} }
@ -334,7 +484,7 @@ public class PdfParserUtils {
public void endDocument(PDDocument document) throws IOException { public void endDocument(PDDocument document) throws IOException {
// Flush the last accumulated line (if it is not blank) into textBlocks, then call the superclass hook.
if (!currentLine.trim().isEmpty()) { if (!currentLine.trim().isEmpty()) {
// NOTE(review): the four-argument call passes currentX as both x and endX, so this final
// block reports endX == x rather than the position where the line actually ends; the
// endX-based paragraph checks would then treat it as a very short line -- confirm intent.
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); textBlocks.add(new TextBlock(currentX, currentX, lastY, currentLine));
} }
super.endDocument(document); super.endDocument(document);
} }
@ -349,11 +499,13 @@ public class PdfParserUtils {
*/ */
private static class TextBlock { private static class TextBlock {
private final float x; private final float x;
private final float endX;
private final float y; private final float y;
private final String text; private final String text;
public TextBlock(float x, float y, String text) { public TextBlock(float x, float endX, float y, String text) {
this.x = x; this.x = x;
this.endX = endX;
this.y = y; this.y = y;
this.text = text; this.text = text;
} }
@ -362,6 +514,10 @@ public class PdfParserUtils {
return x; return x;
} }
/**
 * Returns the end x-coordinate stored for this text block.
 *
 * @return the {@code endX} value that was supplied to the constructor
 */
public float getEndX() {
    return this.endX;
}
public float getY() { public float getY() {
return y; return y;
} }

Loading…
Cancel
Save