diff --git a/ruoyi-admin/pom.xml b/ruoyi-admin/pom.xml index f3592cb..753e96a 100644 --- a/ruoyi-admin/pom.xml +++ b/ruoyi-admin/pom.xml @@ -198,12 +198,6 @@ - - org.apache.pdfbox - pdfbox - 2.0.27 - test - diff --git a/ruoyi-admin/src/main/resources/application-dev.yml b/ruoyi-admin/src/main/resources/application-dev.yml index 0638d2d..00042d9 100644 --- a/ruoyi-admin/src/main/resources/application-dev.yml +++ b/ruoyi-admin/src/main/resources/application-dev.yml @@ -52,7 +52,7 @@ spring: # url: jdbc:mysql://localhost:3306/zaojia?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true # username: root # password: root - url: jdbc:mysql://10.1.21.250:3306/sjjtable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true + url: jdbc:mysql://10.1.21.250:3306/aitable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true # url: jdbc:mysql://218.0.1.42:53306/sjjtable?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=true&serverTimezone=GMT%2B8&autoReconnect=true&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true&nullCatalogMeansCurrent=true username: root password: 'HXj-6nR|D8xy*h#!I&:(' diff --git a/ruoyi-admin/src/main/resources/application-test.yml b/ruoyi-admin/src/main/resources/application-test.yml index 18cce78..1c2ff48 100644 --- a/ruoyi-admin/src/main/resources/application-test.yml +++ b/ruoyi-admin/src/main/resources/application-test.yml @@ -283,5 +283,5 @@ justauth: chat: # 聊天机器人配置 filePath: /guoYanXinXi/data/software/sjjapp/minio/data/sjj/ - tempfilePath: /guoYanXinXi/data/software/sjjapp/app/tempfile/ + tempfilePath: /guoYanXinXi/data/software/sjjapp/app/tempfile chatUrl: http://127.0.0.1:8081 diff --git a/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java b/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java deleted file mode 100644 index cccd676..0000000 --- a/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java +++ /dev/null @@ -1,332 +0,0 @@ -package org.dromara.test; - -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.text.PDFTextStripper; -import org.apache.pdfbox.text.TextPosition; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.io.IOException; -import java.util.*; -import java.util.logging.Logger; -import java.util.stream.Collectors; - -/** - * PDF段落提取测试 - */ -@DisplayName("PDF段落提取测试") -public class PdfExtractorTest { - - private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName()); - - // 段落最小字数阈值 - private static final int MIN_PARAGRAPH_LENGTH = 20; - // 最大缩进值 - private static final float MAX_INDENT_X = 100f; - // 容差范围 - private static final float TOLERANCE = 2f; - - @Test - @DisplayName("测试PDF段落提取") - public void testExtractParagraphs() { - String pdfPath = "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf"; // 替换为实际PDF路径 - List paragraphs = extractParagraphsFromPdf(pdfPath); - - System.out.println("提取段落总数: " + paragraphs.size()); - for (int i = 0; i < paragraphs.size(); i++) { - if(paragraphs.get(i).length() >= MIN_PARAGRAPH_LENGTH){ - System.out.println("段落" + (i + 1) + ": " + paragraphs.get(i).trim()); - } - } - } - - /** - * 从PDF文件中提取段落,基于x坐标统计来判断段落 - */ - public List extractParagraphsFromPdf(String filePath) { - List paragraphs = new ArrayList<>(); - File pdfFile = new File(filePath); - - try { - // 打开PDF文档 - PDDocument document = PDDocument.load(pdfFile); - int totalPages = document.getNumberOfPages(); - - // 第一步:收集所有x坐标和重复文本 - List xCoordinates = new ArrayList<>(); - Map textFrequency = new HashMap<>(); // 记录文本出现频率 - - // 遍历每一页收集X坐标 - for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { - final int currentPage = pageIndex; // 用于匿名类中引用 - - // 为每页创建文本提取器 - TextBlockStripper stripper = new TextBlockStripper(); - stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失 - stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性 - stripper.setStartPage(pageIndex + 1); - stripper.setEndPage(pageIndex + 1); - stripper.setSortByPosition(true); - stripper.getText(document); - - for (TextBlock block : stripper.getTextBlocks()) { - String text = block.getText().trim(); - if (text.length() > 0) { - // 统计X坐标 - if (block.getX() < MAX_INDENT_X) { - xCoordinates.add(block.getX()); - } - - // 统计文本频率 - if (text.length() >= MIN_PARAGRAPH_LENGTH) { - textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1); - } - } - } - } - - if (xCoordinates.isEmpty()) { - document.close(); - return paragraphs; - } - - // 找出频率超过页面数一半的文本 - int frequencyThreshold = totalPages / 2; - Set frequentTexts = textFrequency.entrySet().stream() - .filter(entry -> entry.getValue() > frequencyThreshold) - .map(Map.Entry::getKey) - .collect(Collectors.toSet()); - - System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)"); - frequentTexts.forEach(text -> System.out.println("高频文本: " + - (text.length() > 50 ? text.substring(0, 47) + "..." : text) + - " 出现次数: " + textFrequency.get(text))); - - // 第二步:统计x坐标频率并找出前两名 - Map xCounter = xCoordinates.stream() - .collect(Collectors.groupingBy(x -> x, Collectors.counting())); - - List> mostCommonX = xCounter.entrySet().stream() - .sorted(Map.Entry.comparingByValue().reversed()) - .limit(2) - .collect(Collectors.toList()); - - if (mostCommonX.size() < 2) { - document.close(); - return paragraphs; - } - - // 确保x_indent > x_normal - float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标 - float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标 - - System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE); - - // 第三步:根据基准x坐标提取段落 - List currentParagraph = new ArrayList<>(); - int num=311; - - // 逐页处理文本块 - for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { - List pageTextBlocks = new ArrayList<>(); - - // 为每页创建文本提取器 - TextBlockStripper stripper = new TextBlockStripper(); - stripper.setSortByPosition(true); - stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失 - stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性 - stripper.setStartPage(pageIndex + 1); - stripper.setEndPage(pageIndex + 1); - stripper.getText(document); - - // 获取当前页的文本块并排序 - pageTextBlocks.addAll(stripper.getTextBlocks()); - pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); - - if(pageIndex==num){ - System.out.println(pageTextBlocks); - } - - // 处理当前页的文本块 - for (TextBlock block : pageTextBlocks) { - String lineText = block.getText().trim().replace('\n', ' ').trim(); - if (lineText.isEmpty()) { - continue; - } - - // 过滤高频文本 - if (frequentTexts.contains(lineText)) { - if (pageIndex == num) { - System.out.println("过滤高频文本: " + - (lineText.length() > 30 ? lineText.substring(0, 27) + "..." : lineText)); - } - continue; - } - - float currentX = block.getX(); - - // 判断当前x坐标属于哪种类型 - boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE; - boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE; - - // 如果是缩进位置,说明是新段落的开始 - if (isIndent) { - if (!currentParagraph.isEmpty()) { - String paragraphText = String.join("", currentParagraph); - if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { - paragraphs.add(paragraphText); - } - currentParagraph.clear(); - } - if (lineText.length() >= MIN_PARAGRAPH_LENGTH) { - currentParagraph.add(lineText); - } - } - // 如果是正常位置,追加到当前段落 - else if (isNormal) { - if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落 - currentParagraph.add(lineText); - } else { - currentParagraph.add(lineText); - } - } - // 如果既不是缩进也不是正常位置,作为独立段落 - else { - // 如果独立段落字数满足要求进行统计,不满足要求跳过 - if (lineText.length() >= MIN_PARAGRAPH_LENGTH) { - if (!currentParagraph.isEmpty()) { - String paragraphText = String.join("", currentParagraph); - if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { - paragraphs.add(paragraphText); - } - currentParagraph.clear(); - } - paragraphs.add(lineText); - } - } - } - } - - // 处理最后一个段落 - if (!currentParagraph.isEmpty()) { - String paragraphText = String.join("", currentParagraph); - if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { - paragraphs.add(paragraphText); - } - } - - document.close(); - - } catch (IOException e) { - logger.severe("提取PDF段落失败: " + e.getMessage()); - e.printStackTrace(); - } - - return paragraphs; - } - - /** - * 用于提取文本块的PDFTextStripper - */ - private static class TextBlockStripper extends PDFTextStripper { - private final List textBlocks = new ArrayList<>(); - private float lastY = -1; - private String currentLine = ""; - private float currentX = 0; - - public TextBlockStripper() throws IOException { - super(); - // 初始化 - textBlocks.clear(); - lastY = -1; - currentLine = ""; - currentX = 0; - } - - @Override - protected void processTextPosition(TextPosition text) { - float textX = text.getXDirAdj(); - float textY = text.getYDirAdj(); - float endX = text.getEndX(); - float endY = text.getEndY(); - // 如果Y坐标变化超过一定阈值,认为是新行 - if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) { - // 保存上一行 - if (!currentLine.trim().isEmpty()) { - textBlocks.add(new TextBlock(currentX, lastY, currentLine)); - } - - // 开始新行 - currentLine = text.getUnicode(); - currentX = textX; - lastY = textY; - } else { - // 在同一行,追加文本 - currentLine += text.getUnicode(); - } - - super.processTextPosition(text); - } - - @Override - protected void startPage(PDPage page) throws IOException { - // 清空textBlocks列表,避免累积所有页面的内容 - textBlocks.clear(); - lastY = -1; - currentLine = ""; - currentX = 0; - super.startPage(page); - } - - @Override - public void endDocument(PDDocument document) throws IOException { - // 保存最后一行 - if (!currentLine.trim().isEmpty()) { - textBlocks.add(new TextBlock(currentX, lastY, currentLine)); - } - super.endDocument(document); - } - - public List getTextBlocks() { - return textBlocks; - } - } - - /** - * 用于存储文本块信息的类 - */ - private static class TextBlock { - private final float x; - private final float y; - private final String text; - - public TextBlock(float x, float y, String text) { - this.x = x; - this.y = y; - this.text = text; - } - - public float getX() { - return x; - } - - public float getY() { - return y; - } - - public String getText() { - return text; - } - - @Override - public String toString() { - return "TextBlock{" + - "x=" + x + - ", y=" + y + - ", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\'' + - '}'; - } - } -} diff --git a/zaojiaManagement/zaojia-productManagement/pom.xml b/zaojiaManagement/zaojia-productManagement/pom.xml index c270600..dc0d034 100644 --- a/zaojiaManagement/zaojia-productManagement/pom.xml +++ b/zaojiaManagement/zaojia-productManagement/pom.xml @@ -109,13 +109,22 @@ flexmark-all 0.64.8 - + + net.lingala.zip4j + zip4j + 2.11.5 + org.xhtmlrenderer flying-saucer-pdf 9.1.22 + + org.apache.pdfbox + pdfbox + 2.0.27 + diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java index 1d012db..69482c5 100644 --- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java +++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/controller/SjjDocumentTasksController.java @@ -1,5 +1,6 @@ package org.dromara.productManagement.controller; +import java.io.IOException; import java.util.List; import lombok.RequiredArgsConstructor; @@ -75,7 +76,7 @@ public class SjjDocumentTasksController extends BaseController { @Log(title = "审计局标书任务", businessType = BusinessType.INSERT) @RepeatSubmit() @PostMapping() - public R add(@Validated(AddGroup.class) @RequestBody SjjDocumentTasksBo bo) { + public R add(@Validated(AddGroup.class) @RequestBody SjjDocumentTasksBo bo) throws IOException { return toAjax(sjjDocumentTasksService.insertByBo(bo)); } diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java index 1b7983a..0b957d3 100644 --- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java +++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/SjjDocumentTasks.java @@ -82,7 +82,7 @@ public class SjjDocumentTasks extends TenantEntity { /** * 投标文件对象存储ID */ - private String bidDocOssId; + private String bidDocZipOssId; private String deleteFlag; } diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java index 2ae93ab..0385e81 100644 --- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java +++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/bo/SjjDocumentTasksBo.java @@ -36,7 +36,7 @@ public class SjjDocumentTasksBo extends BaseEntity { * 投标文件名称 */ private String bidDocumentName; - private String bidDocOssId; + private String bidDocZipOssId; private String tenderDocOssId; /** diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java index abb039f..fbd9a16 100644 --- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java +++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/domain/vo/SjjDocumentTasksVo.java @@ -69,5 +69,5 @@ public class SjjDocumentTasksVo implements Serializable { /** * 投标文件对象存储ID */ - private String bidDocOssId; + private String bidDocZipOssId; } diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java index df9758c..af4816d 100644 --- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java +++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/ISjjDocumentTasksService.java @@ -5,6 +5,7 @@ import org.dromara.productManagement.domain.bo.SjjDocumentTasksBo; import org.dromara.common.mybatis.core.page.TableDataInfo; import org.dromara.common.mybatis.core.page.PageQuery; +import java.io.IOException; import java.util.Collection; import java.util.List; @@ -47,7 +48,7 @@ public interface ISjjDocumentTasksService { * @param bo 审计局标书任务 * @return 是否新增成功 */ - Boolean insertByBo(SjjDocumentTasksBo bo); + Boolean insertByBo(SjjDocumentTasksBo bo) throws IOException; /** * 修改审计局标书任务 diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java index a4972b9..be88978 100644 --- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java +++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/service/impl/SjjDocumentTasksServiceImpl.java @@ -1,6 +1,8 @@ package org.dromara.productManagement.service.impl; import cn.dev33.satoken.stp.StpUtil; +import net.lingala.zip4j.ZipFile; +import net.lingala.zip4j.model.FileHeader; import okhttp3.*; import org.dromara.common.core.domain.model.LoginUser; import org.dromara.common.core.utils.MapstructUtils; @@ -23,12 +25,18 @@ import org.dromara.productManagement.domain.vo.SjjDocumentTasksVo; import org.dromara.productManagement.domain.SjjDocumentTasks; import org.dromara.productManagement.mapper.SjjDocumentTasksMapper; import org.dromara.productManagement.service.ISjjDocumentTasksService; +import org.dromara.productManagement.utils.PdfParserUtils; -import java.io.IOException; +import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Collection; +import java.util.zip.ZipException; /** * 审计局标书任务Service业务层处理 @@ -110,18 +118,69 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService { * @return 是否新增成功 */ @Override - public Boolean insertByBo(SjjDocumentTasksBo bo) { + public Boolean insertByBo(SjjDocumentTasksBo bo) throws IOException { SjjDocumentTasks add = MapstructUtils.convert(bo, SjjDocumentTasks.class); - String bidDocOssId = add.getBidDocOssId(); + String bidDocZipOssId = add.getBidDocZipOssId(); + String tenderDocPath=""; + String tenderDocName=""; + SysOssVo bidZipFileInfo = ossService.getById(Long.valueOf(bidDocZipOssId)); + String bidZipName = bidZipFileInfo.getOriginalName(); + String bidZipNameWithoutExt = bidZipName; + if (bidZipName.lastIndexOf(".") > 0) { + bidZipNameWithoutExt = bidZipName.substring(0, bidZipName.lastIndexOf(".")); + } + String bidZipPath = fileRootPath + bidZipFileInfo.getFileName(); String tenderDocOssId = add.getTenderDocOssId(); - SysOssVo bidFileInfo = ossService.getById(Long.valueOf(bidDocOssId)); - String bidDocName = bidFileInfo.getOriginalName(); - String bidDocPath = fileRootPath + bidFileInfo.getFileName(); - SysOssVo tenderFileInfo = ossService.getById(Long.valueOf(tenderDocOssId)); - String tenderDocName = tenderFileInfo.getOriginalName(); - String tenderDocPath = fileRootPath + tenderFileInfo.getFileName(); - add.setBidDocumentName(bidDocName); - add.setTenderDocumentName(tenderDocName); + if(StringUtils.isNotBlank(tenderDocOssId)){ + SysOssVo tenderFileInfo = ossService.getById(Long.valueOf(tenderDocOssId)); + tenderDocName = tenderFileInfo.getOriginalName(); + tenderDocPath = fileRootPath + tenderFileInfo.getFileName(); + add.setTenderDocumentName(tenderDocName); + } + add.setBidDocumentName(bidZipName); + // 创建唯一文件夹 + String uniqueFolderName = "task_" + System.currentTimeMillis() + "_" + Math.abs(bidZipName.hashCode()); + String taskFolder = tempfilePath + File.separator + uniqueFolderName; + File taskFolderDir = new File(taskFolder); + if (!taskFolderDir.exists()) { + taskFolderDir.mkdirs(); + } + + // 创建四个子文件夹 + String bidOriginalDir = taskFolder + File.separator + "bid_original"; // 投标文件解压后的原始文件 + String bidTxtDir = taskFolder + File.separator + "bid_txt"; // 投标文件解析后的TXT文件 + String tenderOriginalDir = taskFolder + File.separator + "tender_original"; // 招标文件原始文件 + String tenderTxtDir = taskFolder + File.separator + "tender_txt"; // 招标文件解析后的TXT文件 + bidOriginalDir =bidOriginalDir+ File.separator + bidZipNameWithoutExt; + + // 创建子文件夹 + new File(bidOriginalDir).mkdirs(); + new File(bidTxtDir).mkdirs(); + new File(tenderOriginalDir).mkdirs(); + new File(tenderTxtDir).mkdirs(); + // 处理投标文件压缩包 + processZipFile(bidZipPath, bidOriginalDir, bidTxtDir); + + // 复制招标文件到任务文件夹 + File tenderDoc = new File(tenderDocPath); + if (tenderDoc.exists()) { + // 复制招标文件到招标文件原始目录 + File tenderDocCopy = new File(tenderOriginalDir, tenderDocName); + try (FileInputStream fis = new FileInputStream(tenderDoc); + FileOutputStream fos = new FileOutputStream(tenderDocCopy)) { + byte[] buffer = new byte[1024]; + int length; + while ((length = fis.read(buffer)) > 0) { + fos.write(buffer, 0, length); + } + } + + // 如果是PDF文件,解析其内容到招标文件TXT目录 + if (tenderDocName.toLowerCase().endsWith(".pdf") && PdfParserUtils.isValidPdf(tenderDocCopy.getAbsolutePath())) { + processAndSavePdfContent(tenderDocCopy, tenderTxtDir, getSystemCharset()); + } + } + add.setProgressStatus("PENDING"); validEntityBeforeSave(add); boolean flag = baseMapper.insert(add) > 0; @@ -137,13 +196,10 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService { throw new IllegalArgumentException("无效的任务名称: " + add.getTaskName()); } -// Request request = new Request.Builder() -// .url(url+"?userId="+ LoginHelper.getUserId()+"&taskId="+taskId+"&filename="+filename+"&taskName="+taskName+"&priority="+priority) -// .build(); HttpUrl.Builder urlBuilder = HttpUrl.parse(chatUrl +"/back/taskStart").newBuilder(); urlBuilder.addQueryParameter("userId", String.valueOf(LoginHelper.getUserId())); urlBuilder.addQueryParameter("taskId", String.valueOf(add.getId())); - urlBuilder.addQueryParameter("filename", bidDocPath+"\n"+tenderDocPath); + urlBuilder.addQueryParameter("filename", bidOriginalDir+"\n"+tenderOriginalDir); urlBuilder.addQueryParameter("taskName", add.getTaskName()); urlBuilder.addQueryParameter("priority", "1"); Request request = new Request.Builder() @@ -166,6 +222,303 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService { return flag; } + /** + * 处理ZIP文件:解压并解析PDF + * + * @param zipFilePath ZIP文件路径 + * @param originalDir 存放解压原始文件的目录 + * @param txtDir 存放PDF解析后TXT文件的目录 + * @throws IOException 解压或解析过程中发生IO错误 + * @throws ZipException ZIP文件处理错误 + */ + private void processZipFile(String zipFilePath, String originalDir, String txtDir) throws IOException, ZipException { + // 创建解压目标目录(如果不存在) + File extractDirFile = new File(originalDir); + if (!extractDirFile.exists()) { + extractDirFile.mkdirs(); + } + + // 检测最佳编码 + Charset bestCharset = detectBestCharset(zipFilePath); + + try { + // 使用zip4j解压文件 + ZipFile zipFile = new ZipFile(zipFilePath); + zipFile.setCharset(bestCharset); + + // 获取所有文件头 + List fileHeaders = zipFile.getFileHeaders(); + for (FileHeader fileHeader : fileHeaders) { + // 跳过目录项 + if (fileHeader.isDirectory()) { + continue; + } + + try { + // 获取文件名(不包括路径) + String fileName = new File(fileHeader.getFileName()).getName(); + // 提取到指定目录,使用新的文件名 + zipFile.extractFile(fileHeader, originalDir, fileName); + } catch (Exception e) { + // 如果使用检测到的编码解压失败,使用系统默认编码重试 + try { + ZipFile fallbackZipFile = new ZipFile(zipFilePath); + fallbackZipFile.setCharset(getSystemCharset()); + fallbackZipFile.extractFile(fileHeader.getFileName(), originalDir); + } catch (Exception fallbackEx) { + System.err.println("解压文件失败: " + fileHeader.getFileName() + ", 错误: " + fallbackEx.getMessage()); + } + } + } + + // 递归处理所有PDF文件 + processAllPdfFiles(extractDirFile, txtDir, bestCharset); + } catch (Exception e) { + // 如果使用检测的编码失败,尝试直接整体解压 + try { + ZipFile zipFile = new ZipFile(zipFilePath); + zipFile.setCharset(getSystemCharset()); + zipFile.extractAll(originalDir); + processAllPdfFiles(extractDirFile, txtDir, getSystemCharset()); + } catch (Exception e2) { + System.err.println("解压失败: " + e2.getMessage()); + throw new IOException("解压失败", e2); + } + } + } + + /** + * 检测ZIP文件的最佳字符编码 + * 通过对比不同编码下的文件名可读性来确定最佳编码 + */ + private Charset detectBestCharset(String zipFilePath) { + // 常用的中文编码 + Charset[] charsets = { + Charset.forName("GB18030"), // 首选,覆盖面最广的中文编码 + Charset.forName("GBK"), // 次选,常用中文编码 + StandardCharsets.UTF_8, // 通用编码 + getSystemCharset() // 系统默认编码 + }; + + int bestScore = -1; + Charset bestCharset = getSystemCharset(); // 默认使用系统字符集 + + try { + // 尝试每种编码并评分 + for (Charset charset : charsets) { + int score = evaluateCharsetForZip(zipFilePath, charset); + if (score > bestScore) { + bestScore = score; + bestCharset = charset; + } + } + } catch (Exception e) { + System.err.println("检测字符集时出错: " + e.getMessage()); + } + + System.out.println("为ZIP文件选择的最佳字符集: " + bestCharset.name()); + return bestCharset; + } + + /** + * 评估特定字符集对ZIP文件的适用性 + * 返回评分值,分数越高表示编码越适合 + */ + private int evaluateCharsetForZip(String zipFilePath, Charset charset) { + int score = 0; + + try { + ZipFile zipFile = new ZipFile(zipFilePath); + zipFile.setCharset(charset); + + List fileHeaders = zipFile.getFileHeaders(); + for (FileHeader fileHeader : fileHeaders) { + if (fileHeader.isDirectory()) continue; + + String fileName = fileHeader.getFileName(); + score += evaluateString(fileName, charset); + } + } catch (Exception e) { + // 如果使用此编码打开ZIP失败,得分为-1 + return -1; + } + + return score; + } + + /** + * 评估字符串在特定编码下的可读性 + * 检查是否包含乱码字符,返回可读性得分 + */ + private int evaluateString(String str, Charset charset) { + int score = 0; + + try { + // 将字符串转换为字节,然后再转回来,检查是否有信息丢失 + byte[] bytes = str.getBytes(charset); + String decoded = new String(bytes, charset); + + if (str.equals(decoded)) { + score += 10; // 完全匹配加10分 + } + + // 检查是否包含常见乱码字符 + score -= countCharactersInRange(str, 0xFFFD, 0xFFFD); // Unicode替换字符 + score -= countCharactersInRange(str, 0xD800, 0xDFFF) * 2; // Unicode代理区域 + + // 检查特殊字符比例 + int specialChars = countSpecialCharacters(str); + if (specialChars > str.length() / 3) { + score -= 5; // 特殊字符过多,扣分 + } + + // 检查中文字符的存在 + int chineseChars = countChineseCharacters(str); + if (chineseChars > 0) { + score += 5; // 包含中文字符加分 + } + } catch (Exception e) { + score -= 10; // 转换异常,大幅扣分 + } + + return score; + } + + /** + * 计算字符串中特定Unicode范围内的字符数量 + */ + private int countCharactersInRange(String str, int start, int end) { + int count = 0; + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if (c >= start && c <= end) { + count++; + } + } + return count; + } + + /** + * 计算字符串中特殊字符(非字母、数字、常用标点)的数量 + */ + private int countSpecialCharacters(String str) { + int count = 0; + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if (!Character.isLetterOrDigit(c) && !isCommonPunctuation(c)) { + count++; + } + } + return count; + } + + /** + * 判断字符是否为常用标点符号 + */ + private boolean isCommonPunctuation(char c) { + return c == '.' || c == ',' || c == ';' || c == ':' || c == '!' || + c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || + c == '{' || c == '}' || c == '_' || c == '-' || c == ' ' || + c == '/' || c == '\\'; + } + + /** + * 计算字符串中中文字符的数量 + */ + private int countChineseCharacters(String str) { + int count = 0; + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + if (isChinese(c)) { + count++; + } + } + return count; + } + + /** + * 判断字符是否为中文字符 + */ + private boolean isChinese(char c) { + Character.UnicodeBlock ub = Character.UnicodeBlock.of(c); + return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS + || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS + || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A; + } + + /** + * 递归处理目录中的所有PDF文件 + * + * @param directory 需要处理的目录 + * @param txtOutputDir PDF解析后的TXT文件输出目录 + * @param charset 字符集 + */ + private void processAllPdfFiles(File directory, String txtOutputDir, Charset charset) { + if (!directory.isDirectory()) { + return; + } + + File[] files = directory.listFiles(); + if (files == null) { + return; + } + + for (File file : files) { + if (file.isDirectory()) { + // 递归处理子目录 + processAllPdfFiles(file, txtOutputDir, charset); + } else if (file.getName().toLowerCase().endsWith(".pdf") && PdfParserUtils.isValidPdf(file.getAbsolutePath())) { + // 处理PDF文件 + processAndSavePdfContent(file, txtOutputDir, charset); + } + } + } + + /** + * 获取系统默认字符集 + * + * @return 适合当前操作系统的字符集 + */ + private Charset getSystemCharset() { + String osName = System.getProperty("os.name").toLowerCase(); + return osName.contains("win") ? Charset.forName("GBK") : StandardCharsets.UTF_8; + } + + /** + * 处理PDF文件并保存为TXT + * + * @param pdfFile PDF文件 + * @param outputDir 输出目录 + * @param charset 字符集 + */ + private void processAndSavePdfContent(File pdfFile, String outputDir, Charset charset) { + try { + + // 提取PDF段落 + List paragraphs = PdfParserUtils.extractParagraphs(pdfFile.getAbsolutePath()); + + if (paragraphs.isEmpty()) { + return; + } + + // 创建TXT文件名(替换扩展名) + String txtFileName = pdfFile.getName().replaceAll("\\.pdf$", ".txt"); + File txtFile = new File(outputDir, txtFileName); + + // 写入TXT文件 + try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(txtFile), charset))) { + + for (String paragraph : paragraphs) { + writer.write(paragraph); + writer.newLine(); + writer.newLine(); // 段落间添加空行 + } + } + + } catch (Exception e) { + } + } /** * 修改审计局标书任务 * @@ -205,8 +558,14 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService { public Boolean ossRemoveById(List ids, Boolean b) { List sjjDocumentTasksVos = baseMapper.selectVoByIds(ids); for (SjjDocumentTasksVo sjjDocumentTasksVo : sjjDocumentTasksVos) { - ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getTenderDocOssId())), true); - ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getBidDocOssId())), true); + if (sjjDocumentTasksVo.getTenderDocOssId() != null ) { + ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getTenderDocOssId())), true); + + } + if (sjjDocumentTasksVo.getBidDocZipOssId() != null ) { + ossService.deleteWithValidByIds(Collections.singletonList(Long.valueOf(sjjDocumentTasksVo.getBidDocZipOssId())), true); + + } SjjDocumentTasks convert = MapstructUtils.convert(sjjDocumentTasksVo, SjjDocumentTasks.class); convert.setDeleteFlag("Y"); baseMapper.updateById(convert); @@ -214,3 +573,4 @@ public class SjjDocumentTasksServiceImpl implements ISjjDocumentTasksService { return true; } } + diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java index 274b7fa..47184b1 100644 --- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java +++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java @@ -11,7 +11,10 @@ import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.util.*; +import java.util.concurrent.*; +import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; +import java.util.stream.IntStream; /** * PDF解析工具类 @@ -29,6 +32,9 @@ import java.util.stream.Collectors; public class PdfParserUtils { private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class); + + // 是否打印警告日志 + private static boolean enableWarningLogs = false; // 默认段落最小字数阈值 private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20; @@ -36,7 +42,18 @@ public class PdfParserUtils { private static final float DEFAULT_MAX_INDENT_X = 100f; // 默认容差范围 private static final float DEFAULT_TOLERANCE = 2f; + // 默认线程池大小 + private static final int DEFAULT_THREAD_POOL_SIZE = 32; + /** + * 设置是否启用警告日志 + * + * @param enable 是否启用 + */ + public static void setEnableWarningLogs(boolean enable) { + enableWarningLogs = enable; + } + /** * 从PDF文件中提取段落 * @@ -57,6 +74,20 @@ public class PdfParserUtils { * @return 提取的段落列表 */ public static List extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) { + return extractParagraphs(filePath, minParagraphLength, maxIndentX, tolerance, DEFAULT_THREAD_POOL_SIZE); + } + + /** + * 从PDF文件中提取段落,支持自定义参数和线程池大小 + * + * @param filePath PDF文件路径 + * @param minParagraphLength 最小段落长度 + * @param maxIndentX 最大缩进值 + * @param tolerance 容差范围 + * @param threadPoolSize 线程池大小 + * @return 提取的段落列表 + */ + public static List extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance, int threadPoolSize) { List paragraphs = new ArrayList<>(); File pdfFile = new File(filePath); @@ -65,73 +96,102 @@ public class PdfParserUtils { return paragraphs; } + ExecutorService executor = null; try { // 设置PDFBox选项,抑制字体警告 System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true"); - + // 关闭PDFBox内部的警告日志 + java.util.logging.Logger.getLogger("org.apache.pdfbox").setLevel(java.util.logging.Level.SEVERE); + // 关闭FontBox相关的警告 + java.util.logging.Logger.getLogger("org.apache.fontbox").setLevel(java.util.logging.Level.SEVERE); + // 禁用PDFBox字体警告的另一种方式 + System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); + // 设置PDF处理相关配置 + System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"); + // 忽略字体缺失警告 + System.setProperty("org.apache.pdfbox.fontcache", "none"); + // 打开PDF文档 PDDocument document = PDDocument.load(pdfFile); - + // 设置PDFBox参数,提高对中文字体的兼容性 document.setResourceCache(null); // 禁用资源缓存,可能减少某些字体问题 - + int totalPages = document.getNumberOfPages(); + // 创建线程池 + executor = Executors.newFixedThreadPool(Math.min(threadPoolSize, totalPages)); + // 第一步:收集所有x坐标和重复文本 - List xCoordinates = new ArrayList<>(); - Map textFrequency = new HashMap<>(); // 记录文本出现频率 - - log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages); - - // 遍历每一页收集X坐标 - for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { - try { - // 为每页创建文本提取器 - TextBlockStripper stripper = new TextBlockStripper(); - stripper.setSortByPosition(true); - stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失 - stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性 - stripper.setStartPage(pageIndex + 1); - stripper.setEndPage(pageIndex + 1); - stripper.getText(document); - - for (TextBlock block : stripper.getTextBlocks()) { - String text = block.getText().trim(); - if (text.length() > 0) { - // 统计X坐标 - if (block.getX() < maxIndentX) { - xCoordinates.add(block.getX()); - } - - // 统计文本频率 - if (text.length() >= minParagraphLength) { - textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1); + ConcurrentLinkedQueue xCoordinates = new ConcurrentLinkedQueue<>(); + ConcurrentLinkedQueue endXCoordinates = new ConcurrentLinkedQueue<>(); + ConcurrentHashMap textFrequency = new ConcurrentHashMap<>(); // 记录文本出现频率 + + log.info("开始多线程解析PDF文件: {}, 总页数: {}, 线程池大小: {}", filePath, totalPages, threadPoolSize); + + // 创建任务列表,每个任务处理一页 + List> tasks = IntStream.range(0, totalPages) + .mapToObj(pageIndex -> (Callable) () -> { + try { + // 为每页创建文本提取器 + TextBlockStripper stripper = new TextBlockStripper(); + stripper.setSortByPosition(true); + stripper.setSuppressDuplicateOverlappingText(false); + stripper.setAddMoreFormatting(false); + stripper.setStartPage(pageIndex + 1); + stripper.setEndPage(pageIndex + 1); + stripper.getText(document); + + for (TextBlock block : stripper.getTextBlocks()) { + String text = block.getText().trim(); + if (text.length() > 0) { + // 统计X坐标 + if (block.getX() < maxIndentX) { + xCoordinates.add(block.getX()); + } + // 统计终点X坐标 + endXCoordinates.add(block.getEndX()); + + // 统计文本频率(线程安全方式) + if (text.length() >= minParagraphLength) { + textFrequency.computeIfAbsent(text, k -> new AtomicInteger(0)).incrementAndGet(); + } } } + } catch (Exception e) { + if (enableWarningLogs) { + log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage()); + } } - } catch (Exception e) { - log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage()); - // 继续处理下一页,而不是中断整个过程 - } - } + return null; + }) + .collect(Collectors.toList()); - if (xCoordinates.isEmpty()) { - log.warn("未找到有效的X坐标,无法提取段落"); + // 提交所有任务并等待完成 + executor.invokeAll(tasks); + + if (xCoordinates.isEmpty() || endXCoordinates.isEmpty()) { + if (enableWarningLogs) { + log.warn("未找到有效的X坐标,无法提取段落"); + } document.close(); + if (executor != null) { + executor.shutdown(); + } return paragraphs; } // 找出频率超过页面数一半的文本(通常是页眉页脚等重复内容) int frequencyThreshold = totalPages / 2; Set frequentTexts = textFrequency.entrySet().stream() - .filter(entry -> entry.getValue() > frequencyThreshold) + .filter(entry -> entry.getValue().get() > frequencyThreshold) .map(Map.Entry::getKey) .collect(Collectors.toSet()); log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold); // 统计x坐标频率并找出前两名(通常是正常段落和首行缩进) - Map xCounter = xCoordinates.stream() + Map xCounter = new ArrayList<>(xCoordinates).stream() .collect(Collectors.groupingBy(x -> x, Collectors.counting())); List> mostCommonX = xCounter.entrySet().stream() @@ -139,120 +199,206 @@ public class PdfParserUtils { .limit(2) .collect(Collectors.toList()); - if (mostCommonX.size() < 2) { - log.warn("未找到足够的X坐标特征,无法区分段落缩进"); + // 统计终点x坐标频率并找出最常见的值 + Map endXCounter = new ArrayList<>(endXCoordinates).stream() + .collect(Collectors.groupingBy(x -> x, Collectors.counting())); + + List> mostCommonEndX = endXCounter.entrySet().stream() + .sorted(Map.Entry.comparingByValue().reversed()) + .limit(1) + .collect(Collectors.toList()); + + if (mostCommonX.size() < 2 || mostCommonEndX.isEmpty()) { + if (enableWarningLogs) { + log.warn("未找到足够的X坐标特征,无法区分段落缩进"); + } document.close(); + if (executor != null) { + executor.shutdown(); + } return paragraphs; } // 确保x_indent > x_normal float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标 float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标 + float commonEndX = mostCommonEndX.get(0).getKey(); // 最常见的终点x坐标 - log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance); - - // 根据基准x坐标提取段落 - List currentParagraph = new ArrayList<>(); - - // 逐页处理文本块 - for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { - try { - List pageTextBlocks = new ArrayList<>(); - - // 为每页创建文本提取器 - TextBlockStripper stripper = new TextBlockStripper(); - stripper.setSortByPosition(true); - stripper.setSuppressDuplicateOverlappingText(false); - stripper.setAddMoreFormatting(false); - stripper.setStartPage(pageIndex + 1); - stripper.setEndPage(pageIndex + 1); - stripper.getText(document); - - // 获取当前页的文本块并排序 - pageTextBlocks.addAll(stripper.getTextBlocks()); - pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); - - // 处理当前页的文本块 - for (TextBlock block : pageTextBlocks) { - String lineText = block.getText().trim().replace('\n', ' ').trim(); - if (lineText.isEmpty()) { - continue; - } - - // 过滤高频文本 - if (frequentTexts.contains(lineText)) { - continue; - } - - float currentX = block.getX(); - - // 判断当前x坐标属于哪种类型 - boolean isIndent = Math.abs(currentX - xIndent) <= tolerance; - boolean isNormal = Math.abs(currentX - xNormal) <= tolerance; - - // 如果是缩进位置,说明是新段落的开始 - if (isIndent) { - if (!currentParagraph.isEmpty()) { - String paragraphText = String.join("", currentParagraph); - if (paragraphText.length() >= minParagraphLength) { - paragraphs.add(paragraphText.trim()); - } - currentParagraph.clear(); + log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}, commonEndX={}", xNormal, xIndent, tolerance, commonEndX); + + // 使用ConcurrentMap存储每页的段落 + ConcurrentHashMap> pageParagraphs = new ConcurrentHashMap<>(); + + // 创建段落提取任务 + List> paragraphTasks = IntStream.range(0, totalPages) + .mapToObj(pageIndex -> (Callable) () -> { + try { + List pageResult = new ArrayList<>(); + List pageTextBlocks = new ArrayList<>(); + + // 为每页创建文本提取器 + TextBlockStripper stripper = new TextBlockStripper(); + stripper.setSortByPosition(true); + stripper.setSuppressDuplicateOverlappingText(false); + stripper.setAddMoreFormatting(false); + stripper.setStartPage(pageIndex + 1); + stripper.setEndPage(pageIndex + 1); + stripper.getText(document); + + // 获取当前页的文本块并排序 + pageTextBlocks.addAll(stripper.getTextBlocks()); + pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); + + // 每页独立处理段落 + List currentParagraph = new ArrayList<>(); + boolean isInParagraph = false; + + // 处理当前页的文本块 + for (int i = 0; i < pageTextBlocks.size(); i++) { + TextBlock block = pageTextBlocks.get(i); + String lineText = block.getText().trim().replace('\n', ' ').trim(); + if (lineText.isEmpty()) { + continue; } - if (lineText.length() >= minParagraphLength) { - currentParagraph.add(lineText); + + // 过滤高频文本 + if (frequentTexts.contains(lineText)) { + continue; } - } - // 如果是正常位置,追加到当前段落 - else if (isNormal) { - if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落 - currentParagraph.add(lineText); - } else { - currentParagraph.add(lineText); + + float currentX = block.getX(); + float currentEndX = block.getEndX(); + + // 判断当前x坐标属于哪种类型 + boolean isIndent = Math.abs(currentX - xIndent) <= tolerance; + boolean isNormal = Math.abs(currentX - xNormal) <= tolerance; + + // 如果是缩进位置,说明是新段落的开始 + if (isIndent) { + if (!currentParagraph.isEmpty()) { + String paragraphText = String.join("", currentParagraph); + if (paragraphText.length() >= minParagraphLength) { + pageResult.add(paragraphText.trim()); + } + currentParagraph.clear(); + } + if (lineText.length() >= minParagraphLength) { + currentParagraph.add(lineText); + isInParagraph = true; + } } - } - // 如果既不是缩进也不是正常位置,作为独立段落 - else { - // 如果独立段落字数满足要求进行统计,不满足要求跳过 - if (lineText.length() >= minParagraphLength) { + // 如果是正常位置 + else if (isNormal) { + if (!isInParagraph) { + // 检查是否是段落的开始(终点x在最大值范围内) + if (currentEndX <= commonEndX * 0.95) { // 使用95%作为阈值 + isInParagraph = true; + currentParagraph.add(lineText); + } + } else { + // 检查是否应该结束当前段落 + if (currentEndX > commonEndX * 0.95) { + // 当前行结束,检查下一行 + if (i + 1 < pageTextBlocks.size()) { + TextBlock nextBlock = pageTextBlocks.get(i + 1); + float nextX = nextBlock.getX(); + boolean nextIsNormal = Math.abs(nextX - xNormal) <= tolerance; + + if (!nextIsNormal) { + // 下一行不是正常位置,结束当前段落 + String paragraphText = String.join("", currentParagraph); + if (paragraphText.length() >= minParagraphLength) { + pageResult.add(paragraphText.trim()); + } + currentParagraph.clear(); + isInParagraph = false; + } else { + // 下一行是正常位置,继续当前段落 + currentParagraph.add(lineText); + } + } else { + // 已经是最后一行,结束当前段落 + currentParagraph.add(lineText); + String paragraphText = String.join("", currentParagraph); + if (paragraphText.length() >= minParagraphLength) { + pageResult.add(paragraphText.trim()); + } + currentParagraph.clear(); + isInParagraph = false; + } + } else { + // 继续当前段落 + currentParagraph.add(lineText); + } + } + } + // 如果既不是缩进也不是正常位置,作为独立段落 + else { if (!currentParagraph.isEmpty()) { String paragraphText = String.join("", currentParagraph); if (paragraphText.length() >= minParagraphLength) { - paragraphs.add(paragraphText.trim()); + pageResult.add(paragraphText.trim()); } currentParagraph.clear(); + isInParagraph = false; + } + if (lineText.length() >= minParagraphLength) { + pageResult.add(lineText.trim()); } - paragraphs.add(lineText.trim()); } } + + // 处理最后一个段落 + if (!currentParagraph.isEmpty()) { + String paragraphText = String.join("", currentParagraph); + if (paragraphText.length() >= minParagraphLength) { + pageResult.add(paragraphText.trim()); + } + } + + // 保存当前页的结果 + pageParagraphs.put(pageIndex, pageResult); + + } catch (Exception e) { + if (enableWarningLogs) { + log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage()); + } } - } catch (Exception e) { - log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage()); - // 继续处理下一页 - } - } - - // 处理最后一个段落 - if (!currentParagraph.isEmpty()) { - String paragraphText = String.join("", currentParagraph); - if (paragraphText.length() >= minParagraphLength) { - paragraphs.add(paragraphText.trim()); + return null; + }) + .collect(Collectors.toList()); + + // 提交所有段落提取任务并等待完成 + executor.invokeAll(paragraphTasks); + + // 按页码顺序合并所有段落 + for (int i = 0; i < totalPages; i++) { + List pageParagraphList = pageParagraphs.get(i); + if (pageParagraphList != null) { + paragraphs.addAll(pageParagraphList); } } - + document.close(); - log.info("PDF解析完成,提取段落数: {}", paragraphs.size()); - + log.info("PDF多线程解析完成,提取段落数: {}", paragraphs.size()); + } catch (IOException e) { log.error("提取PDF段落失败: {}", e.getMessage(), e); + } catch (InterruptedException e) { + log.error("多线程处理被中断: {}", e.getMessage(), e); + Thread.currentThread().interrupt(); + } finally { + if (executor != null) { + executor.shutdown(); + } } - + return paragraphs; } /** * 检查PDF文件是否可被有效解析 - * + * * @param filePath PDF文件路径 * @return 是否可解析 */ @@ -261,14 +407,16 @@ public class PdfParserUtils { if (!pdfFile.exists() || !pdfFile.isFile()) { return false; } - + try { PDDocument document = PDDocument.load(pdfFile); int pageCount = document.getNumberOfPages(); document.close(); return pageCount > 0; } catch (Exception e) { - log.error("检查PDF有效性时出错: {}", e.getMessage()); + if (enableWarningLogs) { + log.error("检查PDF有效性时出错: {}", e.getMessage()); + } return false; } } @@ -281,7 +429,7 @@ public class PdfParserUtils { private float lastY = -1; private String currentLine = ""; private float currentX = 0; - + public TextBlockStripper() throws IOException { super(); // 初始化 @@ -290,20 +438,20 @@ public class PdfParserUtils { currentLine = ""; currentX = 0; } - + @Override protected void processTextPosition(TextPosition text) { try { float textX = text.getXDirAdj(); float textY = text.getYDirAdj(); - + // 如果Y坐标变化超过一定阈值,认为是新行 if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) { // 保存上一行 if (!currentLine.trim().isEmpty()) { - textBlocks.add(new TextBlock(currentX, lastY, currentLine)); + textBlocks.add(new TextBlock(currentX, textX, lastY, currentLine)); } - + // 开始新行 currentLine = text.getUnicode(); currentX = textX; @@ -314,12 +462,14 @@ public class PdfParserUtils { } } catch (Exception e) { // 忽略单个字符处理错误,继续处理其他字符 - log.debug("处理文本位置时出错: {}", e.getMessage()); + if (enableWarningLogs) { + log.debug("处理文本位置时出错: {}", e.getMessage()); + } } - + super.processTextPosition(text); } - + @Override protected void startPage(PDPage page) throws IOException { // 清空textBlocks列表,避免累积所有页面的内容 @@ -329,16 +479,16 @@ public class PdfParserUtils { currentX = 0; super.startPage(page); } - + @Override public void endDocument(PDDocument document) throws IOException { // 保存最后一行 if (!currentLine.trim().isEmpty()) { - textBlocks.add(new TextBlock(currentX, lastY, currentLine)); + textBlocks.add(new TextBlock(currentX, currentX, lastY, currentLine)); } super.endDocument(document); } - + public List getTextBlocks() { return textBlocks; } @@ -349,23 +499,29 @@ public class PdfParserUtils { */ private static class TextBlock { private final float x; + private final float endX; private final float y; private final String text; - - public TextBlock(float x, float y, String text) { + + public TextBlock(float x, float endX, float y, String text) { this.x = x; + this.endX = endX; this.y = y; this.text = text; } - + public float getX() { return x; } - + + public float getEndX() { + return endX; + } + public float getY() { return y; } - + public String getText() { return text; }