From e3d69d597467c94d839dfc19ad63345e4919c16a Mon Sep 17 00:00:00 2001
From: zhouhaibin
Date: Sat, 3 May 2025 17:46:48 +0800
Subject: [PATCH] =?UTF-8?q?sjj=20=E5=8A=9F=E8=83=BD=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E4=B8=8E=E4=BC=98=E5=8C=963?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (free text between "---" and the first diff is not part of the commit):
- The pom.xml hunk below reached review with its XML markup and context lines already
  stripped. It is reproduced as received and has to be restored from the original commit
  before this patch can be applied.
- That hunk appears to add pdfbox 2.0.27 with test scope to ruoyi-admin only, while
  PdfParserUtils (main code of zaojia-productManagement) imports PDFBox. Please confirm
  that module declares the dependency itself.
- The blob ids on the "index" lines are those of the original patch.
- Java changes: documents are closed with try-with-resources, each page is parsed once,
  and the repeated-text threshold is at least 1 so single-page PDFs are not emptied.

 ruoyi-admin/pom.xml                                |   8 +
 .../org/dromara/test/PdfExtractorTest.java         | 290 ++++++++++++++
 .../utils/PdfParserUtils.java                      | 340 ++++++++++++++++
 3 files changed, 638 insertions(+)
 create mode 100644 ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
 create mode 100644 zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java

diff --git a/ruoyi-admin/pom.xml b/ruoyi-admin/pom.xml
index 9c26130..f3592cb 100644
--- a/ruoyi-admin/pom.xml
+++ b/ruoyi-admin/pom.xml
@@ -197,6 +197,14 @@ + + + org.apache.pdfbox + pdfbox + 2.0.27 + test + +
diff --git a/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java b/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
new file mode 100644
index 0000000..cccd676
--- /dev/null
+++ b/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
@@ -0,0 +1,290 @@
+package org.dromara.test;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+/**
+ * Manual smoke test for PDF paragraph extraction.
+ *
+ * Lines are grouped into paragraphs by the x coordinate at which they start: the two most
+ * frequent left positions are treated as the normal margin and the first-line indent.
+ */
+@DisplayName("PDF段落提取测试")
+public class PdfExtractorTest {
+
+    private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName());
+
+    /** System property that overrides the location of the sample PDF. */
+    private static final String PDF_PATH_PROPERTY = "pdf.test.path";
+    /** Sample PDF used when the property is not set (developer-machine path). */
+    private static final String DEFAULT_PDF_PATH = "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf";
+
+    /** Minimum number of characters for a paragraph to be kept. */
+    private static final int MIN_PARAGRAPH_LENGTH = 20;
+    /** Only lines starting left of this x value take part in the margin statistics. */
+    private static final float MAX_INDENT_X = 100f;
+    /** Allowed distance between a line start and a reference x coordinate. */
+    private static final float TOLERANCE = 2f;
+
+    @Test
+    @DisplayName("测试PDF段落提取")
+    public void testExtractParagraphs() {
+        String pdfPath = System.getProperty(PDF_PATH_PROPERTY, DEFAULT_PDF_PATH);
+        // Skip instead of silently passing on machines that do not have the sample document.
+        Assumptions.assumeTrue(new File(pdfPath).isFile(), "sample PDF not found: " + pdfPath);
+
+        List<String> paragraphs = extractParagraphsFromPdf(pdfPath);
+
+        System.out.println("提取段落总数: " + paragraphs.size());
+        for (int i = 0; i < paragraphs.size(); i++) {
+            if (paragraphs.get(i).length() >= MIN_PARAGRAPH_LENGTH) {
+                System.out.println("段落" + (i + 1) + ": " + paragraphs.get(i).trim());
+            }
+        }
+    }
+
+    /**
+     * Extracts paragraphs from a PDF by classifying each line by its starting x coordinate.
+     *
+     * @param filePath path of the PDF file
+     * @return the extracted paragraphs; empty when the file cannot be read or fewer than two
+     *         distinct margin positions are found
+     */
+    public List<String> extractParagraphsFromPdf(String filePath) {
+        List<String> paragraphs = new ArrayList<>();
+
+        // try-with-resources closes the document on every path, including exceptions.
+        try (PDDocument document = PDDocument.load(new File(filePath))) {
+            int totalPages = document.getNumberOfPages();
+
+            // Step 1: parse each page once; the lines are reused by both passes below.
+            List<List<TextBlock>> pages = new ArrayList<>();
+            for (int pageNo = 1; pageNo <= totalPages; pageNo++) {
+                pages.add(readPage(document, pageNo));
+            }
+
+            // Step 2: collect line-start positions and count how often each line text occurs.
+            List<Float> xCoordinates = new ArrayList<>();
+            Map<String, Integer> textFrequency = new HashMap<>();
+            for (List<TextBlock> page : pages) {
+                for (TextBlock block : page) {
+                    String text = normalize(block.getText());
+                    if (text.isEmpty()) {
+                        continue;
+                    }
+                    if (block.getX() < MAX_INDENT_X) {
+                        xCoordinates.add(block.getX());
+                    }
+                    if (text.length() >= MIN_PARAGRAPH_LENGTH) {
+                        textFrequency.merge(text, 1, Integer::sum);
+                    }
+                }
+            }
+
+            if (xCoordinates.isEmpty()) {
+                return paragraphs;
+            }
+
+            // Texts repeated on more than half of the pages are treated as headers/footers.
+            // The threshold is at least 1: with totalPages / 2 == 0 every line would be dropped.
+            int frequencyThreshold = Math.max(1, totalPages / 2);
+            Set<String> frequentTexts = textFrequency.entrySet().stream()
+                .filter(entry -> entry.getValue() > frequencyThreshold)
+                .map(Map.Entry::getKey)
+                .collect(Collectors.toSet());
+
+            System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)");
+            frequentTexts.forEach(text -> System.out.println("高频文本: "
+                + (text.length() > 50 ? text.substring(0, 47) + "..." : text)
+                + " 出现次数: " + textFrequency.get(text)));
+
+            // Step 3: the two most frequent line-start positions are the margin and the indent.
+            Map<Float, Long> xCounter = xCoordinates.stream()
+                .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
+            List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream()
+                .sorted(Map.Entry.<Float, Long>comparingByValue().reversed())
+                .limit(2)
+                .collect(Collectors.toList());
+
+            if (mostCommonX.size() < 2) {
+                return paragraphs;
+            }
+
+            // The smaller x is the normal margin, the larger one the first-line indent.
+            float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey());
+            float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey());
+
+            System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE);
+
+            // Step 4: walk the lines page by page and assemble paragraphs.
+            List<String> currentParagraph = new ArrayList<>();
+            for (List<TextBlock> page : pages) {
+                for (TextBlock block : page) {
+                    String lineText = normalize(block.getText());
+                    if (lineText.isEmpty() || frequentTexts.contains(lineText)) {
+                        continue;
+                    }
+
+                    boolean isIndent = Math.abs(block.getX() - xIndent) <= TOLERANCE;
+                    boolean isNormal = Math.abs(block.getX() - xNormal) <= TOLERANCE;
+
+                    if (isIndent) {
+                        // An indented line starts a new paragraph.
+                        flushParagraph(currentParagraph, paragraphs);
+                        if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
+                            currentParagraph.add(lineText);
+                        }
+                    } else if (isNormal) {
+                        // A line at the normal margin continues the current paragraph.
+                        currentParagraph.add(lineText);
+                    } else if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
+                        // Any other position: a long enough line is a paragraph of its own.
+                        flushParagraph(currentParagraph, paragraphs);
+                        paragraphs.add(lineText);
+                    }
+                }
+            }
+            flushParagraph(currentParagraph, paragraphs);
+
+        } catch (IOException e) {
+            logger.log(Level.SEVERE, "提取PDF段落失败: " + e.getMessage(), e);
+        }
+
+        return paragraphs;
+    }
+
+    /** Parses one page (1-based) and returns its lines ordered top to bottom. */
+    private static List<TextBlock> readPage(PDDocument document, int pageNo) throws IOException {
+        TextBlockStripper stripper = new TextBlockStripper();
+        stripper.setSortByPosition(true);
+        // Duplicate suppression and extra formatting stay disabled; the original comments
+        // say enabling them may lose Chinese text.
+        stripper.setSuppressDuplicateOverlappingText(false);
+        stripper.setAddMoreFormatting(false);
+        stripper.setStartPage(pageNo);
+        stripper.setEndPage(pageNo);
+        stripper.getText(document);
+
+        List<TextBlock> blocks = new ArrayList<>(stripper.getTextBlocks());
+        blocks.sort(Comparator.comparing(TextBlock::getY));
+        return blocks;
+    }
+
+    /** Trims a raw line and replaces embedded line breaks with spaces. */
+    private static String normalize(String raw) {
+        return raw.trim().replace('\n', ' ').trim();
+    }
+
+    /** Moves the pending lines into {@code paragraphs} if they are long enough, then resets them. */
+    private static void flushParagraph(List<String> pending, List<String> paragraphs) {
+        if (pending.isEmpty()) {
+            return;
+        }
+        String paragraphText = String.join("", pending);
+        if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
+            paragraphs.add(paragraphText);
+        }
+        pending.clear();
+    }
+
+    /** PDFTextStripper that records each visual line together with the position where it starts. */
+    private static class TextBlockStripper extends PDFTextStripper {
+        private final List<TextBlock> textBlocks = new ArrayList<>();
+        private final StringBuilder currentLine = new StringBuilder();
+        private float lastY = -1;
+        private float currentX = 0;
+
+        public TextBlockStripper() throws IOException {
+            super();
+        }
+
+        @Override
+        protected void processTextPosition(TextPosition text) {
+            float textY = text.getYDirAdj();
+            // A vertical jump of more than half the font size starts a new line.
+            if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
+                flushLine();
+                currentX = text.getXDirAdj();
+                lastY = textY;
+            }
+            currentLine.append(text.getUnicode());
+            super.processTextPosition(text);
+        }
+
+        @Override
+        protected void startPage(PDPage page) throws IOException {
+            // Reset per-page state so lines from earlier pages do not accumulate.
+            textBlocks.clear();
+            currentLine.setLength(0);
+            lastY = -1;
+            currentX = 0;
+            super.startPage(page);
+        }
+
+        @Override
+        public void endDocument(PDDocument document) throws IOException {
+            // The last line has no following line to trigger a flush.
+            flushLine();
+            super.endDocument(document);
+        }
+
+        public List<TextBlock> getTextBlocks() {
+            return textBlocks;
+        }
+
+        /** Stores the buffered line, if it has visible content, and clears the buffer. */
+        private void flushLine() {
+            String line = currentLine.toString();
+            if (!line.trim().isEmpty()) {
+                textBlocks.add(new TextBlock(currentX, lastY, line));
+            }
+            currentLine.setLength(0);
+        }
+    }
+
+    /** One visual line of text and the coordinates of its first glyph. */
+    private static class TextBlock {
+        private final float x;
+        private final float y;
+        private final String text;
+
+        public TextBlock(float x, float y, String text) {
+            this.x = x;
+            this.y = y;
+            this.text = text;
+        }
+
+        public float getX() {
+            return x;
+        }
+
+        public float getY() {
+            return y;
+        }
+
+        public String getText() {
+            return text;
+        }
+
+        @Override
+        public String toString() {
+            return "TextBlock{"
+                + "x=" + x
+                + ", y=" + y
+                + ", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\''
+                + '}';
+        }
+    }
+}
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
new file mode 100644
index 0000000..274b7fa
--- /dev/null
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
@@ -0,0 +1,340 @@
+package org.dromara.productManagement.utils;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.rendering.ImageType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * PDF parsing utility.
+ *
+ * Extracts paragraphs from a PDF document: first-line indentation marks paragraph starts, and
+ * texts that repeat on many pages (typically headers and footers) are filtered out.
+ *
+ * Usage:
+ * {@code
+ *   // default parameters
+ *   List<String> paragraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf");
+ *
+ *   // custom parameters
+ *   List<String> customParagraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf",
+ *       30,    // minimum paragraph length
+ *       120,   // maximum indent x
+ *       3.0f); // tolerance
+ * }
+ */
+public class PdfParserUtils {
+
+    private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class);
+
+    /** Default minimum number of characters for a paragraph to be kept. */
+    private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20;
+    /** Default upper bound for line-start x values used in the margin statistics. */
+    private static final float DEFAULT_MAX_INDENT_X = 100f;
+    /** Default allowed distance between a line start and a reference x coordinate. */
+    private static final float DEFAULT_TOLERANCE = 2f;
+
+    /**
+     * Extracts paragraphs from a PDF file using the default parameters.
+     *
+     * @param filePath path of the PDF file
+     * @return the extracted paragraphs; empty when nothing could be extracted
+     */
+    public static List<String> extractParagraphs(String filePath) {
+        return extractParagraphs(filePath, DEFAULT_MIN_PARAGRAPH_LENGTH, DEFAULT_MAX_INDENT_X, DEFAULT_TOLERANCE);
+    }
+
+    /**
+     * Extracts paragraphs from a PDF file.
+     *
+     * Each text line is classified by the x coordinate at which it starts. The two most frequent
+     * start positions left of {@code maxIndentX} are taken as the normal margin (smaller x) and
+     * the first-line indent (larger x). Failures are logged, never thrown.
+     *
+     * @param filePath           path of the PDF file
+     * @param minParagraphLength minimum number of characters for a paragraph to be kept
+     * @param maxIndentX         only lines starting left of this x value enter the statistics
+     * @param tolerance          allowed distance between a line start and a reference x value
+     * @return the extracted paragraphs; empty when the file is missing or unreadable, or when
+     *         fewer than two distinct start positions are found
+     */
+    public static List<String> extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) {
+        List<String> paragraphs = new ArrayList<>();
+
+        if (filePath == null || !new File(filePath).isFile()) {
+            log.error("PDF文件不存在: {}", filePath);
+            return paragraphs;
+        }
+
+        // NOTE(review): JVM-wide property kept from the original code. It looks like a rendering
+        // (CMYK conversion) switch rather than a text-extraction one - confirm it is needed here.
+        System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
+
+        // try-with-resources closes the document on every path, including exceptions.
+        try (PDDocument document = PDDocument.load(new File(filePath))) {
+            // Kept from the original: the resource cache is disabled for this document.
+            document.setResourceCache(null);
+
+            int totalPages = document.getNumberOfPages();
+            log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages);
+
+            // Step 1: parse each page once; the lines are reused by both passes below.
+            List<List<TextBlock>> pages = new ArrayList<>();
+            for (int pageNo = 1; pageNo <= totalPages; pageNo++) {
+                pages.add(readPage(document, pageNo));
+            }
+
+            // Step 2: collect line-start positions and count how often each line text occurs.
+            List<Float> xCoordinates = new ArrayList<>();
+            Map<String, Integer> textFrequency = new HashMap<>();
+            for (List<TextBlock> page : pages) {
+                for (TextBlock block : page) {
+                    String text = normalize(block.getText());
+                    if (text.isEmpty()) {
+                        continue;
+                    }
+                    if (block.getX() < maxIndentX) {
+                        xCoordinates.add(block.getX());
+                    }
+                    if (text.length() >= minParagraphLength) {
+                        textFrequency.merge(text, 1, Integer::sum);
+                    }
+                }
+            }
+
+            if (xCoordinates.isEmpty()) {
+                log.warn("未找到有效的X坐标,无法提取段落");
+                return paragraphs;
+            }
+
+            // Texts repeated on more than half of the pages are treated as headers/footers.
+            // The threshold is at least 1: with totalPages / 2 == 0 every line would be dropped.
+            int frequencyThreshold = Math.max(1, totalPages / 2);
+            Set<String> frequentTexts = textFrequency.entrySet().stream()
+                .filter(entry -> entry.getValue() > frequencyThreshold)
+                .map(Map.Entry::getKey)
+                .collect(Collectors.toSet());
+            log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold);
+
+            // Step 3: the two most frequent start positions are the margin and the indent.
+            Map<Float, Long> xCounter = xCoordinates.stream()
+                .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
+            List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream()
+                .sorted(Map.Entry.<Float, Long>comparingByValue().reversed())
+                .limit(2)
+                .collect(Collectors.toList());
+
+            if (mostCommonX.size() < 2) {
+                log.warn("未找到足够的X坐标特征,无法区分段落缩进");
+                return paragraphs;
+            }
+
+            // The smaller x is the normal margin, the larger one the first-line indent.
+            float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey());
+            float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey());
+            log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance);
+
+            // Step 4: walk the lines page by page and assemble paragraphs.
+            List<String> currentParagraph = new ArrayList<>();
+            for (List<TextBlock> page : pages) {
+                for (TextBlock block : page) {
+                    String lineText = normalize(block.getText());
+                    if (lineText.isEmpty() || frequentTexts.contains(lineText)) {
+                        continue;
+                    }
+
+                    boolean isIndent = Math.abs(block.getX() - xIndent) <= tolerance;
+                    boolean isNormal = Math.abs(block.getX() - xNormal) <= tolerance;
+
+                    if (isIndent) {
+                        // An indented line starts a new paragraph.
+                        flushParagraph(currentParagraph, paragraphs, minParagraphLength);
+                        if (lineText.length() >= minParagraphLength) {
+                            currentParagraph.add(lineText);
+                        }
+                    } else if (isNormal) {
+                        // A line at the normal margin continues the current paragraph.
+                        currentParagraph.add(lineText);
+                    } else if (lineText.length() >= minParagraphLength) {
+                        // Any other position: a long enough line is a paragraph of its own.
+                        flushParagraph(currentParagraph, paragraphs, minParagraphLength);
+                        paragraphs.add(lineText.trim());
+                    }
+                }
+            }
+            flushParagraph(currentParagraph, paragraphs, minParagraphLength);
+
+            log.info("PDF解析完成,提取段落数: {}", paragraphs.size());
+
+        } catch (IOException e) {
+            log.error("提取PDF段落失败: {}", e.getMessage(), e);
+        }
+
+        return paragraphs;
+    }
+
+    /**
+     * Parses one page and returns its lines ordered top to bottom.
+     *
+     * Best effort: a page that cannot be parsed is logged and yields an empty list so the
+     * remaining pages are still processed.
+     *
+     * @param document the open document
+     * @param pageNo   1-based page number
+     */
+    private static List<TextBlock> readPage(PDDocument document, int pageNo) {
+        try {
+            TextBlockStripper stripper = new TextBlockStripper();
+            stripper.setSortByPosition(true);
+            // Duplicate suppression and extra formatting stay disabled; the original comments
+            // say enabling them may lose Chinese text.
+            stripper.setSuppressDuplicateOverlappingText(false);
+            stripper.setAddMoreFormatting(false);
+            stripper.setStartPage(pageNo);
+            stripper.setEndPage(pageNo);
+            stripper.getText(document);
+
+            List<TextBlock> blocks = new ArrayList<>(stripper.getTextBlocks());
+            blocks.sort(Comparator.comparing(TextBlock::getY));
+            return blocks;
+        } catch (Exception e) {
+            // Deliberately broad: one malformed page must not abort the whole document.
+            log.warn("处理第{}页时发生错误: {}", pageNo, e.getMessage(), e);
+            return new ArrayList<>();
+        }
+    }
+
+    /** Trims a raw line and replaces embedded line breaks with spaces. */
+    private static String normalize(String raw) {
+        return raw.trim().replace('\n', ' ').trim();
+    }
+
+    /** Moves the pending lines into {@code paragraphs} if long enough, then resets them. */
+    private static void flushParagraph(List<String> pending, List<String> paragraphs, int minParagraphLength) {
+        if (pending.isEmpty()) {
+            return;
+        }
+        String paragraphText = String.join("", pending);
+        if (paragraphText.length() >= minParagraphLength) {
+            paragraphs.add(paragraphText.trim());
+        }
+        pending.clear();
+    }
+
+    /**
+     * Checks whether a file can be opened as a PDF with at least one page.
+     *
+     * @param filePath path of the PDF file
+     * @return {@code true} if the file exists, loads and has one or more pages
+     */
+    public static boolean isValidPdf(String filePath) {
+        if (filePath == null) {
+            return false;
+        }
+        File pdfFile = new File(filePath);
+        if (!pdfFile.isFile()) {
+            return false;
+        }
+
+        try (PDDocument document = PDDocument.load(pdfFile)) {
+            return document.getNumberOfPages() > 0;
+        } catch (Exception e) {
+            log.error("检查PDF有效性时出错: {}", e.getMessage());
+            return false;
+        }
+    }
+
+    /** PDFTextStripper that records each visual line together with the position where it starts. */
+    private static class TextBlockStripper extends PDFTextStripper {
+        private final List<TextBlock> textBlocks = new ArrayList<>();
+        private final StringBuilder currentLine = new StringBuilder();
+        private float lastY = -1;
+        private float currentX = 0;
+
+        public TextBlockStripper() throws IOException {
+            super();
+        }
+
+        @Override
+        protected void processTextPosition(TextPosition text) {
+            try {
+                float textY = text.getYDirAdj();
+                // A vertical jump of more than half the font size starts a new line.
+                if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
+                    flushLine();
+                    currentX = text.getXDirAdj();
+                    lastY = textY;
+                }
+                currentLine.append(text.getUnicode());
+            } catch (Exception e) {
+                // Best effort, as in the original: a bad glyph must not stop the extraction.
+                log.debug("处理文本位置时出错: {}", e.getMessage());
+            }
+
+            super.processTextPosition(text);
+        }
+
+        @Override
+        protected void startPage(PDPage page) throws IOException {
+            // Reset per-page state so lines from earlier pages do not accumulate.
+            textBlocks.clear();
+            currentLine.setLength(0);
+            lastY = -1;
+            currentX = 0;
+            super.startPage(page);
+        }
+
+        @Override
+        public void endDocument(PDDocument document) throws IOException {
+            // The last line has no following line to trigger a flush.
+            flushLine();
+            super.endDocument(document);
+        }
+
+        public List<TextBlock> getTextBlocks() {
+            return textBlocks;
+        }
+
+        /** Stores the buffered line, if it has visible content, and clears the buffer. */
+        private void flushLine() {
+            String line = currentLine.toString();
+            if (!line.trim().isEmpty()) {
+                textBlocks.add(new TextBlock(currentX, lastY, line));
+            }
+            currentLine.setLength(0);
+        }
+    }
+
+    /** One visual line of text and the coordinates of its first glyph. */
+    private static class TextBlock {
+        private final float x;
+        private final float y;
+        private final String text;
+
+        public TextBlock(float x, float y, String text) {
+            this.x = x;
+            this.y = y;
+            this.text = text;
+        }
+
+        public float getX() {
+            return x;
+        }
+
+        public float getY() {
+            return y;
+        }
+
+        public String getText() {
+            return text;
+        }
+    }
+}