sjj 功能更新与优化3

5 months ago · e3d69d5974
3 changed files with 713 additions and 0 deletions
--- a/ruoyi-admin/pom.xml
+++ b/ruoyi-admin/pom.xml
@ -197,6 +197,14 @@
 <!--            <version>${与你的agent探针版本保持一致}</version>-->
 <!--        </dependency>-->

+        <!-- PDFBox 依赖 -->
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+            <version>2.0.27</version>
+            <scope>test</scope>
+        </dependency>
+
    </dependencies>

    <build>
--- a/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
+++ b/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
@ -0,0 +1,332 @@
+package org.dromara.test;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+/**
+ * PDF段落提取测试
+ */
+@DisplayName("PDF段落提取测试")
+public class PdfExtractorTest {
+
+    private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName());
+
+    // 段落最小字数阈值
+    private static final int MIN_PARAGRAPH_LENGTH = 20;
+    // 最大缩进值
+    private static final float MAX_INDENT_X = 100f;
+    // 容差范围
+    private static final float TOLERANCE = 2f;
+
+    /**
+     * Smoke test: extracts the paragraphs of a sample PDF and prints every paragraph that is
+     * at least {@code MIN_PARAGRAPH_LENGTH} characters long.
+     *
+     * <p>The PDF location is taken from the {@code pdf.test.path} system property and falls
+     * back to the original developer-local path. When no file exists at that location the
+     * test is skipped instead of passing vacuously with an empty result.
+     */
+    @Test
+    @DisplayName("测试PDF段落提取")
+    public void testExtractParagraphs() {
+        String pdfPath = System.getProperty(
+            "pdf.test.path",
+            "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf");
+
+        // Fully qualified so that no additional import is required.
+        org.junit.jupiter.api.Assumptions.assumeTrue(
+            new File(pdfPath).isFile(), "sample PDF not found: " + pdfPath);
+
+        List<String> paragraphs = extractParagraphsFromPdf(pdfPath);
+
+        System.out.println("提取段落总数: " + paragraphs.size());
+        for (int i = 0; i < paragraphs.size(); i++) {
+            String paragraph = paragraphs.get(i);
+            if (paragraph.length() >= MIN_PARAGRAPH_LENGTH) {
+                System.out.println("段落" + (i + 1) + ": " + paragraph.trim());
+            }
+        }
+    }
+
+    /**
+     * 从PDF文件中提取段落，基于x坐标统计来判断段落
+     */
+    public List<String> extractParagraphsFromPdf(String filePath) {
+        List<String> paragraphs = new ArrayList<>();
+        File pdfFile = new File(filePath);
+
+        try {
+            // 打开PDF文档
+            PDDocument document = PDDocument.load(pdfFile);
+            int totalPages = document.getNumberOfPages();
+
+            // 第一步：收集所有x坐标和重复文本
+            List<Float> xCoordinates = new ArrayList<>();
+            Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率
+
+            // 遍历每一页收集X坐标
+            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
+                final int currentPage = pageIndex; // 用于匿名类中引用
+
+                // 为每页创建文本提取器
+                TextBlockStripper stripper = new TextBlockStripper();
+                stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本，这可能导致中文文本丢失
+                stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
+                stripper.setStartPage(pageIndex + 1);
+                stripper.setEndPage(pageIndex + 1);
+                stripper.setSortByPosition(true);
+                stripper.getText(document);
+
+                for (TextBlock block : stripper.getTextBlocks()) {
+                    String text = block.getText().trim();
+                    if (text.length() > 0) {
+                        // 统计X坐标
+                        if (block.getX() < MAX_INDENT_X) {
+                            xCoordinates.add(block.getX());
+                        }
+
+                        // 统计文本频率
+                        if (text.length() >= MIN_PARAGRAPH_LENGTH) {
+                            textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1);
+                        }
+                    }
+                }
+            }
+
+            if (xCoordinates.isEmpty()) {
+                document.close();
+                return paragraphs;
+            }
+
+            // 找出频率超过页面数一半的文本
+            int frequencyThreshold = totalPages / 2;
+            Set<String> frequentTexts = textFrequency.entrySet().stream()
+                .filter(entry -> entry.getValue() > frequencyThreshold)
+                .map(Map.Entry::getKey)
+                .collect(Collectors.toSet());
+
+            System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)");
+            frequentTexts.forEach(text -> System.out.println("高频文本: " +
+                (text.length() > 50 ? text.substring(0, 47) + "..." : text) +
+                " 出现次数: " + textFrequency.get(text)));
+
+            // 第二步：统计x坐标频率并找出前两名
+            Map<Float, Long> xCounter = xCoordinates.stream()
+                .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
+
+            List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream()
+                .sorted(Map.Entry.<Float, Long>comparingByValue().reversed())
+                .limit(2)
+                .collect(Collectors.toList());
+
+            if (mostCommonX.size() < 2) {
+                document.close();
+                return paragraphs;
+            }
+
+            // 确保x_indent > x_normal
+            float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
+            float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
+
+            System.out.println("最终使用的坐标值：x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE);
+
+            // 第三步：根据基准x坐标提取段落
+            List<String> currentParagraph = new ArrayList<>();
+            int num=311;
+
+            // 逐页处理文本块
+            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
+                List<TextBlock> pageTextBlocks = new ArrayList<>();
+
+                // 为每页创建文本提取器
+                TextBlockStripper stripper = new TextBlockStripper();
+                stripper.setSortByPosition(true);
+                stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本，这可能导致中文文本丢失
+                stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
+                stripper.setStartPage(pageIndex + 1);
+                stripper.setEndPage(pageIndex + 1);
+                stripper.getText(document);
+
+                // 获取当前页的文本块并排序
+                pageTextBlocks.addAll(stripper.getTextBlocks());
+                pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
+
+                if(pageIndex==num){
+                    System.out.println(pageTextBlocks);
+                }
+
+                // 处理当前页的文本块
+                for (TextBlock block : pageTextBlocks) {
+                    String lineText = block.getText().trim().replace('\n', ' ').trim();
+                    if (lineText.isEmpty()) {
+                        continue;
+                    }
+
+                    // 过滤高频文本
+                    if (frequentTexts.contains(lineText)) {
+                        if (pageIndex == num) {
+                            System.out.println("过滤高频文本: " +
+                                (lineText.length() > 30 ? lineText.substring(0, 27) + "..." : lineText));
+                        }
+                        continue;
+                    }
+
+                    float currentX = block.getX();
+
+                    // 判断当前x坐标属于哪种类型
+                    boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE;
+                    boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE;
+
+                    // 如果是缩进位置，说明是新段落的开始
+                    if (isIndent) {
+                        if (!currentParagraph.isEmpty()) {
+                            String paragraphText = String.join("", currentParagraph);
+                            if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
+                                paragraphs.add(paragraphText);
+                            }
+                            currentParagraph.clear();
+                        }
+                        if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
+                            currentParagraph.add(lineText);
+                        }
+                    }
+                    // 如果是正常位置，追加到当前段落
+                    else if (isNormal) {
+                        if (currentParagraph.isEmpty()) {  // 如果还没有段落，创建新段落
+                            currentParagraph.add(lineText);
+                        } else {
+                            currentParagraph.add(lineText);
+                        }
+                    }
+                    // 如果既不是缩进也不是正常位置，作为独立段落
+                    else {
+                        // 如果独立段落字数满足要求进行统计，不满足要求跳过
+                        if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
+                            if (!currentParagraph.isEmpty()) {
+                                String paragraphText = String.join("", currentParagraph);
+                                if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
+                                    paragraphs.add(paragraphText);
+                                }
+                                currentParagraph.clear();
+                            }
+                            paragraphs.add(lineText);
+                        }
+                    }
+                }
+            }
+
+            // 处理最后一个段落
+            if (!currentParagraph.isEmpty()) {
+                String paragraphText = String.join("", currentParagraph);
+                if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
+                    paragraphs.add(paragraphText);
+                }
+            }
+
+            document.close();
+
+        } catch (IOException e) {
+            logger.severe("提取PDF段落失败: " + e.getMessage());
+            e.printStackTrace();
+        }
+
+        return paragraphs;
+    }
+
+    /**
+     * 用于提取文本块的PDFTextStripper
+     */
+    private static class TextBlockStripper extends PDFTextStripper {
+        private final List<TextBlock> textBlocks = new ArrayList<>();
+        private float lastY = -1;
+        private String currentLine = "";
+        private float currentX = 0;
+
+        public TextBlockStripper() throws IOException {
+            super();
+            // 初始化
+            textBlocks.clear();
+            lastY = -1;
+            currentLine = "";
+            currentX = 0;
+        }
+
+        @Override
+        protected void processTextPosition(TextPosition text) {
+            float textX = text.getXDirAdj();
+            float textY = text.getYDirAdj();
+            float endX = text.getEndX();
+            float endY = text.getEndY();
+            // 如果Y坐标变化超过一定阈值，认为是新行
+            if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
+                // 保存上一行
+                if (!currentLine.trim().isEmpty()) {
+                    textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+                }
+
+                // 开始新行
+                currentLine = text.getUnicode();
+                currentX = textX;
+                lastY = textY;
+            } else {
+                // 在同一行，追加文本
+                currentLine += text.getUnicode();
+            }
+
+            super.processTextPosition(text);
+        }
+
+        @Override
+        protected void startPage(PDPage page) throws IOException {
+            // 清空textBlocks列表，避免累积所有页面的内容
+            textBlocks.clear();
+            lastY = -1;
+            currentLine = "";
+            currentX = 0;
+            super.startPage(page);
+        }
+
+        @Override
+        public void endDocument(PDDocument document) throws IOException {
+            // 保存最后一行
+            if (!currentLine.trim().isEmpty()) {
+                textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+            }
+            super.endDocument(document);
+        }
+
+        public List<TextBlock> getTextBlocks() {
+            return textBlocks;
+        }
+    }
+
+    /**
+     * 用于存储文本块信息的类
+     */
+    private static class TextBlock {
+        private final float x;
+        private final float y;
+        private final String text;
+
+        public TextBlock(float x, float y, String text) {
+            this.x = x;
+            this.y = y;
+            this.text = text;
+        }
+
+        public float getX() {
+            return x;
+        }
+
+        public float getY() {
+            return y;
+        }
+
+        public String getText() {
+            return text;
+        }
+
+        @Override
+        public String toString() {
+            return "TextBlock{" +
+                "x=" + x +
+                ", y=" + y +
+                ", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\'' +
+                '}';
+        }
+    }
+}
--- a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
@ -0,0 +1,373 @@
+package org.dromara.productManagement.utils;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.rendering.ImageType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
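+/**
+ * PDF parsing utility.
+ *
+ * <p>Extracts the paragraphs of a PDF document; recognises first-line indentation and filters
+ * out high-frequency (repeated) text.
+ *
+ * <p>Usage:
+ * <pre>{@code
+ * // Default parameters
+ * List<String> paragraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf");
+ *
+ * // Custom parameters
+ * List<String> customParagraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf",
+ *     30,   // minimum paragraph length
+ *     120,  // maximum indent x
+ *     3.0f  // tolerance
+ * );
+ * }</pre>
+ */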
+public class PdfParserUtils {
+
+    private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class);
+
+    // 默认段落最小字数阈值
+    private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20;
+    // 默认最大缩进值
+    private static final float DEFAULT_MAX_INDENT_X = 100f;
+    // 默认容差范围
+    private static final float DEFAULT_TOLERANCE = 2f;
+
+    /**
+     * Extracts the paragraphs of a PDF file using the default thresholds
+     * ({@code DEFAULT_MIN_PARAGRAPH_LENGTH}, {@code DEFAULT_MAX_INDENT_X} and
+     * {@code DEFAULT_TOLERANCE}).
+     *
+     * @param filePath path of the PDF file
+     * @return the list of extracted paragraphs
+     */
+    public static List<String> extractParagraphs(String filePath) {
+        final int minLength = DEFAULT_MIN_PARAGRAPH_LENGTH;
+        final float maxIndent = DEFAULT_MAX_INDENT_X;
+        final float xTolerance = DEFAULT_TOLERANCE;
+        return extractParagraphs(filePath, minLength, maxIndent, xTolerance);
+    }
+
+    /**
+     * 从PDF文件中提取段落，支持自定义参数
+     *
+     * @param filePath PDF文件路径
+     * @param minParagraphLength 最小段落长度
+     * @param maxIndentX 最大缩进值
+     * @param tolerance 容差范围
+     * @return 提取的段落列表
+     */
+    public static List<String> extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) {
+        List<String> paragraphs = new ArrayList<>();
+        File pdfFile = new File(filePath);
+
+        if (!pdfFile.exists() || !pdfFile.isFile()) {
+            log.error("PDF文件不存在: {}", filePath);
+            return paragraphs;
+        }
+
+        try {
+            // 设置PDFBox选项，抑制字体警告
+            System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
+            
+            // 打开PDF文档
+            PDDocument document = PDDocument.load(pdfFile);
+            
+            // 设置PDFBox参数，提高对中文字体的兼容性
+            document.setResourceCache(null); // 禁用资源缓存，可能减少某些字体问题
+            
+            int totalPages = document.getNumberOfPages();
+
+            // 第一步：收集所有x坐标和重复文本
+            List<Float> xCoordinates = new ArrayList<>();
+            Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率
+
+            log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages);
+
+            // 遍历每一页收集X坐标
+            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
+                try {
+                    // 为每页创建文本提取器
+                    TextBlockStripper stripper = new TextBlockStripper();
+                    stripper.setSortByPosition(true);
+                    stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本，这可能导致中文文本丢失
+                    stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
+                    stripper.setStartPage(pageIndex + 1);
+                    stripper.setEndPage(pageIndex + 1);
+                    stripper.getText(document);
+                    
+                    for (TextBlock block : stripper.getTextBlocks()) {
+                        String text = block.getText().trim();
+                        if (text.length() > 0) {
+                            // 统计X坐标
+                            if (block.getX() < maxIndentX) {
+                                xCoordinates.add(block.getX());
+                            }
+                            
+                            // 统计文本频率
+                            if (text.length() >= minParagraphLength) {
+                                textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1);
+                            }
+                        }
+                    }
+                } catch (Exception e) {
+                    log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage());
+                    // 继续处理下一页，而不是中断整个过程
+                }
+            }
+
+            if (xCoordinates.isEmpty()) {
+                log.warn("未找到有效的X坐标，无法提取段落");
+                document.close();
+                return paragraphs;
+            }
+
+            // 找出频率超过页面数一半的文本（通常是页眉页脚等重复内容）
+            int frequencyThreshold = totalPages / 2;
+            Set<String> frequentTexts = textFrequency.entrySet().stream()
+                .filter(entry -> entry.getValue() > frequencyThreshold)
+                .map(Map.Entry::getKey)
+                .collect(Collectors.toSet());
+
+            log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold);
+
+            // 统计x坐标频率并找出前两名（通常是正常段落和首行缩进）
+            Map<Float, Long> xCounter = xCoordinates.stream()
+                .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
+
+            List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream()
+                .sorted(Map.Entry.<Float, Long>comparingByValue().reversed())
+                .limit(2)
+                .collect(Collectors.toList());
+
+            if (mostCommonX.size() < 2) {
+                log.warn("未找到足够的X坐标特征，无法区分段落缩进");
+                document.close();
+                return paragraphs;
+            }
+
+            // 确保x_indent > x_normal
+            float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
+            float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
+
+            log.info("使用的坐标值：x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance);
+
+            // 根据基准x坐标提取段落
+            List<String> currentParagraph = new ArrayList<>();
+
+            // 逐页处理文本块
+            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
+                try {
+                    List<TextBlock> pageTextBlocks = new ArrayList<>();
+                    
+                    // 为每页创建文本提取器
+                    TextBlockStripper stripper = new TextBlockStripper();
+                    stripper.setSortByPosition(true);
+                    stripper.setSuppressDuplicateOverlappingText(false);
+                    stripper.setAddMoreFormatting(false);
+                    stripper.setStartPage(pageIndex + 1);
+                    stripper.setEndPage(pageIndex + 1);
+                    stripper.getText(document);
+                    
+                    // 获取当前页的文本块并排序
+                    pageTextBlocks.addAll(stripper.getTextBlocks());
+                    pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
+                    
+                    // 处理当前页的文本块
+                    for (TextBlock block : pageTextBlocks) {
+                        String lineText = block.getText().trim().replace('\n', ' ').trim();
+                        if (lineText.isEmpty()) {
+                            continue;
+                        }
+                        
+                        // 过滤高频文本
+                        if (frequentTexts.contains(lineText)) {
+                            continue;
+                        }
+                        
+                        float currentX = block.getX();
+                        
+                        // 判断当前x坐标属于哪种类型
+                        boolean isIndent = Math.abs(currentX - xIndent) <= tolerance;
+                        boolean isNormal = Math.abs(currentX - xNormal) <= tolerance;
+                        
+                        // 如果是缩进位置，说明是新段落的开始
+                        if (isIndent) {
+                            if (!currentParagraph.isEmpty()) {
+                                String paragraphText = String.join("", currentParagraph);
+                                if (paragraphText.length() >= minParagraphLength) {
+                                    paragraphs.add(paragraphText.trim());
+                                }
+                                currentParagraph.clear();
+                            }
+                            if (lineText.length() >= minParagraphLength) {
+                                currentParagraph.add(lineText);
+                            }
+                        }
+                        // 如果是正常位置，追加到当前段落
+                        else if (isNormal) {
+                            if (currentParagraph.isEmpty()) {  // 如果还没有段落，创建新段落
+                                currentParagraph.add(lineText);
+                            } else {
+                                currentParagraph.add(lineText);
+                            }
+                        }
+                        // 如果既不是缩进也不是正常位置，作为独立段落
+                        else {
+                            // 如果独立段落字数满足要求进行统计，不满足要求跳过
+                            if (lineText.length() >= minParagraphLength) {
+                                if (!currentParagraph.isEmpty()) {
+                                    String paragraphText = String.join("", currentParagraph);
+                                    if (paragraphText.length() >= minParagraphLength) {
+                                        paragraphs.add(paragraphText.trim());
+                                    }
+                                    currentParagraph.clear();
+                                }
+                                paragraphs.add(lineText.trim());
+                            }
+                        }
+                    }
+                } catch (Exception e) {
+                    log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage());
+                    // 继续处理下一页
+                }
+            }
+            
+            // 处理最后一个段落
+            if (!currentParagraph.isEmpty()) {
+                String paragraphText = String.join("", currentParagraph);
+                if (paragraphText.length() >= minParagraphLength) {
+                    paragraphs.add(paragraphText.trim());
+                }
+            }
+            
+            document.close();
+            log.info("PDF解析完成，提取段落数: {}", paragraphs.size());
+            
+        } catch (IOException e) {
+            log.error("提取PDF段落失败: {}", e.getMessage(), e);
+        }
+        
+        return paragraphs;
+    }
+
+    /**
+     * Checks whether a file can be opened as a PDF that has at least one page.
+     *
+     * @param filePath path of the PDF file; may be {@code null}
+     * @return {@code true} when the path denotes an existing regular file that PDFBox can load
+     *         and that contains at least one page; {@code false} otherwise, including when
+     *         {@code filePath} is {@code null} or loading fails
+     */
+    public static boolean isValidPdf(String filePath) {
+        if (filePath == null) {
+            return false;
+        }
+
+        File pdfFile = new File(filePath);
+        if (!pdfFile.exists() || !pdfFile.isFile()) {
+            return false;
+        }
+
+        // try-with-resources closes the document even when reading the page count fails.
+        try (PDDocument document = PDDocument.load(pdfFile)) {
+            return document.getNumberOfPages() > 0;
+        } catch (Exception e) {
+            // Any failure means "not parseable"; keep the cause in the log.
+            log.error("检查PDF有效性时出错: {}", e.getMessage(), e);
+            return false;
+        }
+    }
+
+    /**
+     * 用于提取文本块的PDFTextStripper
+     */
+    private static class TextBlockStripper extends PDFTextStripper {
+        private final List<TextBlock> textBlocks = new ArrayList<>();
+        private float lastY = -1;
+        private String currentLine = "";
+        private float currentX = 0;
+        
+        public TextBlockStripper() throws IOException {
+            super();
+            // 初始化
+            textBlocks.clear();
+            lastY = -1;
+            currentLine = "";
+            currentX = 0;
+        }
+        
+        @Override
+        protected void processTextPosition(TextPosition text) {
+            try {
+                float textX = text.getXDirAdj();
+                float textY = text.getYDirAdj();
+                
+                // 如果Y坐标变化超过一定阈值，认为是新行
+                if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
+                    // 保存上一行
+                    if (!currentLine.trim().isEmpty()) {
+                        textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+                    }
+                    
+                    // 开始新行
+                    currentLine = text.getUnicode();
+                    currentX = textX;
+                    lastY = textY;
+                } else {
+                    // 在同一行，追加文本
+                    currentLine += text.getUnicode();
+                }
+            } catch (Exception e) {
+                // 忽略单个字符处理错误，继续处理其他字符
+                log.debug("处理文本位置时出错: {}", e.getMessage());
+            }
+            
+            super.processTextPosition(text);
+        }
+        
+        @Override
+        protected void startPage(PDPage page) throws IOException {
+            // 清空textBlocks列表，避免累积所有页面的内容
+            textBlocks.clear();
+            lastY = -1;
+            currentLine = "";
+            currentX = 0;
+            super.startPage(page);
+        }
+        
+        @Override
+        public void endDocument(PDDocument document) throws IOException {
+            // 保存最后一行
+            if (!currentLine.trim().isEmpty()) {
+                textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+            }
+            super.endDocument(document);
+        }
+        
+        public List<TextBlock> getTextBlocks() {
+            return textBlocks;
+        }
+    }
+
+    /**
+     * Immutable value holder for one extracted text line: the x and y coordinates it was
+     * created with and its text.
+     */
+    private static class TextBlock {
+        private final float xPos;
+        private final float yPos;
+        private final String content;
+
+        /**
+         * @param x    x coordinate of the line
+         * @param y    y coordinate of the line
+         * @param text text of the line
+         */
+        public TextBlock(float x, float y, String text) {
+            this.xPos = x;
+            this.yPos = y;
+            this.content = text;
+        }
+
+        /** @return the x coordinate passed to the constructor */
+        public float getX() {
+            return xPos;
+        }
+
+        /** @return the y coordinate passed to the constructor */
+        public float getY() {
+            return yPos;
+        }
+
+        /** @return the text passed to the constructor */
+        public String getText() {
+            return content;
+        }
+    }
+}