diff --git a/ruoyi-admin/pom.xml b/ruoyi-admin/pom.xml
index 9c26130..f3592cb 100644
--- a/ruoyi-admin/pom.xml
+++ b/ruoyi-admin/pom.xml
@@ -197,6 +197,14 @@
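+
+        <dependency>
+            <groupId>org.apache.pdfbox</groupId>
+            <artifactId>pdfbox</artifactId>
+            <version>2.0.27</version>
+            <scope>test</scope>
+        </dependency>
+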
diff --git a/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java b/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
new file mode 100644
index 0000000..cccd676
--- /dev/null
+++ b/ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
@@ -0,0 +1,332 @@
+package org.dromara.test;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.logging.Logger;
+import java.util.stream.Collectors;
+
+/**
+ * PDF段落提取测试
+ */
+@DisplayName("PDF段落提取测试")
+public class PdfExtractorTest {
+
+ // java.util.logging logger named after this test class.
+ private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName());
+
+ // Minimum paragraph length threshold, in characters; shorter text is not reported as a paragraph.
+ private static final int MIN_PARAGRAPH_LENGTH = 20;
+ // Maximum indent value: only lines whose starting x is below this are counted when looking for the margin/indent columns.
+ private static final float MAX_INDENT_X = 100f;
+ // Tolerance applied when comparing a line's starting x with the margin/indent columns.
+ private static final float TOLERANCE = 2f;
+
+ /**
+  * Runs the paragraph extractor against a sample PDF and prints every extracted
+  * paragraph that reaches {@code MIN_PARAGRAPH_LENGTH} characters.
+  *
+  * <p>The sample location can be overridden with the {@code pdf.extractor.test.path}
+  * system property; the original developer-machine path is kept as the default.
+  * When the file is not present the test is reported as skipped instead of
+  * passing vacuously on an empty result.
+  */
+ @Test
+ @DisplayName("测试PDF段落提取")
+ public void testExtractParagraphs() {
+     String pdfPath = System.getProperty(
+         "pdf.extractor.test.path",
+         "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf");
+
+     // Fully qualified so that no additional import is required.
+     org.junit.jupiter.api.Assumptions.assumeTrue(
+         new File(pdfPath).isFile(),
+         "Sample PDF not found, skipping: " + pdfPath);
+
+     List<String> paragraphs = extractParagraphsFromPdf(pdfPath);
+
+     System.out.println("提取段落总数: " + paragraphs.size());
+     for (int i = 0; i < paragraphs.size(); i++) {
+         String paragraph = paragraphs.get(i);
+         if (paragraph.length() >= MIN_PARAGRAPH_LENGTH) {
+             System.out.println("段落" + (i + 1) + ": " + paragraph.trim());
+         }
+     }
+ }
+
+ /**
+ * 从PDF文件中提取段落,基于x坐标统计来判断段落
+ */
+ public List extractParagraphsFromPdf(String filePath) {
+ List paragraphs = new ArrayList<>();
+ File pdfFile = new File(filePath);
+
+ try {
+ // 打开PDF文档
+ PDDocument document = PDDocument.load(pdfFile);
+ int totalPages = document.getNumberOfPages();
+
+ // 第一步:收集所有x坐标和重复文本
+ List xCoordinates = new ArrayList<>();
+ Map textFrequency = new HashMap<>(); // 记录文本出现频率
+
+ // 遍历每一页收集X坐标
+ for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
+ final int currentPage = pageIndex; // 用于匿名类中引用
+
+ // 为每页创建文本提取器
+ TextBlockStripper stripper = new TextBlockStripper();
+ stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
+ stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
+ stripper.setStartPage(pageIndex + 1);
+ stripper.setEndPage(pageIndex + 1);
+ stripper.setSortByPosition(true);
+ stripper.getText(document);
+
+ for (TextBlock block : stripper.getTextBlocks()) {
+ String text = block.getText().trim();
+ if (text.length() > 0) {
+ // 统计X坐标
+ if (block.getX() < MAX_INDENT_X) {
+ xCoordinates.add(block.getX());
+ }
+
+ // 统计文本频率
+ if (text.length() >= MIN_PARAGRAPH_LENGTH) {
+ textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1);
+ }
+ }
+ }
+ }
+
+ if (xCoordinates.isEmpty()) {
+ document.close();
+ return paragraphs;
+ }
+
+ // 找出频率超过页面数一半的文本
+ int frequencyThreshold = totalPages / 2;
+ Set frequentTexts = textFrequency.entrySet().stream()
+ .filter(entry -> entry.getValue() > frequencyThreshold)
+ .map(Map.Entry::getKey)
+ .collect(Collectors.toSet());
+
+ System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)");
+ frequentTexts.forEach(text -> System.out.println("高频文本: " +
+ (text.length() > 50 ? text.substring(0, 47) + "..." : text) +
+ " 出现次数: " + textFrequency.get(text)));
+
+ // 第二步:统计x坐标频率并找出前两名
+ Map xCounter = xCoordinates.stream()
+ .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
+
+ List> mostCommonX = xCounter.entrySet().stream()
+ .sorted(Map.Entry.comparingByValue().reversed())
+ .limit(2)
+ .collect(Collectors.toList());
+
+ if (mostCommonX.size() < 2) {
+ document.close();
+ return paragraphs;
+ }
+
+ // 确保x_indent > x_normal
+ float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
+ float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
+
+ System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE);
+
+ // 第三步:根据基准x坐标提取段落
+ List currentParagraph = new ArrayList<>();
+ int num=311;
+
+ // 逐页处理文本块
+ for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
+ List pageTextBlocks = new ArrayList<>();
+
+ // 为每页创建文本提取器
+ TextBlockStripper stripper = new TextBlockStripper();
+ stripper.setSortByPosition(true);
+ stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
+ stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
+ stripper.setStartPage(pageIndex + 1);
+ stripper.setEndPage(pageIndex + 1);
+ stripper.getText(document);
+
+ // 获取当前页的文本块并排序
+ pageTextBlocks.addAll(stripper.getTextBlocks());
+ pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
+
+ if(pageIndex==num){
+ System.out.println(pageTextBlocks);
+ }
+
+ // 处理当前页的文本块
+ for (TextBlock block : pageTextBlocks) {
+ String lineText = block.getText().trim().replace('\n', ' ').trim();
+ if (lineText.isEmpty()) {
+ continue;
+ }
+
+ // 过滤高频文本
+ if (frequentTexts.contains(lineText)) {
+ if (pageIndex == num) {
+ System.out.println("过滤高频文本: " +
+ (lineText.length() > 30 ? lineText.substring(0, 27) + "..." : lineText));
+ }
+ continue;
+ }
+
+ float currentX = block.getX();
+
+ // 判断当前x坐标属于哪种类型
+ boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE;
+ boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE;
+
+ // 如果是缩进位置,说明是新段落的开始
+ if (isIndent) {
+ if (!currentParagraph.isEmpty()) {
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
+ paragraphs.add(paragraphText);
+ }
+ currentParagraph.clear();
+ }
+ if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
+ currentParagraph.add(lineText);
+ }
+ }
+ // 如果是正常位置,追加到当前段落
+ else if (isNormal) {
+ if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
+ currentParagraph.add(lineText);
+ } else {
+ currentParagraph.add(lineText);
+ }
+ }
+ // 如果既不是缩进也不是正常位置,作为独立段落
+ else {
+ // 如果独立段落字数满足要求进行统计,不满足要求跳过
+ if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
+ if (!currentParagraph.isEmpty()) {
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
+ paragraphs.add(paragraphText);
+ }
+ currentParagraph.clear();
+ }
+ paragraphs.add(lineText);
+ }
+ }
+ }
+ }
+
+ // 处理最后一个段落
+ if (!currentParagraph.isEmpty()) {
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
+ paragraphs.add(paragraphText);
+ }
+ }
+
+ document.close();
+
+ } catch (IOException e) {
+ logger.severe("提取PDF段落失败: " + e.getMessage());
+ e.printStackTrace();
+ }
+
+ return paragraphs;
+ }
+
+ /**
+ * 用于提取文本块的PDFTextStripper
+ */
+ private static class TextBlockStripper extends PDFTextStripper {
+ private final List textBlocks = new ArrayList<>();
+ private float lastY = -1;
+ private String currentLine = "";
+ private float currentX = 0;
+
+ public TextBlockStripper() throws IOException {
+ super();
+ // 初始化
+ textBlocks.clear();
+ lastY = -1;
+ currentLine = "";
+ currentX = 0;
+ }
+
+ @Override
+ protected void processTextPosition(TextPosition text) {
+ float textX = text.getXDirAdj();
+ float textY = text.getYDirAdj();
+ float endX = text.getEndX();
+ float endY = text.getEndY();
+ // 如果Y坐标变化超过一定阈值,认为是新行
+ if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
+ // 保存上一行
+ if (!currentLine.trim().isEmpty()) {
+ textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+ }
+
+ // 开始新行
+ currentLine = text.getUnicode();
+ currentX = textX;
+ lastY = textY;
+ } else {
+ // 在同一行,追加文本
+ currentLine += text.getUnicode();
+ }
+
+ super.processTextPosition(text);
+ }
+
+ @Override
+ protected void startPage(PDPage page) throws IOException {
+ // 清空textBlocks列表,避免累积所有页面的内容
+ textBlocks.clear();
+ lastY = -1;
+ currentLine = "";
+ currentX = 0;
+ super.startPage(page);
+ }
+
+ @Override
+ public void endDocument(PDDocument document) throws IOException {
+ // 保存最后一行
+ if (!currentLine.trim().isEmpty()) {
+ textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+ }
+ super.endDocument(document);
+ }
+
+ public List getTextBlocks() {
+ return textBlocks;
+ }
+ }
+
+ /**
+ * 用于存储文本块信息的类
+ */
+ private static class TextBlock {
+ private final float x;
+ private final float y;
+ private final String text;
+
+ public TextBlock(float x, float y, String text) {
+ this.x = x;
+ this.y = y;
+ this.text = text;
+ }
+
+ public float getX() {
+ return x;
+ }
+
+ public float getY() {
+ return y;
+ }
+
+ public String getText() {
+ return text;
+ }
+
+ @Override
+ public String toString() {
+ return "TextBlock{" +
+ "x=" + x +
+ ", y=" + y +
+ ", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\'' +
+ '}';
+ }
+ }
+}
diff --git a/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
new file mode 100644
index 0000000..274b7fa
--- /dev/null
+++ b/zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java
@@ -0,0 +1,373 @@
+package org.dromara.productManagement.utils;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.pdfbox.rendering.ImageType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * PDF解析工具类
+ * 用于提取PDF文档中的段落,支持首行缩进识别和高频文本过滤
+ * // 使用默认参数
+ * List paragraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf");
+ *
+ * // 使用自定义参数
+ * List customParagraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf",
+ * 30, // 最小段落长度
+ * 120, // 最大缩进值
+ * 3.0f // 容差范围
+ * );
+ */
+public class PdfParserUtils {
+
+ // SLF4J logger for this utility class.
+ private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class);
+
+ // Default minimum paragraph length threshold, in characters
+ private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20;
+ // Default maximum indent value (upper bound for a line's starting x to be counted)
+ private static final float DEFAULT_MAX_INDENT_X = 100f;
+ // Default tolerance used when comparing x coordinates
+ private static final float DEFAULT_TOLERANCE = 2f;
+
+ /**
+  * Extracts paragraphs from a PDF file using the default minimum paragraph
+  * length, maximum indent and tolerance.
+  *
+  * @param filePath path of the PDF file
+  * @return the extracted paragraphs
+  */
+ public static List<String> extractParagraphs(String filePath) {
+     return extractParagraphs(filePath, DEFAULT_MIN_PARAGRAPH_LENGTH, DEFAULT_MAX_INDENT_X, DEFAULT_TOLERANCE);
+ }
+
+ /**
+ * 从PDF文件中提取段落,支持自定义参数
+ *
+ * @param filePath PDF文件路径
+ * @param minParagraphLength 最小段落长度
+ * @param maxIndentX 最大缩进值
+ * @param tolerance 容差范围
+ * @return 提取的段落列表
+ */
+ public static List extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) {
+ List paragraphs = new ArrayList<>();
+ File pdfFile = new File(filePath);
+
+ if (!pdfFile.exists() || !pdfFile.isFile()) {
+ log.error("PDF文件不存在: {}", filePath);
+ return paragraphs;
+ }
+
+ try {
+ // 设置PDFBox选项,抑制字体警告
+ System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
+
+ // 打开PDF文档
+ PDDocument document = PDDocument.load(pdfFile);
+
+ // 设置PDFBox参数,提高对中文字体的兼容性
+ document.setResourceCache(null); // 禁用资源缓存,可能减少某些字体问题
+
+ int totalPages = document.getNumberOfPages();
+
+ // 第一步:收集所有x坐标和重复文本
+ List xCoordinates = new ArrayList<>();
+ Map textFrequency = new HashMap<>(); // 记录文本出现频率
+
+ log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages);
+
+ // 遍历每一页收集X坐标
+ for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
+ try {
+ // 为每页创建文本提取器
+ TextBlockStripper stripper = new TextBlockStripper();
+ stripper.setSortByPosition(true);
+ stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
+ stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
+ stripper.setStartPage(pageIndex + 1);
+ stripper.setEndPage(pageIndex + 1);
+ stripper.getText(document);
+
+ for (TextBlock block : stripper.getTextBlocks()) {
+ String text = block.getText().trim();
+ if (text.length() > 0) {
+ // 统计X坐标
+ if (block.getX() < maxIndentX) {
+ xCoordinates.add(block.getX());
+ }
+
+ // 统计文本频率
+ if (text.length() >= minParagraphLength) {
+ textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1);
+ }
+ }
+ }
+ } catch (Exception e) {
+ log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage());
+ // 继续处理下一页,而不是中断整个过程
+ }
+ }
+
+ if (xCoordinates.isEmpty()) {
+ log.warn("未找到有效的X坐标,无法提取段落");
+ document.close();
+ return paragraphs;
+ }
+
+ // 找出频率超过页面数一半的文本(通常是页眉页脚等重复内容)
+ int frequencyThreshold = totalPages / 2;
+ Set frequentTexts = textFrequency.entrySet().stream()
+ .filter(entry -> entry.getValue() > frequencyThreshold)
+ .map(Map.Entry::getKey)
+ .collect(Collectors.toSet());
+
+ log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold);
+
+ // 统计x坐标频率并找出前两名(通常是正常段落和首行缩进)
+ Map xCounter = xCoordinates.stream()
+ .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
+
+ List> mostCommonX = xCounter.entrySet().stream()
+ .sorted(Map.Entry.comparingByValue().reversed())
+ .limit(2)
+ .collect(Collectors.toList());
+
+ if (mostCommonX.size() < 2) {
+ log.warn("未找到足够的X坐标特征,无法区分段落缩进");
+ document.close();
+ return paragraphs;
+ }
+
+ // 确保x_indent > x_normal
+ float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
+ float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
+
+ log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance);
+
+ // 根据基准x坐标提取段落
+ List currentParagraph = new ArrayList<>();
+
+ // 逐页处理文本块
+ for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
+ try {
+ List pageTextBlocks = new ArrayList<>();
+
+ // 为每页创建文本提取器
+ TextBlockStripper stripper = new TextBlockStripper();
+ stripper.setSortByPosition(true);
+ stripper.setSuppressDuplicateOverlappingText(false);
+ stripper.setAddMoreFormatting(false);
+ stripper.setStartPage(pageIndex + 1);
+ stripper.setEndPage(pageIndex + 1);
+ stripper.getText(document);
+
+ // 获取当前页的文本块并排序
+ pageTextBlocks.addAll(stripper.getTextBlocks());
+ pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
+
+ // 处理当前页的文本块
+ for (TextBlock block : pageTextBlocks) {
+ String lineText = block.getText().trim().replace('\n', ' ').trim();
+ if (lineText.isEmpty()) {
+ continue;
+ }
+
+ // 过滤高频文本
+ if (frequentTexts.contains(lineText)) {
+ continue;
+ }
+
+ float currentX = block.getX();
+
+ // 判断当前x坐标属于哪种类型
+ boolean isIndent = Math.abs(currentX - xIndent) <= tolerance;
+ boolean isNormal = Math.abs(currentX - xNormal) <= tolerance;
+
+ // 如果是缩进位置,说明是新段落的开始
+ if (isIndent) {
+ if (!currentParagraph.isEmpty()) {
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= minParagraphLength) {
+ paragraphs.add(paragraphText.trim());
+ }
+ currentParagraph.clear();
+ }
+ if (lineText.length() >= minParagraphLength) {
+ currentParagraph.add(lineText);
+ }
+ }
+ // 如果是正常位置,追加到当前段落
+ else if (isNormal) {
+ if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
+ currentParagraph.add(lineText);
+ } else {
+ currentParagraph.add(lineText);
+ }
+ }
+ // 如果既不是缩进也不是正常位置,作为独立段落
+ else {
+ // 如果独立段落字数满足要求进行统计,不满足要求跳过
+ if (lineText.length() >= minParagraphLength) {
+ if (!currentParagraph.isEmpty()) {
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= minParagraphLength) {
+ paragraphs.add(paragraphText.trim());
+ }
+ currentParagraph.clear();
+ }
+ paragraphs.add(lineText.trim());
+ }
+ }
+ }
+ } catch (Exception e) {
+ log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage());
+ // 继续处理下一页
+ }
+ }
+
+ // 处理最后一个段落
+ if (!currentParagraph.isEmpty()) {
+ String paragraphText = String.join("", currentParagraph);
+ if (paragraphText.length() >= minParagraphLength) {
+ paragraphs.add(paragraphText.trim());
+ }
+ }
+
+ document.close();
+ log.info("PDF解析完成,提取段落数: {}", paragraphs.size());
+
+ } catch (IOException e) {
+ log.error("提取PDF段落失败: {}", e.getMessage(), e);
+ }
+
+ return paragraphs;
+ }
+
+ /**
+  * Checks whether a file can be opened as a PDF and contains at least one page.
+  *
+  * @param filePath path of the PDF file
+  * @return {@code true} when the file exists, loads without error and has one or
+  *         more pages; {@code false} otherwise
+  */
+ public static boolean isValidPdf(String filePath) {
+     File pdfFile = new File(filePath);
+     if (!pdfFile.exists() || !pdfFile.isFile()) {
+         return false;
+     }
+
+     // try-with-resources closes the document even if reading the page count fails.
+     try (PDDocument document = PDDocument.load(pdfFile)) {
+         return document.getNumberOfPages() > 0;
+     } catch (Exception e) {
+         log.error("检查PDF有效性时出错: {}", e.getMessage());
+         return false;
+     }
+ }
+
+ /**
+ * 用于提取文本块的PDFTextStripper
+ */
+ private static class TextBlockStripper extends PDFTextStripper {
+ private final List textBlocks = new ArrayList<>();
+ private float lastY = -1;
+ private String currentLine = "";
+ private float currentX = 0;
+
+ public TextBlockStripper() throws IOException {
+ super();
+ // 初始化
+ textBlocks.clear();
+ lastY = -1;
+ currentLine = "";
+ currentX = 0;
+ }
+
+ @Override
+ protected void processTextPosition(TextPosition text) {
+ try {
+ float textX = text.getXDirAdj();
+ float textY = text.getYDirAdj();
+
+ // 如果Y坐标变化超过一定阈值,认为是新行
+ if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
+ // 保存上一行
+ if (!currentLine.trim().isEmpty()) {
+ textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+ }
+
+ // 开始新行
+ currentLine = text.getUnicode();
+ currentX = textX;
+ lastY = textY;
+ } else {
+ // 在同一行,追加文本
+ currentLine += text.getUnicode();
+ }
+ } catch (Exception e) {
+ // 忽略单个字符处理错误,继续处理其他字符
+ log.debug("处理文本位置时出错: {}", e.getMessage());
+ }
+
+ super.processTextPosition(text);
+ }
+
+ @Override
+ protected void startPage(PDPage page) throws IOException {
+ // 清空textBlocks列表,避免累积所有页面的内容
+ textBlocks.clear();
+ lastY = -1;
+ currentLine = "";
+ currentX = 0;
+ super.startPage(page);
+ }
+
+ @Override
+ public void endDocument(PDDocument document) throws IOException {
+ // 保存最后一行
+ if (!currentLine.trim().isEmpty()) {
+ textBlocks.add(new TextBlock(currentX, lastY, currentLine));
+ }
+ super.endDocument(document);
+ }
+
+ public List getTextBlocks() {
+ return textBlocks;
+ }
+ }
+
+ /**
+  * Immutable value holder for one text line: the coordinates of its first glyph
+  * and the line's text.
+  */
+ private static class TextBlock {
+     private final String text;
+     private final float x;
+     private final float y;
+
+     public TextBlock(float x, float y, String text) {
+         this.text = text;
+         this.x = x;
+         this.y = y;
+     }
+
+     /** @return the line's text */
+     public String getText() {
+         return this.text;
+     }
+
+     /** @return x coordinate of the line's first glyph */
+     public float getX() {
+         return this.x;
+     }
+
+     /** @return y coordinate of the line's first glyph */
+     public float getY() {
+         return this.y;
+     }
+ }
+}