3 changed files with 713 additions and 0 deletions
@ -0,0 +1,332 @@ |
|||||
|
package org.dromara.test; |
||||
|
|
||||
|
import org.apache.pdfbox.pdmodel.PDDocument; |
||||
|
import org.apache.pdfbox.pdmodel.PDPage; |
||||
|
import org.apache.pdfbox.text.PDFTextStripper; |
||||
|
import org.apache.pdfbox.text.TextPosition; |
||||
|
import org.junit.jupiter.api.DisplayName; |
||||
|
import org.junit.jupiter.api.Test; |
||||
|
|
||||
|
import java.io.File; |
||||
|
import java.io.IOException; |
||||
|
import java.util.*; |
||||
|
import java.util.logging.Logger; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
/** |
||||
|
* PDF段落提取测试 |
||||
|
*/ |
||||
|
@DisplayName("PDF段落提取测试") |
||||
|
public class PdfExtractorTest { |
||||
|
|
||||
|
private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName()); |
||||
|
|
||||
|
// 段落最小字数阈值
|
||||
|
private static final int MIN_PARAGRAPH_LENGTH = 20; |
||||
|
// 最大缩进值
|
||||
|
private static final float MAX_INDENT_X = 100f; |
||||
|
// 容差范围
|
||||
|
private static final float TOLERANCE = 2f; |
||||
|
|
||||
|
@Test |
||||
|
@DisplayName("测试PDF段落提取") |
||||
|
public void testExtractParagraphs() { |
||||
|
String pdfPath = "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf"; // 替换为实际PDF路径
|
||||
|
List<String> paragraphs = extractParagraphsFromPdf(pdfPath); |
||||
|
|
||||
|
System.out.println("提取段落总数: " + paragraphs.size()); |
||||
|
for (int i = 0; i < paragraphs.size(); i++) { |
||||
|
if(paragraphs.get(i).length() >= MIN_PARAGRAPH_LENGTH){ |
||||
|
System.out.println("段落" + (i + 1) + ": " + paragraphs.get(i).trim()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 从PDF文件中提取段落,基于x坐标统计来判断段落 |
||||
|
*/ |
||||
|
public List<String> extractParagraphsFromPdf(String filePath) { |
||||
|
List<String> paragraphs = new ArrayList<>(); |
||||
|
File pdfFile = new File(filePath); |
||||
|
|
||||
|
try { |
||||
|
// 打开PDF文档
|
||||
|
PDDocument document = PDDocument.load(pdfFile); |
||||
|
int totalPages = document.getNumberOfPages(); |
||||
|
|
||||
|
// 第一步:收集所有x坐标和重复文本
|
||||
|
List<Float> xCoordinates = new ArrayList<>(); |
||||
|
Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率
|
||||
|
|
||||
|
// 遍历每一页收集X坐标
|
||||
|
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
||||
|
final int currentPage = pageIndex; // 用于匿名类中引用
|
||||
|
|
||||
|
// 为每页创建文本提取器
|
||||
|
TextBlockStripper stripper = new TextBlockStripper(); |
||||
|
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
|
||||
|
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
|
||||
|
stripper.setStartPage(pageIndex + 1); |
||||
|
stripper.setEndPage(pageIndex + 1); |
||||
|
stripper.setSortByPosition(true); |
||||
|
stripper.getText(document); |
||||
|
|
||||
|
for (TextBlock block : stripper.getTextBlocks()) { |
||||
|
String text = block.getText().trim(); |
||||
|
if (text.length() > 0) { |
||||
|
// 统计X坐标
|
||||
|
if (block.getX() < MAX_INDENT_X) { |
||||
|
xCoordinates.add(block.getX()); |
||||
|
} |
||||
|
|
||||
|
// 统计文本频率
|
||||
|
if (text.length() >= MIN_PARAGRAPH_LENGTH) { |
||||
|
textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (xCoordinates.isEmpty()) { |
||||
|
document.close(); |
||||
|
return paragraphs; |
||||
|
} |
||||
|
|
||||
|
// 找出频率超过页面数一半的文本
|
||||
|
int frequencyThreshold = totalPages / 2; |
||||
|
Set<String> frequentTexts = textFrequency.entrySet().stream() |
||||
|
.filter(entry -> entry.getValue() > frequencyThreshold) |
||||
|
.map(Map.Entry::getKey) |
||||
|
.collect(Collectors.toSet()); |
||||
|
|
||||
|
System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)"); |
||||
|
frequentTexts.forEach(text -> System.out.println("高频文本: " + |
||||
|
(text.length() > 50 ? text.substring(0, 47) + "..." : text) + |
||||
|
" 出现次数: " + textFrequency.get(text))); |
||||
|
|
||||
|
// 第二步:统计x坐标频率并找出前两名
|
||||
|
Map<Float, Long> xCounter = xCoordinates.stream() |
||||
|
.collect(Collectors.groupingBy(x -> x, Collectors.counting())); |
||||
|
|
||||
|
List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream() |
||||
|
.sorted(Map.Entry.<Float, Long>comparingByValue().reversed()) |
||||
|
.limit(2) |
||||
|
.collect(Collectors.toList()); |
||||
|
|
||||
|
if (mostCommonX.size() < 2) { |
||||
|
document.close(); |
||||
|
return paragraphs; |
||||
|
} |
||||
|
|
||||
|
// 确保x_indent > x_normal
|
||||
|
float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
|
||||
|
float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
|
||||
|
|
||||
|
System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE); |
||||
|
|
||||
|
// 第三步:根据基准x坐标提取段落
|
||||
|
List<String> currentParagraph = new ArrayList<>(); |
||||
|
int num=311; |
||||
|
|
||||
|
// 逐页处理文本块
|
||||
|
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
||||
|
List<TextBlock> pageTextBlocks = new ArrayList<>(); |
||||
|
|
||||
|
// 为每页创建文本提取器
|
||||
|
TextBlockStripper stripper = new TextBlockStripper(); |
||||
|
stripper.setSortByPosition(true); |
||||
|
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
|
||||
|
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
|
||||
|
stripper.setStartPage(pageIndex + 1); |
||||
|
stripper.setEndPage(pageIndex + 1); |
||||
|
stripper.getText(document); |
||||
|
|
||||
|
// 获取当前页的文本块并排序
|
||||
|
pageTextBlocks.addAll(stripper.getTextBlocks()); |
||||
|
pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); |
||||
|
|
||||
|
if(pageIndex==num){ |
||||
|
System.out.println(pageTextBlocks); |
||||
|
} |
||||
|
|
||||
|
// 处理当前页的文本块
|
||||
|
for (TextBlock block : pageTextBlocks) { |
||||
|
String lineText = block.getText().trim().replace('\n', ' ').trim(); |
||||
|
if (lineText.isEmpty()) { |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
// 过滤高频文本
|
||||
|
if (frequentTexts.contains(lineText)) { |
||||
|
if (pageIndex == num) { |
||||
|
System.out.println("过滤高频文本: " + |
||||
|
(lineText.length() > 30 ? lineText.substring(0, 27) + "..." : lineText)); |
||||
|
} |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
float currentX = block.getX(); |
||||
|
|
||||
|
// 判断当前x坐标属于哪种类型
|
||||
|
boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE; |
||||
|
boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE; |
||||
|
|
||||
|
// 如果是缩进位置,说明是新段落的开始
|
||||
|
if (isIndent) { |
||||
|
if (!currentParagraph.isEmpty()) { |
||||
|
String paragraphText = String.join("", currentParagraph); |
||||
|
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
||||
|
paragraphs.add(paragraphText); |
||||
|
} |
||||
|
currentParagraph.clear(); |
||||
|
} |
||||
|
if (lineText.length() >= MIN_PARAGRAPH_LENGTH) { |
||||
|
currentParagraph.add(lineText); |
||||
|
} |
||||
|
} |
||||
|
// 如果是正常位置,追加到当前段落
|
||||
|
else if (isNormal) { |
||||
|
if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
|
||||
|
currentParagraph.add(lineText); |
||||
|
} else { |
||||
|
currentParagraph.add(lineText); |
||||
|
} |
||||
|
} |
||||
|
// 如果既不是缩进也不是正常位置,作为独立段落
|
||||
|
else { |
||||
|
// 如果独立段落字数满足要求进行统计,不满足要求跳过
|
||||
|
if (lineText.length() >= MIN_PARAGRAPH_LENGTH) { |
||||
|
if (!currentParagraph.isEmpty()) { |
||||
|
String paragraphText = String.join("", currentParagraph); |
||||
|
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
||||
|
paragraphs.add(paragraphText); |
||||
|
} |
||||
|
currentParagraph.clear(); |
||||
|
} |
||||
|
paragraphs.add(lineText); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 处理最后一个段落
|
||||
|
if (!currentParagraph.isEmpty()) { |
||||
|
String paragraphText = String.join("", currentParagraph); |
||||
|
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
||||
|
paragraphs.add(paragraphText); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
document.close(); |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
logger.severe("提取PDF段落失败: " + e.getMessage()); |
||||
|
e.printStackTrace(); |
||||
|
} |
||||
|
|
||||
|
return paragraphs; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 用于提取文本块的PDFTextStripper |
||||
|
*/ |
||||
|
private static class TextBlockStripper extends PDFTextStripper { |
||||
|
private final List<TextBlock> textBlocks = new ArrayList<>(); |
||||
|
private float lastY = -1; |
||||
|
private String currentLine = ""; |
||||
|
private float currentX = 0; |
||||
|
|
||||
|
public TextBlockStripper() throws IOException { |
||||
|
super(); |
||||
|
// 初始化
|
||||
|
textBlocks.clear(); |
||||
|
lastY = -1; |
||||
|
currentLine = ""; |
||||
|
currentX = 0; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected void processTextPosition(TextPosition text) { |
||||
|
float textX = text.getXDirAdj(); |
||||
|
float textY = text.getYDirAdj(); |
||||
|
float endX = text.getEndX(); |
||||
|
float endY = text.getEndY(); |
||||
|
// 如果Y坐标变化超过一定阈值,认为是新行
|
||||
|
if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) { |
||||
|
// 保存上一行
|
||||
|
if (!currentLine.trim().isEmpty()) { |
||||
|
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
||||
|
} |
||||
|
|
||||
|
// 开始新行
|
||||
|
currentLine = text.getUnicode(); |
||||
|
currentX = textX; |
||||
|
lastY = textY; |
||||
|
} else { |
||||
|
// 在同一行,追加文本
|
||||
|
currentLine += text.getUnicode(); |
||||
|
} |
||||
|
|
||||
|
super.processTextPosition(text); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected void startPage(PDPage page) throws IOException { |
||||
|
// 清空textBlocks列表,避免累积所有页面的内容
|
||||
|
textBlocks.clear(); |
||||
|
lastY = -1; |
||||
|
currentLine = ""; |
||||
|
currentX = 0; |
||||
|
super.startPage(page); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void endDocument(PDDocument document) throws IOException { |
||||
|
// 保存最后一行
|
||||
|
if (!currentLine.trim().isEmpty()) { |
||||
|
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
||||
|
} |
||||
|
super.endDocument(document); |
||||
|
} |
||||
|
|
||||
|
public List<TextBlock> getTextBlocks() { |
||||
|
return textBlocks; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 用于存储文本块信息的类 |
||||
|
*/ |
||||
|
private static class TextBlock { |
||||
|
private final float x; |
||||
|
private final float y; |
||||
|
private final String text; |
||||
|
|
||||
|
public TextBlock(float x, float y, String text) { |
||||
|
this.x = x; |
||||
|
this.y = y; |
||||
|
this.text = text; |
||||
|
} |
||||
|
|
||||
|
public float getX() { |
||||
|
return x; |
||||
|
} |
||||
|
|
||||
|
public float getY() { |
||||
|
return y; |
||||
|
} |
||||
|
|
||||
|
public String getText() { |
||||
|
return text; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public String toString() { |
||||
|
return "TextBlock{" + |
||||
|
"x=" + x + |
||||
|
", y=" + y + |
||||
|
", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\'' + |
||||
|
'}'; |
||||
|
} |
||||
|
} |
||||
|
} |
@ -0,0 +1,373 @@ |
|||||
|
package org.dromara.productManagement.utils; |
||||
|
|
||||
|
import org.apache.pdfbox.pdmodel.PDDocument; |
||||
|
import org.apache.pdfbox.pdmodel.PDPage; |
||||
|
import org.apache.pdfbox.text.PDFTextStripper; |
||||
|
import org.apache.pdfbox.text.TextPosition; |
||||
|
import org.apache.pdfbox.rendering.ImageType; |
||||
|
import org.slf4j.Logger; |
||||
|
import org.slf4j.LoggerFactory; |
||||
|
|
||||
|
import java.io.File; |
||||
|
import java.io.IOException; |
||||
|
import java.util.*; |
||||
|
import java.util.stream.Collectors; |
||||
|
|
||||
|
/** |
||||
|
* PDF解析工具类 |
||||
|
* 用于提取PDF文档中的段落,支持首行缩进识别和高频文本过滤 |
||||
|
* // 使用默认参数
|
||||
|
* List<String> paragraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf"); |
||||
|
* |
||||
|
* // 使用自定义参数
|
||||
|
* List<String> customParagraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf", |
||||
|
* 30, // 最小段落长度
|
||||
|
* 120, // 最大缩进值
|
||||
|
* 3.0f // 容差范围
|
||||
|
* ); |
||||
|
*/ |
||||
|
public class PdfParserUtils { |
||||
|
|
||||
|
private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class); |
||||
|
|
||||
|
// 默认段落最小字数阈值
|
||||
|
private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20; |
||||
|
// 默认最大缩进值
|
||||
|
private static final float DEFAULT_MAX_INDENT_X = 100f; |
||||
|
// 默认容差范围
|
||||
|
private static final float DEFAULT_TOLERANCE = 2f; |
||||
|
|
||||
|
/** |
||||
|
* 从PDF文件中提取段落 |
||||
|
* |
||||
|
* @param filePath PDF文件路径 |
||||
|
* @return 提取的段落列表 |
||||
|
*/ |
||||
|
public static List<String> extractParagraphs(String filePath) { |
||||
|
return extractParagraphs(filePath, DEFAULT_MIN_PARAGRAPH_LENGTH, DEFAULT_MAX_INDENT_X, DEFAULT_TOLERANCE); |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 从PDF文件中提取段落,支持自定义参数 |
||||
|
* |
||||
|
* @param filePath PDF文件路径 |
||||
|
* @param minParagraphLength 最小段落长度 |
||||
|
* @param maxIndentX 最大缩进值 |
||||
|
* @param tolerance 容差范围 |
||||
|
* @return 提取的段落列表 |
||||
|
*/ |
||||
|
public static List<String> extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) { |
||||
|
List<String> paragraphs = new ArrayList<>(); |
||||
|
File pdfFile = new File(filePath); |
||||
|
|
||||
|
if (!pdfFile.exists() || !pdfFile.isFile()) { |
||||
|
log.error("PDF文件不存在: {}", filePath); |
||||
|
return paragraphs; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
// 设置PDFBox选项,抑制字体警告
|
||||
|
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true"); |
||||
|
|
||||
|
// 打开PDF文档
|
||||
|
PDDocument document = PDDocument.load(pdfFile); |
||||
|
|
||||
|
// 设置PDFBox参数,提高对中文字体的兼容性
|
||||
|
document.setResourceCache(null); // 禁用资源缓存,可能减少某些字体问题
|
||||
|
|
||||
|
int totalPages = document.getNumberOfPages(); |
||||
|
|
||||
|
// 第一步:收集所有x坐标和重复文本
|
||||
|
List<Float> xCoordinates = new ArrayList<>(); |
||||
|
Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率
|
||||
|
|
||||
|
log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages); |
||||
|
|
||||
|
// 遍历每一页收集X坐标
|
||||
|
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
||||
|
try { |
||||
|
// 为每页创建文本提取器
|
||||
|
TextBlockStripper stripper = new TextBlockStripper(); |
||||
|
stripper.setSortByPosition(true); |
||||
|
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
|
||||
|
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
|
||||
|
stripper.setStartPage(pageIndex + 1); |
||||
|
stripper.setEndPage(pageIndex + 1); |
||||
|
stripper.getText(document); |
||||
|
|
||||
|
for (TextBlock block : stripper.getTextBlocks()) { |
||||
|
String text = block.getText().trim(); |
||||
|
if (text.length() > 0) { |
||||
|
// 统计X坐标
|
||||
|
if (block.getX() < maxIndentX) { |
||||
|
xCoordinates.add(block.getX()); |
||||
|
} |
||||
|
|
||||
|
// 统计文本频率
|
||||
|
if (text.length() >= minParagraphLength) { |
||||
|
textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage()); |
||||
|
// 继续处理下一页,而不是中断整个过程
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (xCoordinates.isEmpty()) { |
||||
|
log.warn("未找到有效的X坐标,无法提取段落"); |
||||
|
document.close(); |
||||
|
return paragraphs; |
||||
|
} |
||||
|
|
||||
|
// 找出频率超过页面数一半的文本(通常是页眉页脚等重复内容)
|
||||
|
int frequencyThreshold = totalPages / 2; |
||||
|
Set<String> frequentTexts = textFrequency.entrySet().stream() |
||||
|
.filter(entry -> entry.getValue() > frequencyThreshold) |
||||
|
.map(Map.Entry::getKey) |
||||
|
.collect(Collectors.toSet()); |
||||
|
|
||||
|
log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold); |
||||
|
|
||||
|
// 统计x坐标频率并找出前两名(通常是正常段落和首行缩进)
|
||||
|
Map<Float, Long> xCounter = xCoordinates.stream() |
||||
|
.collect(Collectors.groupingBy(x -> x, Collectors.counting())); |
||||
|
|
||||
|
List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream() |
||||
|
.sorted(Map.Entry.<Float, Long>comparingByValue().reversed()) |
||||
|
.limit(2) |
||||
|
.collect(Collectors.toList()); |
||||
|
|
||||
|
if (mostCommonX.size() < 2) { |
||||
|
log.warn("未找到足够的X坐标特征,无法区分段落缩进"); |
||||
|
document.close(); |
||||
|
return paragraphs; |
||||
|
} |
||||
|
|
||||
|
// 确保x_indent > x_normal
|
||||
|
float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
|
||||
|
float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
|
||||
|
|
||||
|
log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance); |
||||
|
|
||||
|
// 根据基准x坐标提取段落
|
||||
|
List<String> currentParagraph = new ArrayList<>(); |
||||
|
|
||||
|
// 逐页处理文本块
|
||||
|
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
||||
|
try { |
||||
|
List<TextBlock> pageTextBlocks = new ArrayList<>(); |
||||
|
|
||||
|
// 为每页创建文本提取器
|
||||
|
TextBlockStripper stripper = new TextBlockStripper(); |
||||
|
stripper.setSortByPosition(true); |
||||
|
stripper.setSuppressDuplicateOverlappingText(false); |
||||
|
stripper.setAddMoreFormatting(false); |
||||
|
stripper.setStartPage(pageIndex + 1); |
||||
|
stripper.setEndPage(pageIndex + 1); |
||||
|
stripper.getText(document); |
||||
|
|
||||
|
// 获取当前页的文本块并排序
|
||||
|
pageTextBlocks.addAll(stripper.getTextBlocks()); |
||||
|
pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); |
||||
|
|
||||
|
// 处理当前页的文本块
|
||||
|
for (TextBlock block : pageTextBlocks) { |
||||
|
String lineText = block.getText().trim().replace('\n', ' ').trim(); |
||||
|
if (lineText.isEmpty()) { |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
// 过滤高频文本
|
||||
|
if (frequentTexts.contains(lineText)) { |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
float currentX = block.getX(); |
||||
|
|
||||
|
// 判断当前x坐标属于哪种类型
|
||||
|
boolean isIndent = Math.abs(currentX - xIndent) <= tolerance; |
||||
|
boolean isNormal = Math.abs(currentX - xNormal) <= tolerance; |
||||
|
|
||||
|
// 如果是缩进位置,说明是新段落的开始
|
||||
|
if (isIndent) { |
||||
|
if (!currentParagraph.isEmpty()) { |
||||
|
String paragraphText = String.join("", currentParagraph); |
||||
|
if (paragraphText.length() >= minParagraphLength) { |
||||
|
paragraphs.add(paragraphText.trim()); |
||||
|
} |
||||
|
currentParagraph.clear(); |
||||
|
} |
||||
|
if (lineText.length() >= minParagraphLength) { |
||||
|
currentParagraph.add(lineText); |
||||
|
} |
||||
|
} |
||||
|
// 如果是正常位置,追加到当前段落
|
||||
|
else if (isNormal) { |
||||
|
if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
|
||||
|
currentParagraph.add(lineText); |
||||
|
} else { |
||||
|
currentParagraph.add(lineText); |
||||
|
} |
||||
|
} |
||||
|
// 如果既不是缩进也不是正常位置,作为独立段落
|
||||
|
else { |
||||
|
// 如果独立段落字数满足要求进行统计,不满足要求跳过
|
||||
|
if (lineText.length() >= minParagraphLength) { |
||||
|
if (!currentParagraph.isEmpty()) { |
||||
|
String paragraphText = String.join("", currentParagraph); |
||||
|
if (paragraphText.length() >= minParagraphLength) { |
||||
|
paragraphs.add(paragraphText.trim()); |
||||
|
} |
||||
|
currentParagraph.clear(); |
||||
|
} |
||||
|
paragraphs.add(lineText.trim()); |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage()); |
||||
|
// 继续处理下一页
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// 处理最后一个段落
|
||||
|
if (!currentParagraph.isEmpty()) { |
||||
|
String paragraphText = String.join("", currentParagraph); |
||||
|
if (paragraphText.length() >= minParagraphLength) { |
||||
|
paragraphs.add(paragraphText.trim()); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
document.close(); |
||||
|
log.info("PDF解析完成,提取段落数: {}", paragraphs.size()); |
||||
|
|
||||
|
} catch (IOException e) { |
||||
|
log.error("提取PDF段落失败: {}", e.getMessage(), e); |
||||
|
} |
||||
|
|
||||
|
return paragraphs; |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 检查PDF文件是否可被有效解析 |
||||
|
* |
||||
|
* @param filePath PDF文件路径 |
||||
|
* @return 是否可解析 |
||||
|
*/ |
||||
|
public static boolean isValidPdf(String filePath) { |
||||
|
File pdfFile = new File(filePath); |
||||
|
if (!pdfFile.exists() || !pdfFile.isFile()) { |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
try { |
||||
|
PDDocument document = PDDocument.load(pdfFile); |
||||
|
int pageCount = document.getNumberOfPages(); |
||||
|
document.close(); |
||||
|
return pageCount > 0; |
||||
|
} catch (Exception e) { |
||||
|
log.error("检查PDF有效性时出错: {}", e.getMessage()); |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 用于提取文本块的PDFTextStripper |
||||
|
*/ |
||||
|
private static class TextBlockStripper extends PDFTextStripper { |
||||
|
private final List<TextBlock> textBlocks = new ArrayList<>(); |
||||
|
private float lastY = -1; |
||||
|
private String currentLine = ""; |
||||
|
private float currentX = 0; |
||||
|
|
||||
|
public TextBlockStripper() throws IOException { |
||||
|
super(); |
||||
|
// 初始化
|
||||
|
textBlocks.clear(); |
||||
|
lastY = -1; |
||||
|
currentLine = ""; |
||||
|
currentX = 0; |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected void processTextPosition(TextPosition text) { |
||||
|
try { |
||||
|
float textX = text.getXDirAdj(); |
||||
|
float textY = text.getYDirAdj(); |
||||
|
|
||||
|
// 如果Y坐标变化超过一定阈值,认为是新行
|
||||
|
if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) { |
||||
|
// 保存上一行
|
||||
|
if (!currentLine.trim().isEmpty()) { |
||||
|
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
||||
|
} |
||||
|
|
||||
|
// 开始新行
|
||||
|
currentLine = text.getUnicode(); |
||||
|
currentX = textX; |
||||
|
lastY = textY; |
||||
|
} else { |
||||
|
// 在同一行,追加文本
|
||||
|
currentLine += text.getUnicode(); |
||||
|
} |
||||
|
} catch (Exception e) { |
||||
|
// 忽略单个字符处理错误,继续处理其他字符
|
||||
|
log.debug("处理文本位置时出错: {}", e.getMessage()); |
||||
|
} |
||||
|
|
||||
|
super.processTextPosition(text); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
protected void startPage(PDPage page) throws IOException { |
||||
|
// 清空textBlocks列表,避免累积所有页面的内容
|
||||
|
textBlocks.clear(); |
||||
|
lastY = -1; |
||||
|
currentLine = ""; |
||||
|
currentX = 0; |
||||
|
super.startPage(page); |
||||
|
} |
||||
|
|
||||
|
@Override |
||||
|
public void endDocument(PDDocument document) throws IOException { |
||||
|
// 保存最后一行
|
||||
|
if (!currentLine.trim().isEmpty()) { |
||||
|
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
||||
|
} |
||||
|
super.endDocument(document); |
||||
|
} |
||||
|
|
||||
|
public List<TextBlock> getTextBlocks() { |
||||
|
return textBlocks; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
/** |
||||
|
* 用于存储文本块信息的类 |
||||
|
*/ |
||||
|
private static class TextBlock { |
||||
|
private final float x; |
||||
|
private final float y; |
||||
|
private final String text; |
||||
|
|
||||
|
public TextBlock(float x, float y, String text) { |
||||
|
this.x = x; |
||||
|
this.y = y; |
||||
|
this.text = text; |
||||
|
} |
||||
|
|
||||
|
public float getX() { |
||||
|
return x; |
||||
|
} |
||||
|
|
||||
|
public float getY() { |
||||
|
return y; |
||||
|
} |
||||
|
|
||||
|
public String getText() { |
||||
|
return text; |
||||
|
} |
||||
|
} |
||||
|
} |
Loading…
Reference in new issue