3 changed files with 713 additions and 0 deletions
@ -0,0 +1,332 @@ |
|||
package org.dromara.test; |
|||
|
|||
import org.apache.pdfbox.pdmodel.PDDocument; |
|||
import org.apache.pdfbox.pdmodel.PDPage; |
|||
import org.apache.pdfbox.text.PDFTextStripper; |
|||
import org.apache.pdfbox.text.TextPosition; |
|||
import org.junit.jupiter.api.DisplayName; |
|||
import org.junit.jupiter.api.Test; |
|||
|
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.util.*; |
|||
import java.util.logging.Logger; |
|||
import java.util.stream.Collectors; |
|||
|
|||
/** |
|||
* PDF段落提取测试 |
|||
*/ |
|||
@DisplayName("PDF段落提取测试") |
|||
public class PdfExtractorTest { |
|||
|
|||
private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName()); |
|||
|
|||
// 段落最小字数阈值
|
|||
private static final int MIN_PARAGRAPH_LENGTH = 20; |
|||
// 最大缩进值
|
|||
private static final float MAX_INDENT_X = 100f; |
|||
// 容差范围
|
|||
private static final float TOLERANCE = 2f; |
|||
|
|||
@Test |
|||
@DisplayName("测试PDF段落提取") |
|||
public void testExtractParagraphs() { |
|||
String pdfPath = "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf"; // 替换为实际PDF路径
|
|||
List<String> paragraphs = extractParagraphsFromPdf(pdfPath); |
|||
|
|||
System.out.println("提取段落总数: " + paragraphs.size()); |
|||
for (int i = 0; i < paragraphs.size(); i++) { |
|||
if(paragraphs.get(i).length() >= MIN_PARAGRAPH_LENGTH){ |
|||
System.out.println("段落" + (i + 1) + ": " + paragraphs.get(i).trim()); |
|||
} |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 从PDF文件中提取段落,基于x坐标统计来判断段落 |
|||
*/ |
|||
public List<String> extractParagraphsFromPdf(String filePath) { |
|||
List<String> paragraphs = new ArrayList<>(); |
|||
File pdfFile = new File(filePath); |
|||
|
|||
try { |
|||
// 打开PDF文档
|
|||
PDDocument document = PDDocument.load(pdfFile); |
|||
int totalPages = document.getNumberOfPages(); |
|||
|
|||
// 第一步:收集所有x坐标和重复文本
|
|||
List<Float> xCoordinates = new ArrayList<>(); |
|||
Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率
|
|||
|
|||
// 遍历每一页收集X坐标
|
|||
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
|||
final int currentPage = pageIndex; // 用于匿名类中引用
|
|||
|
|||
// 为每页创建文本提取器
|
|||
TextBlockStripper stripper = new TextBlockStripper(); |
|||
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
|
|||
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
|
|||
stripper.setStartPage(pageIndex + 1); |
|||
stripper.setEndPage(pageIndex + 1); |
|||
stripper.setSortByPosition(true); |
|||
stripper.getText(document); |
|||
|
|||
for (TextBlock block : stripper.getTextBlocks()) { |
|||
String text = block.getText().trim(); |
|||
if (text.length() > 0) { |
|||
// 统计X坐标
|
|||
if (block.getX() < MAX_INDENT_X) { |
|||
xCoordinates.add(block.getX()); |
|||
} |
|||
|
|||
// 统计文本频率
|
|||
if (text.length() >= MIN_PARAGRAPH_LENGTH) { |
|||
textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (xCoordinates.isEmpty()) { |
|||
document.close(); |
|||
return paragraphs; |
|||
} |
|||
|
|||
// 找出频率超过页面数一半的文本
|
|||
int frequencyThreshold = totalPages / 2; |
|||
Set<String> frequentTexts = textFrequency.entrySet().stream() |
|||
.filter(entry -> entry.getValue() > frequencyThreshold) |
|||
.map(Map.Entry::getKey) |
|||
.collect(Collectors.toSet()); |
|||
|
|||
System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)"); |
|||
frequentTexts.forEach(text -> System.out.println("高频文本: " + |
|||
(text.length() > 50 ? text.substring(0, 47) + "..." : text) + |
|||
" 出现次数: " + textFrequency.get(text))); |
|||
|
|||
// 第二步:统计x坐标频率并找出前两名
|
|||
Map<Float, Long> xCounter = xCoordinates.stream() |
|||
.collect(Collectors.groupingBy(x -> x, Collectors.counting())); |
|||
|
|||
List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream() |
|||
.sorted(Map.Entry.<Float, Long>comparingByValue().reversed()) |
|||
.limit(2) |
|||
.collect(Collectors.toList()); |
|||
|
|||
if (mostCommonX.size() < 2) { |
|||
document.close(); |
|||
return paragraphs; |
|||
} |
|||
|
|||
// 确保x_indent > x_normal
|
|||
float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
|
|||
float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
|
|||
|
|||
System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE); |
|||
|
|||
// 第三步:根据基准x坐标提取段落
|
|||
List<String> currentParagraph = new ArrayList<>(); |
|||
int num=311; |
|||
|
|||
// 逐页处理文本块
|
|||
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
|||
List<TextBlock> pageTextBlocks = new ArrayList<>(); |
|||
|
|||
// 为每页创建文本提取器
|
|||
TextBlockStripper stripper = new TextBlockStripper(); |
|||
stripper.setSortByPosition(true); |
|||
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
|
|||
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
|
|||
stripper.setStartPage(pageIndex + 1); |
|||
stripper.setEndPage(pageIndex + 1); |
|||
stripper.getText(document); |
|||
|
|||
// 获取当前页的文本块并排序
|
|||
pageTextBlocks.addAll(stripper.getTextBlocks()); |
|||
pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); |
|||
|
|||
if(pageIndex==num){ |
|||
System.out.println(pageTextBlocks); |
|||
} |
|||
|
|||
// 处理当前页的文本块
|
|||
for (TextBlock block : pageTextBlocks) { |
|||
String lineText = block.getText().trim().replace('\n', ' ').trim(); |
|||
if (lineText.isEmpty()) { |
|||
continue; |
|||
} |
|||
|
|||
// 过滤高频文本
|
|||
if (frequentTexts.contains(lineText)) { |
|||
if (pageIndex == num) { |
|||
System.out.println("过滤高频文本: " + |
|||
(lineText.length() > 30 ? lineText.substring(0, 27) + "..." : lineText)); |
|||
} |
|||
continue; |
|||
} |
|||
|
|||
float currentX = block.getX(); |
|||
|
|||
// 判断当前x坐标属于哪种类型
|
|||
boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE; |
|||
boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE; |
|||
|
|||
// 如果是缩进位置,说明是新段落的开始
|
|||
if (isIndent) { |
|||
if (!currentParagraph.isEmpty()) { |
|||
String paragraphText = String.join("", currentParagraph); |
|||
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
|||
paragraphs.add(paragraphText); |
|||
} |
|||
currentParagraph.clear(); |
|||
} |
|||
if (lineText.length() >= MIN_PARAGRAPH_LENGTH) { |
|||
currentParagraph.add(lineText); |
|||
} |
|||
} |
|||
// 如果是正常位置,追加到当前段落
|
|||
else if (isNormal) { |
|||
if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
|
|||
currentParagraph.add(lineText); |
|||
} else { |
|||
currentParagraph.add(lineText); |
|||
} |
|||
} |
|||
// 如果既不是缩进也不是正常位置,作为独立段落
|
|||
else { |
|||
// 如果独立段落字数满足要求进行统计,不满足要求跳过
|
|||
if (lineText.length() >= MIN_PARAGRAPH_LENGTH) { |
|||
if (!currentParagraph.isEmpty()) { |
|||
String paragraphText = String.join("", currentParagraph); |
|||
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
|||
paragraphs.add(paragraphText); |
|||
} |
|||
currentParagraph.clear(); |
|||
} |
|||
paragraphs.add(lineText); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
// 处理最后一个段落
|
|||
if (!currentParagraph.isEmpty()) { |
|||
String paragraphText = String.join("", currentParagraph); |
|||
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
|||
paragraphs.add(paragraphText); |
|||
} |
|||
} |
|||
|
|||
document.close(); |
|||
|
|||
} catch (IOException e) { |
|||
logger.severe("提取PDF段落失败: " + e.getMessage()); |
|||
e.printStackTrace(); |
|||
} |
|||
|
|||
return paragraphs; |
|||
} |
|||
|
|||
/** |
|||
* 用于提取文本块的PDFTextStripper |
|||
*/ |
|||
private static class TextBlockStripper extends PDFTextStripper { |
|||
private final List<TextBlock> textBlocks = new ArrayList<>(); |
|||
private float lastY = -1; |
|||
private String currentLine = ""; |
|||
private float currentX = 0; |
|||
|
|||
public TextBlockStripper() throws IOException { |
|||
super(); |
|||
// 初始化
|
|||
textBlocks.clear(); |
|||
lastY = -1; |
|||
currentLine = ""; |
|||
currentX = 0; |
|||
} |
|||
|
|||
@Override |
|||
protected void processTextPosition(TextPosition text) { |
|||
float textX = text.getXDirAdj(); |
|||
float textY = text.getYDirAdj(); |
|||
float endX = text.getEndX(); |
|||
float endY = text.getEndY(); |
|||
// 如果Y坐标变化超过一定阈值,认为是新行
|
|||
if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) { |
|||
// 保存上一行
|
|||
if (!currentLine.trim().isEmpty()) { |
|||
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
|||
} |
|||
|
|||
// 开始新行
|
|||
currentLine = text.getUnicode(); |
|||
currentX = textX; |
|||
lastY = textY; |
|||
} else { |
|||
// 在同一行,追加文本
|
|||
currentLine += text.getUnicode(); |
|||
} |
|||
|
|||
super.processTextPosition(text); |
|||
} |
|||
|
|||
@Override |
|||
protected void startPage(PDPage page) throws IOException { |
|||
// 清空textBlocks列表,避免累积所有页面的内容
|
|||
textBlocks.clear(); |
|||
lastY = -1; |
|||
currentLine = ""; |
|||
currentX = 0; |
|||
super.startPage(page); |
|||
} |
|||
|
|||
@Override |
|||
public void endDocument(PDDocument document) throws IOException { |
|||
// 保存最后一行
|
|||
if (!currentLine.trim().isEmpty()) { |
|||
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
|||
} |
|||
super.endDocument(document); |
|||
} |
|||
|
|||
public List<TextBlock> getTextBlocks() { |
|||
return textBlocks; |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 用于存储文本块信息的类 |
|||
*/ |
|||
private static class TextBlock { |
|||
private final float x; |
|||
private final float y; |
|||
private final String text; |
|||
|
|||
public TextBlock(float x, float y, String text) { |
|||
this.x = x; |
|||
this.y = y; |
|||
this.text = text; |
|||
} |
|||
|
|||
public float getX() { |
|||
return x; |
|||
} |
|||
|
|||
public float getY() { |
|||
return y; |
|||
} |
|||
|
|||
public String getText() { |
|||
return text; |
|||
} |
|||
|
|||
@Override |
|||
public String toString() { |
|||
return "TextBlock{" + |
|||
"x=" + x + |
|||
", y=" + y + |
|||
", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\'' + |
|||
'}'; |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,373 @@ |
|||
package org.dromara.productManagement.utils; |
|||
|
|||
import org.apache.pdfbox.pdmodel.PDDocument; |
|||
import org.apache.pdfbox.pdmodel.PDPage; |
|||
import org.apache.pdfbox.text.PDFTextStripper; |
|||
import org.apache.pdfbox.text.TextPosition; |
|||
import org.apache.pdfbox.rendering.ImageType; |
|||
import org.slf4j.Logger; |
|||
import org.slf4j.LoggerFactory; |
|||
|
|||
import java.io.File; |
|||
import java.io.IOException; |
|||
import java.util.*; |
|||
import java.util.stream.Collectors; |
|||
|
|||
/**
 * PDF parsing utility.
 *
 * <p>Extracts the paragraphs of a PDF document, with first-line-indent detection and
 * filtering of high-frequency (repeated) text.
 *
 * <pre>{@code
 * // Use the default parameters
 * List<String> paragraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf");
 *
 * // Use custom parameters
 * List<String> customParagraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf",
 *     30,   // minimum paragraph length
 *     120,  // maximum indent x
 *     3.0f  // tolerance
 * );
 * }</pre>
 */
|||
public class PdfParserUtils { |
|||
|
|||
private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class); |
|||
|
|||
// 默认段落最小字数阈值
|
|||
private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20; |
|||
// 默认最大缩进值
|
|||
private static final float DEFAULT_MAX_INDENT_X = 100f; |
|||
// 默认容差范围
|
|||
private static final float DEFAULT_TOLERANCE = 2f; |
|||
|
|||
/** |
|||
* 从PDF文件中提取段落 |
|||
* |
|||
* @param filePath PDF文件路径 |
|||
* @return 提取的段落列表 |
|||
*/ |
|||
public static List<String> extractParagraphs(String filePath) { |
|||
return extractParagraphs(filePath, DEFAULT_MIN_PARAGRAPH_LENGTH, DEFAULT_MAX_INDENT_X, DEFAULT_TOLERANCE); |
|||
} |
|||
|
|||
/** |
|||
* 从PDF文件中提取段落,支持自定义参数 |
|||
* |
|||
* @param filePath PDF文件路径 |
|||
* @param minParagraphLength 最小段落长度 |
|||
* @param maxIndentX 最大缩进值 |
|||
* @param tolerance 容差范围 |
|||
* @return 提取的段落列表 |
|||
*/ |
|||
public static List<String> extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) { |
|||
List<String> paragraphs = new ArrayList<>(); |
|||
File pdfFile = new File(filePath); |
|||
|
|||
if (!pdfFile.exists() || !pdfFile.isFile()) { |
|||
log.error("PDF文件不存在: {}", filePath); |
|||
return paragraphs; |
|||
} |
|||
|
|||
try { |
|||
// 设置PDFBox选项,抑制字体警告
|
|||
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true"); |
|||
|
|||
// 打开PDF文档
|
|||
PDDocument document = PDDocument.load(pdfFile); |
|||
|
|||
// 设置PDFBox参数,提高对中文字体的兼容性
|
|||
document.setResourceCache(null); // 禁用资源缓存,可能减少某些字体问题
|
|||
|
|||
int totalPages = document.getNumberOfPages(); |
|||
|
|||
// 第一步:收集所有x坐标和重复文本
|
|||
List<Float> xCoordinates = new ArrayList<>(); |
|||
Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率
|
|||
|
|||
log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages); |
|||
|
|||
// 遍历每一页收集X坐标
|
|||
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
|||
try { |
|||
// 为每页创建文本提取器
|
|||
TextBlockStripper stripper = new TextBlockStripper(); |
|||
stripper.setSortByPosition(true); |
|||
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
|
|||
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
|
|||
stripper.setStartPage(pageIndex + 1); |
|||
stripper.setEndPage(pageIndex + 1); |
|||
stripper.getText(document); |
|||
|
|||
for (TextBlock block : stripper.getTextBlocks()) { |
|||
String text = block.getText().trim(); |
|||
if (text.length() > 0) { |
|||
// 统计X坐标
|
|||
if (block.getX() < maxIndentX) { |
|||
xCoordinates.add(block.getX()); |
|||
} |
|||
|
|||
// 统计文本频率
|
|||
if (text.length() >= minParagraphLength) { |
|||
textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1); |
|||
} |
|||
} |
|||
} |
|||
} catch (Exception e) { |
|||
log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage()); |
|||
// 继续处理下一页,而不是中断整个过程
|
|||
} |
|||
} |
|||
|
|||
if (xCoordinates.isEmpty()) { |
|||
log.warn("未找到有效的X坐标,无法提取段落"); |
|||
document.close(); |
|||
return paragraphs; |
|||
} |
|||
|
|||
// 找出频率超过页面数一半的文本(通常是页眉页脚等重复内容)
|
|||
int frequencyThreshold = totalPages / 2; |
|||
Set<String> frequentTexts = textFrequency.entrySet().stream() |
|||
.filter(entry -> entry.getValue() > frequencyThreshold) |
|||
.map(Map.Entry::getKey) |
|||
.collect(Collectors.toSet()); |
|||
|
|||
log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold); |
|||
|
|||
// 统计x坐标频率并找出前两名(通常是正常段落和首行缩进)
|
|||
Map<Float, Long> xCounter = xCoordinates.stream() |
|||
.collect(Collectors.groupingBy(x -> x, Collectors.counting())); |
|||
|
|||
List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream() |
|||
.sorted(Map.Entry.<Float, Long>comparingByValue().reversed()) |
|||
.limit(2) |
|||
.collect(Collectors.toList()); |
|||
|
|||
if (mostCommonX.size() < 2) { |
|||
log.warn("未找到足够的X坐标特征,无法区分段落缩进"); |
|||
document.close(); |
|||
return paragraphs; |
|||
} |
|||
|
|||
// 确保x_indent > x_normal
|
|||
float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
|
|||
float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
|
|||
|
|||
log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance); |
|||
|
|||
// 根据基准x坐标提取段落
|
|||
List<String> currentParagraph = new ArrayList<>(); |
|||
|
|||
// 逐页处理文本块
|
|||
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
|||
try { |
|||
List<TextBlock> pageTextBlocks = new ArrayList<>(); |
|||
|
|||
// 为每页创建文本提取器
|
|||
TextBlockStripper stripper = new TextBlockStripper(); |
|||
stripper.setSortByPosition(true); |
|||
stripper.setSuppressDuplicateOverlappingText(false); |
|||
stripper.setAddMoreFormatting(false); |
|||
stripper.setStartPage(pageIndex + 1); |
|||
stripper.setEndPage(pageIndex + 1); |
|||
stripper.getText(document); |
|||
|
|||
// 获取当前页的文本块并排序
|
|||
pageTextBlocks.addAll(stripper.getTextBlocks()); |
|||
pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); |
|||
|
|||
// 处理当前页的文本块
|
|||
for (TextBlock block : pageTextBlocks) { |
|||
String lineText = block.getText().trim().replace('\n', ' ').trim(); |
|||
if (lineText.isEmpty()) { |
|||
continue; |
|||
} |
|||
|
|||
// 过滤高频文本
|
|||
if (frequentTexts.contains(lineText)) { |
|||
continue; |
|||
} |
|||
|
|||
float currentX = block.getX(); |
|||
|
|||
// 判断当前x坐标属于哪种类型
|
|||
boolean isIndent = Math.abs(currentX - xIndent) <= tolerance; |
|||
boolean isNormal = Math.abs(currentX - xNormal) <= tolerance; |
|||
|
|||
// 如果是缩进位置,说明是新段落的开始
|
|||
if (isIndent) { |
|||
if (!currentParagraph.isEmpty()) { |
|||
String paragraphText = String.join("", currentParagraph); |
|||
if (paragraphText.length() >= minParagraphLength) { |
|||
paragraphs.add(paragraphText.trim()); |
|||
} |
|||
currentParagraph.clear(); |
|||
} |
|||
if (lineText.length() >= minParagraphLength) { |
|||
currentParagraph.add(lineText); |
|||
} |
|||
} |
|||
// 如果是正常位置,追加到当前段落
|
|||
else if (isNormal) { |
|||
if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
|
|||
currentParagraph.add(lineText); |
|||
} else { |
|||
currentParagraph.add(lineText); |
|||
} |
|||
} |
|||
// 如果既不是缩进也不是正常位置,作为独立段落
|
|||
else { |
|||
// 如果独立段落字数满足要求进行统计,不满足要求跳过
|
|||
if (lineText.length() >= minParagraphLength) { |
|||
if (!currentParagraph.isEmpty()) { |
|||
String paragraphText = String.join("", currentParagraph); |
|||
if (paragraphText.length() >= minParagraphLength) { |
|||
paragraphs.add(paragraphText.trim()); |
|||
} |
|||
currentParagraph.clear(); |
|||
} |
|||
paragraphs.add(lineText.trim()); |
|||
} |
|||
} |
|||
} |
|||
} catch (Exception e) { |
|||
log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage()); |
|||
// 继续处理下一页
|
|||
} |
|||
} |
|||
|
|||
// 处理最后一个段落
|
|||
if (!currentParagraph.isEmpty()) { |
|||
String paragraphText = String.join("", currentParagraph); |
|||
if (paragraphText.length() >= minParagraphLength) { |
|||
paragraphs.add(paragraphText.trim()); |
|||
} |
|||
} |
|||
|
|||
document.close(); |
|||
log.info("PDF解析完成,提取段落数: {}", paragraphs.size()); |
|||
|
|||
} catch (IOException e) { |
|||
log.error("提取PDF段落失败: {}", e.getMessage(), e); |
|||
} |
|||
|
|||
return paragraphs; |
|||
} |
|||
|
|||
/** |
|||
* 检查PDF文件是否可被有效解析 |
|||
* |
|||
* @param filePath PDF文件路径 |
|||
* @return 是否可解析 |
|||
*/ |
|||
public static boolean isValidPdf(String filePath) { |
|||
File pdfFile = new File(filePath); |
|||
if (!pdfFile.exists() || !pdfFile.isFile()) { |
|||
return false; |
|||
} |
|||
|
|||
try { |
|||
PDDocument document = PDDocument.load(pdfFile); |
|||
int pageCount = document.getNumberOfPages(); |
|||
document.close(); |
|||
return pageCount > 0; |
|||
} catch (Exception e) { |
|||
log.error("检查PDF有效性时出错: {}", e.getMessage()); |
|||
return false; |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 用于提取文本块的PDFTextStripper |
|||
*/ |
|||
private static class TextBlockStripper extends PDFTextStripper { |
|||
private final List<TextBlock> textBlocks = new ArrayList<>(); |
|||
private float lastY = -1; |
|||
private String currentLine = ""; |
|||
private float currentX = 0; |
|||
|
|||
public TextBlockStripper() throws IOException { |
|||
super(); |
|||
// 初始化
|
|||
textBlocks.clear(); |
|||
lastY = -1; |
|||
currentLine = ""; |
|||
currentX = 0; |
|||
} |
|||
|
|||
@Override |
|||
protected void processTextPosition(TextPosition text) { |
|||
try { |
|||
float textX = text.getXDirAdj(); |
|||
float textY = text.getYDirAdj(); |
|||
|
|||
// 如果Y坐标变化超过一定阈值,认为是新行
|
|||
if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) { |
|||
// 保存上一行
|
|||
if (!currentLine.trim().isEmpty()) { |
|||
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
|||
} |
|||
|
|||
// 开始新行
|
|||
currentLine = text.getUnicode(); |
|||
currentX = textX; |
|||
lastY = textY; |
|||
} else { |
|||
// 在同一行,追加文本
|
|||
currentLine += text.getUnicode(); |
|||
} |
|||
} catch (Exception e) { |
|||
// 忽略单个字符处理错误,继续处理其他字符
|
|||
log.debug("处理文本位置时出错: {}", e.getMessage()); |
|||
} |
|||
|
|||
super.processTextPosition(text); |
|||
} |
|||
|
|||
@Override |
|||
protected void startPage(PDPage page) throws IOException { |
|||
// 清空textBlocks列表,避免累积所有页面的内容
|
|||
textBlocks.clear(); |
|||
lastY = -1; |
|||
currentLine = ""; |
|||
currentX = 0; |
|||
super.startPage(page); |
|||
} |
|||
|
|||
@Override |
|||
public void endDocument(PDDocument document) throws IOException { |
|||
// 保存最后一行
|
|||
if (!currentLine.trim().isEmpty()) { |
|||
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
|||
} |
|||
super.endDocument(document); |
|||
} |
|||
|
|||
public List<TextBlock> getTextBlocks() { |
|||
return textBlocks; |
|||
} |
|||
} |
|||
|
|||
/** |
|||
* 用于存储文本块信息的类 |
|||
*/ |
|||
private static class TextBlock { |
|||
private final float x; |
|||
private final float y; |
|||
private final String text; |
|||
|
|||
public TextBlock(float x, float y, String text) { |
|||
this.x = x; |
|||
this.y = y; |
|||
this.text = text; |
|||
} |
|||
|
|||
public float getX() { |
|||
return x; |
|||
} |
|||
|
|||
public float getY() { |
|||
return y; |
|||
} |
|||
|
|||
public String getText() { |
|||
return text; |
|||
} |
|||
} |
|||
} |
Loading…
Reference in new issue