12 changed files with 693 additions and 504 deletions
@ -1,332 +0,0 @@ |
|||||
package org.dromara.test; |
|
||||
|
|
||||
import org.apache.pdfbox.pdmodel.PDDocument; |
|
||||
import org.apache.pdfbox.pdmodel.PDPage; |
|
||||
import org.apache.pdfbox.text.PDFTextStripper; |
|
||||
import org.apache.pdfbox.text.TextPosition; |
|
||||
import org.junit.jupiter.api.DisplayName; |
|
||||
import org.junit.jupiter.api.Test; |
|
||||
|
|
||||
import java.io.File; |
|
||||
import java.io.IOException; |
|
||||
import java.util.*; |
|
||||
import java.util.logging.Logger; |
|
||||
import java.util.stream.Collectors; |
|
||||
|
|
||||
/** |
|
||||
* PDF段落提取测试 |
|
||||
*/ |
|
||||
@DisplayName("PDF段落提取测试") |
|
||||
public class PdfExtractorTest { |
|
||||
|
|
||||
private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName()); |
|
||||
|
|
||||
// 段落最小字数阈值
|
|
||||
private static final int MIN_PARAGRAPH_LENGTH = 20; |
|
||||
// 最大缩进值
|
|
||||
private static final float MAX_INDENT_X = 100f; |
|
||||
// 容差范围
|
|
||||
private static final float TOLERANCE = 2f; |
|
||||
|
|
||||
@Test |
|
||||
@DisplayName("测试PDF段落提取") |
|
||||
public void testExtractParagraphs() { |
|
||||
String pdfPath = "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf"; // 替换为实际PDF路径
|
|
||||
List<String> paragraphs = extractParagraphsFromPdf(pdfPath); |
|
||||
|
|
||||
System.out.println("提取段落总数: " + paragraphs.size()); |
|
||||
for (int i = 0; i < paragraphs.size(); i++) { |
|
||||
if(paragraphs.get(i).length() >= MIN_PARAGRAPH_LENGTH){ |
|
||||
System.out.println("段落" + (i + 1) + ": " + paragraphs.get(i).trim()); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
/** |
|
||||
* 从PDF文件中提取段落,基于x坐标统计来判断段落 |
|
||||
*/ |
|
||||
public List<String> extractParagraphsFromPdf(String filePath) { |
|
||||
List<String> paragraphs = new ArrayList<>(); |
|
||||
File pdfFile = new File(filePath); |
|
||||
|
|
||||
try { |
|
||||
// 打开PDF文档
|
|
||||
PDDocument document = PDDocument.load(pdfFile); |
|
||||
int totalPages = document.getNumberOfPages(); |
|
||||
|
|
||||
// 第一步:收集所有x坐标和重复文本
|
|
||||
List<Float> xCoordinates = new ArrayList<>(); |
|
||||
Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率
|
|
||||
|
|
||||
// 遍历每一页收集X坐标
|
|
||||
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
|
||||
final int currentPage = pageIndex; // 用于匿名类中引用
|
|
||||
|
|
||||
// 为每页创建文本提取器
|
|
||||
TextBlockStripper stripper = new TextBlockStripper(); |
|
||||
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
|
|
||||
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
|
|
||||
stripper.setStartPage(pageIndex + 1); |
|
||||
stripper.setEndPage(pageIndex + 1); |
|
||||
stripper.setSortByPosition(true); |
|
||||
stripper.getText(document); |
|
||||
|
|
||||
for (TextBlock block : stripper.getTextBlocks()) { |
|
||||
String text = block.getText().trim(); |
|
||||
if (text.length() > 0) { |
|
||||
// 统计X坐标
|
|
||||
if (block.getX() < MAX_INDENT_X) { |
|
||||
xCoordinates.add(block.getX()); |
|
||||
} |
|
||||
|
|
||||
// 统计文本频率
|
|
||||
if (text.length() >= MIN_PARAGRAPH_LENGTH) { |
|
||||
textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
if (xCoordinates.isEmpty()) { |
|
||||
document.close(); |
|
||||
return paragraphs; |
|
||||
} |
|
||||
|
|
||||
// 找出频率超过页面数一半的文本
|
|
||||
int frequencyThreshold = totalPages / 2; |
|
||||
Set<String> frequentTexts = textFrequency.entrySet().stream() |
|
||||
.filter(entry -> entry.getValue() > frequencyThreshold) |
|
||||
.map(Map.Entry::getKey) |
|
||||
.collect(Collectors.toSet()); |
|
||||
|
|
||||
System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)"); |
|
||||
frequentTexts.forEach(text -> System.out.println("高频文本: " + |
|
||||
(text.length() > 50 ? text.substring(0, 47) + "..." : text) + |
|
||||
" 出现次数: " + textFrequency.get(text))); |
|
||||
|
|
||||
// 第二步:统计x坐标频率并找出前两名
|
|
||||
Map<Float, Long> xCounter = xCoordinates.stream() |
|
||||
.collect(Collectors.groupingBy(x -> x, Collectors.counting())); |
|
||||
|
|
||||
List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream() |
|
||||
.sorted(Map.Entry.<Float, Long>comparingByValue().reversed()) |
|
||||
.limit(2) |
|
||||
.collect(Collectors.toList()); |
|
||||
|
|
||||
if (mostCommonX.size() < 2) { |
|
||||
document.close(); |
|
||||
return paragraphs; |
|
||||
} |
|
||||
|
|
||||
// 确保x_indent > x_normal
|
|
||||
float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
|
|
||||
float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
|
|
||||
|
|
||||
System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE); |
|
||||
|
|
||||
// 第三步:根据基准x坐标提取段落
|
|
||||
List<String> currentParagraph = new ArrayList<>(); |
|
||||
int num=311; |
|
||||
|
|
||||
// 逐页处理文本块
|
|
||||
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) { |
|
||||
List<TextBlock> pageTextBlocks = new ArrayList<>(); |
|
||||
|
|
||||
// 为每页创建文本提取器
|
|
||||
TextBlockStripper stripper = new TextBlockStripper(); |
|
||||
stripper.setSortByPosition(true); |
|
||||
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
|
|
||||
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
|
|
||||
stripper.setStartPage(pageIndex + 1); |
|
||||
stripper.setEndPage(pageIndex + 1); |
|
||||
stripper.getText(document); |
|
||||
|
|
||||
// 获取当前页的文本块并排序
|
|
||||
pageTextBlocks.addAll(stripper.getTextBlocks()); |
|
||||
pageTextBlocks.sort(Comparator.comparing(TextBlock::getY)); |
|
||||
|
|
||||
if(pageIndex==num){ |
|
||||
System.out.println(pageTextBlocks); |
|
||||
} |
|
||||
|
|
||||
// 处理当前页的文本块
|
|
||||
for (TextBlock block : pageTextBlocks) { |
|
||||
String lineText = block.getText().trim().replace('\n', ' ').trim(); |
|
||||
if (lineText.isEmpty()) { |
|
||||
continue; |
|
||||
} |
|
||||
|
|
||||
// 过滤高频文本
|
|
||||
if (frequentTexts.contains(lineText)) { |
|
||||
if (pageIndex == num) { |
|
||||
System.out.println("过滤高频文本: " + |
|
||||
(lineText.length() > 30 ? lineText.substring(0, 27) + "..." : lineText)); |
|
||||
} |
|
||||
continue; |
|
||||
} |
|
||||
|
|
||||
float currentX = block.getX(); |
|
||||
|
|
||||
// 判断当前x坐标属于哪种类型
|
|
||||
boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE; |
|
||||
boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE; |
|
||||
|
|
||||
// 如果是缩进位置,说明是新段落的开始
|
|
||||
if (isIndent) { |
|
||||
if (!currentParagraph.isEmpty()) { |
|
||||
String paragraphText = String.join("", currentParagraph); |
|
||||
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
|
||||
paragraphs.add(paragraphText); |
|
||||
} |
|
||||
currentParagraph.clear(); |
|
||||
} |
|
||||
if (lineText.length() >= MIN_PARAGRAPH_LENGTH) { |
|
||||
currentParagraph.add(lineText); |
|
||||
} |
|
||||
} |
|
||||
// 如果是正常位置,追加到当前段落
|
|
||||
else if (isNormal) { |
|
||||
if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
|
|
||||
currentParagraph.add(lineText); |
|
||||
} else { |
|
||||
currentParagraph.add(lineText); |
|
||||
} |
|
||||
} |
|
||||
// 如果既不是缩进也不是正常位置,作为独立段落
|
|
||||
else { |
|
||||
// 如果独立段落字数满足要求进行统计,不满足要求跳过
|
|
||||
if (lineText.length() >= MIN_PARAGRAPH_LENGTH) { |
|
||||
if (!currentParagraph.isEmpty()) { |
|
||||
String paragraphText = String.join("", currentParagraph); |
|
||||
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
|
||||
paragraphs.add(paragraphText); |
|
||||
} |
|
||||
currentParagraph.clear(); |
|
||||
} |
|
||||
paragraphs.add(lineText); |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
// 处理最后一个段落
|
|
||||
if (!currentParagraph.isEmpty()) { |
|
||||
String paragraphText = String.join("", currentParagraph); |
|
||||
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) { |
|
||||
paragraphs.add(paragraphText); |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
document.close(); |
|
||||
|
|
||||
} catch (IOException e) { |
|
||||
logger.severe("提取PDF段落失败: " + e.getMessage()); |
|
||||
e.printStackTrace(); |
|
||||
} |
|
||||
|
|
||||
return paragraphs; |
|
||||
} |
|
||||
|
|
||||
/** |
|
||||
* 用于提取文本块的PDFTextStripper |
|
||||
*/ |
|
||||
private static class TextBlockStripper extends PDFTextStripper { |
|
||||
private final List<TextBlock> textBlocks = new ArrayList<>(); |
|
||||
private float lastY = -1; |
|
||||
private String currentLine = ""; |
|
||||
private float currentX = 0; |
|
||||
|
|
||||
public TextBlockStripper() throws IOException { |
|
||||
super(); |
|
||||
// 初始化
|
|
||||
textBlocks.clear(); |
|
||||
lastY = -1; |
|
||||
currentLine = ""; |
|
||||
currentX = 0; |
|
||||
} |
|
||||
|
|
||||
@Override |
|
||||
protected void processTextPosition(TextPosition text) { |
|
||||
float textX = text.getXDirAdj(); |
|
||||
float textY = text.getYDirAdj(); |
|
||||
float endX = text.getEndX(); |
|
||||
float endY = text.getEndY(); |
|
||||
// 如果Y坐标变化超过一定阈值,认为是新行
|
|
||||
if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) { |
|
||||
// 保存上一行
|
|
||||
if (!currentLine.trim().isEmpty()) { |
|
||||
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
|
||||
} |
|
||||
|
|
||||
// 开始新行
|
|
||||
currentLine = text.getUnicode(); |
|
||||
currentX = textX; |
|
||||
lastY = textY; |
|
||||
} else { |
|
||||
// 在同一行,追加文本
|
|
||||
currentLine += text.getUnicode(); |
|
||||
} |
|
||||
|
|
||||
super.processTextPosition(text); |
|
||||
} |
|
||||
|
|
||||
@Override |
|
||||
protected void startPage(PDPage page) throws IOException { |
|
||||
// 清空textBlocks列表,避免累积所有页面的内容
|
|
||||
textBlocks.clear(); |
|
||||
lastY = -1; |
|
||||
currentLine = ""; |
|
||||
currentX = 0; |
|
||||
super.startPage(page); |
|
||||
} |
|
||||
|
|
||||
@Override |
|
||||
public void endDocument(PDDocument document) throws IOException { |
|
||||
// 保存最后一行
|
|
||||
if (!currentLine.trim().isEmpty()) { |
|
||||
textBlocks.add(new TextBlock(currentX, lastY, currentLine)); |
|
||||
} |
|
||||
super.endDocument(document); |
|
||||
} |
|
||||
|
|
||||
public List<TextBlock> getTextBlocks() { |
|
||||
return textBlocks; |
|
||||
} |
|
||||
} |
|
||||
|
|
||||
/** |
|
||||
* 用于存储文本块信息的类 |
|
||||
*/ |
|
||||
private static class TextBlock { |
|
||||
private final float x; |
|
||||
private final float y; |
|
||||
private final String text; |
|
||||
|
|
||||
public TextBlock(float x, float y, String text) { |
|
||||
this.x = x; |
|
||||
this.y = y; |
|
||||
this.text = text; |
|
||||
} |
|
||||
|
|
||||
public float getX() { |
|
||||
return x; |
|
||||
} |
|
||||
|
|
||||
public float getY() { |
|
||||
return y; |
|
||||
} |
|
||||
|
|
||||
public String getText() { |
|
||||
return text; |
|
||||
} |
|
||||
|
|
||||
@Override |
|
||||
public String toString() { |
|
||||
return "TextBlock{" + |
|
||||
"x=" + x + |
|
||||
", y=" + y + |
|
||||
", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\'' + |
|
||||
'}'; |
|
||||
} |
|
||||
} |
|
||||
} |
|
Loading…
Reference in new issue