Browse Source

sjj 功能更新与优化3

sjj_dev
zhouhaibin 4 days ago
parent
commit
e3d69d5974
  1. 8
      ruoyi-admin/pom.xml
  2. 332
      ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java
  3. 373
      zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java

8
ruoyi-admin/pom.xml

@ -197,6 +197,14 @@
<!-- <version>${与你的agent探针版本保持一致}</version>--> <!-- <version>${与你的agent探针版本保持一致}</version>-->
<!-- </dependency>--> <!-- </dependency>-->
<!-- PDFBox 依赖 -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.27</version>
<scope>test</scope>
</dependency>
</dependencies> </dependencies>
<build> <build>

332
ruoyi-admin/src/test/java/org/dromara/test/PdfExtractorTest.java

@ -0,0 +1,332 @@
package org.dromara.test;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.logging.Logger;
import java.util.stream.Collectors;
/**
* PDF段落提取测试
*/
@DisplayName("PDF段落提取测试")
public class PdfExtractorTest {
private static final Logger logger = Logger.getLogger(PdfExtractorTest.class.getName());
// 段落最小字数阈值
private static final int MIN_PARAGRAPH_LENGTH = 20;
// 最大缩进值
private static final float MAX_INDENT_X = 100f;
// 容差范围
private static final float TOLERANCE = 2f;
@Test
@DisplayName("测试PDF段落提取")
public void testExtractParagraphs() {
String pdfPath = "C:\\Users\\gy051\\Desktop\\商务技术响应文件-金丰印务.pdf"; // 替换为实际PDF路径
List<String> paragraphs = extractParagraphsFromPdf(pdfPath);
System.out.println("提取段落总数: " + paragraphs.size());
for (int i = 0; i < paragraphs.size(); i++) {
if(paragraphs.get(i).length() >= MIN_PARAGRAPH_LENGTH){
System.out.println("段落" + (i + 1) + ": " + paragraphs.get(i).trim());
}
}
}
/**
* 从PDF文件中提取段落基于x坐标统计来判断段落
*/
public List<String> extractParagraphsFromPdf(String filePath) {
List<String> paragraphs = new ArrayList<>();
File pdfFile = new File(filePath);
try {
// 打开PDF文档
PDDocument document = PDDocument.load(pdfFile);
int totalPages = document.getNumberOfPages();
// 第一步:收集所有x坐标和重复文本
List<Float> xCoordinates = new ArrayList<>();
Map<String, Integer> textFrequency = new HashMap<>(); // 记录文本出现频率
// 遍历每一页收集X坐标
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
final int currentPage = pageIndex; // 用于匿名类中引用
// 为每页创建文本提取器
TextBlockStripper stripper = new TextBlockStripper();
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
stripper.setStartPage(pageIndex + 1);
stripper.setEndPage(pageIndex + 1);
stripper.setSortByPosition(true);
stripper.getText(document);
for (TextBlock block : stripper.getTextBlocks()) {
String text = block.getText().trim();
if (text.length() > 0) {
// 统计X坐标
if (block.getX() < MAX_INDENT_X) {
xCoordinates.add(block.getX());
}
// 统计文本频率
if (text.length() >= MIN_PARAGRAPH_LENGTH) {
textFrequency.put(text, textFrequency.getOrDefault(text, 0) + 1);
}
}
}
}
if (xCoordinates.isEmpty()) {
document.close();
return paragraphs;
}
// 找出频率超过页面数一半的文本
int frequencyThreshold = totalPages / 2;
Set<String> frequentTexts = textFrequency.entrySet().stream()
.filter(entry -> entry.getValue() > frequencyThreshold)
.map(Map.Entry::getKey)
.collect(Collectors.toSet());
System.out.println("发现" + frequentTexts.size() + "个高频文本(出现>" + frequencyThreshold + "次)");
frequentTexts.forEach(text -> System.out.println("高频文本: " +
(text.length() > 50 ? text.substring(0, 47) + "..." : text) +
" 出现次数: " + textFrequency.get(text)));
// 第二步:统计x坐标频率并找出前两名
Map<Float, Long> xCounter = xCoordinates.stream()
.collect(Collectors.groupingBy(x -> x, Collectors.counting()));
List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream()
.sorted(Map.Entry.<Float, Long>comparingByValue().reversed())
.limit(2)
.collect(Collectors.toList());
if (mostCommonX.size() < 2) {
document.close();
return paragraphs;
}
// 确保x_indent > x_normal
float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 无缩进坐标
float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey()); // 缩进坐标
System.out.println("最终使用的坐标值:x_normal=" + xNormal + ", x_indent=" + xIndent + ", tolerance=" + TOLERANCE);
// 第三步:根据基准x坐标提取段落
List<String> currentParagraph = new ArrayList<>();
int num=311;
// 逐页处理文本块
for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
List<TextBlock> pageTextBlocks = new ArrayList<>();
// 为每页创建文本提取器
TextBlockStripper stripper = new TextBlockStripper();
stripper.setSortByPosition(true);
stripper.setSuppressDuplicateOverlappingText(false); // 不要抑制重叠文本,这可能导致中文文本丢失
stripper.setAddMoreFormatting(false); // 减少格式化可能提高中文处理的准确性
stripper.setStartPage(pageIndex + 1);
stripper.setEndPage(pageIndex + 1);
stripper.getText(document);
// 获取当前页的文本块并排序
pageTextBlocks.addAll(stripper.getTextBlocks());
pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
if(pageIndex==num){
System.out.println(pageTextBlocks);
}
// 处理当前页的文本块
for (TextBlock block : pageTextBlocks) {
String lineText = block.getText().trim().replace('\n', ' ').trim();
if (lineText.isEmpty()) {
continue;
}
// 过滤高频文本
if (frequentTexts.contains(lineText)) {
if (pageIndex == num) {
System.out.println("过滤高频文本: " +
(lineText.length() > 30 ? lineText.substring(0, 27) + "..." : lineText));
}
continue;
}
float currentX = block.getX();
// 判断当前x坐标属于哪种类型
boolean isIndent = Math.abs(currentX - xIndent) <= TOLERANCE;
boolean isNormal = Math.abs(currentX - xNormal) <= TOLERANCE;
// 如果是缩进位置,说明是新段落的开始
if (isIndent) {
if (!currentParagraph.isEmpty()) {
String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
paragraphs.add(paragraphText);
}
currentParagraph.clear();
}
if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
currentParagraph.add(lineText);
}
}
// 如果是正常位置,追加到当前段落
else if (isNormal) {
if (currentParagraph.isEmpty()) { // 如果还没有段落,创建新段落
currentParagraph.add(lineText);
} else {
currentParagraph.add(lineText);
}
}
// 如果既不是缩进也不是正常位置,作为独立段落
else {
// 如果独立段落字数满足要求进行统计,不满足要求跳过
if (lineText.length() >= MIN_PARAGRAPH_LENGTH) {
if (!currentParagraph.isEmpty()) {
String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
paragraphs.add(paragraphText);
}
currentParagraph.clear();
}
paragraphs.add(lineText);
}
}
}
}
// 处理最后一个段落
if (!currentParagraph.isEmpty()) {
String paragraphText = String.join("", currentParagraph);
if (paragraphText.length() >= MIN_PARAGRAPH_LENGTH) {
paragraphs.add(paragraphText);
}
}
document.close();
} catch (IOException e) {
logger.severe("提取PDF段落失败: " + e.getMessage());
e.printStackTrace();
}
return paragraphs;
}
/**
* 用于提取文本块的PDFTextStripper
*/
private static class TextBlockStripper extends PDFTextStripper {
private final List<TextBlock> textBlocks = new ArrayList<>();
private float lastY = -1;
private String currentLine = "";
private float currentX = 0;
public TextBlockStripper() throws IOException {
super();
// 初始化
textBlocks.clear();
lastY = -1;
currentLine = "";
currentX = 0;
}
@Override
protected void processTextPosition(TextPosition text) {
float textX = text.getXDirAdj();
float textY = text.getYDirAdj();
float endX = text.getEndX();
float endY = text.getEndY();
// 如果Y坐标变化超过一定阈值,认为是新行
if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
// 保存上一行
if (!currentLine.trim().isEmpty()) {
textBlocks.add(new TextBlock(currentX, lastY, currentLine));
}
// 开始新行
currentLine = text.getUnicode();
currentX = textX;
lastY = textY;
} else {
// 在同一行,追加文本
currentLine += text.getUnicode();
}
super.processTextPosition(text);
}
@Override
protected void startPage(PDPage page) throws IOException {
// 清空textBlocks列表,避免累积所有页面的内容
textBlocks.clear();
lastY = -1;
currentLine = "";
currentX = 0;
super.startPage(page);
}
@Override
public void endDocument(PDDocument document) throws IOException {
// 保存最后一行
if (!currentLine.trim().isEmpty()) {
textBlocks.add(new TextBlock(currentX, lastY, currentLine));
}
super.endDocument(document);
}
public List<TextBlock> getTextBlocks() {
return textBlocks;
}
}
/**
* 用于存储文本块信息的类
*/
private static class TextBlock {
private final float x;
private final float y;
private final String text;
public TextBlock(float x, float y, String text) {
this.x = x;
this.y = y;
this.text = text;
}
public float getX() {
return x;
}
public float getY() {
return y;
}
public String getText() {
return text;
}
@Override
public String toString() {
return "TextBlock{" +
"x=" + x +
", y=" + y +
", text='" + (text.length() > 30 ? text.substring(0, 27) + "..." : text) + '\'' +
'}';
}
}
}

373
zaojiaManagement/zaojia-productManagement/src/main/java/org/dromara/productManagement/utils/PdfParserUtils.java

@ -0,0 +1,373 @@
package org.dromara.productManagement.utils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.rendering.ImageType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Utility for extracting paragraphs from a PDF document with PDFBox.
 *
 * <p>Paragraph boundaries are inferred from the x coordinate at which each text line starts: the
 * two most frequent start positions below {@code maxIndentX} are taken as the body margin
 * (smaller x) and the first-line indent (larger x). Lines whose occurrence count exceeds half the
 * page count are discarded as presumed headers/footers.
 *
 * <pre>{@code
 * // default parameters
 * List<String> paragraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf");
 *
 * // custom parameters
 * List<String> customParagraphs = PdfParserUtils.extractParagraphs("path/to/your.pdf",
 *     30,   // minimum paragraph length
 *     120,  // maximum indent x
 *     3.0f  // tolerance
 * );
 * }</pre>
 */
public class PdfParserUtils {
    private static final Logger log = LoggerFactory.getLogger(PdfParserUtils.class);

    /** Default minimum number of characters for a line/paragraph to be kept. */
    private static final int DEFAULT_MIN_PARAGRAPH_LENGTH = 20;

    /** Default upper bound on the line-start x values used for margin/indent statistics. */
    private static final float DEFAULT_MAX_INDENT_X = 100f;

    /** Default allowed deviation when matching a line start against margin/indent. */
    private static final float DEFAULT_TOLERANCE = 2f;

    /**
     * Extracts paragraphs from a PDF file using the default parameters.
     *
     * @param filePath path of the PDF file
     * @return the extracted paragraphs; empty when nothing could be extracted
     */
    public static List<String> extractParagraphs(String filePath) {
        return extractParagraphs(filePath, DEFAULT_MIN_PARAGRAPH_LENGTH, DEFAULT_MAX_INDENT_X, DEFAULT_TOLERANCE);
    }

    /**
     * Extracts paragraphs from a PDF file with caller-supplied parameters.
     *
     * <p>Failures are logged rather than thrown: a missing file, an unreadable document, or a
     * document with fewer than two distinct line-start positions yields an empty or partial list.
     * A page that fails to parse is skipped and processing continues with the next page.
     *
     * @param filePath           path of the PDF file; {@code null} is treated as a missing file
     * @param minParagraphLength minimum number of characters for a line/paragraph to be kept
     * @param maxIndentX         only line starts left of this x value enter the statistics
     * @param tolerance          allowed deviation when matching a line start against margin/indent
     * @return the extracted paragraphs, never {@code null}
     */
    public static List<String> extractParagraphs(String filePath, int minParagraphLength, float maxIndentX, float tolerance) {
        List<String> paragraphs = new ArrayList<>();
        File pdfFile = filePath == null ? null : new File(filePath);
        if (pdfFile == null || !pdfFile.exists() || !pdfFile.isFile()) {
            log.error("PDF文件不存在: {}", filePath);
            return paragraphs;
        }

        // Retained from the original implementation.
        // NOTE(review): the property name refers to PDFBox's rendering-side CMYK conversion and
        // is set process-wide on every call — confirm it is still wanted for text extraction.
        System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");

        // try-with-resources guarantees the document is closed on every path, including failures.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            // Disable the resource cache (original intent: reduce font-related problems).
            document.setResourceCache(null);
            int totalPages = document.getNumberOfPages();
            log.info("开始解析PDF文件: {}, 总页数: {}", filePath, totalPages);

            // Pass 1: collect line-start x coordinates and how often each line text occurs.
            List<Float> xCoordinates = new ArrayList<>();
            Map<String, Integer> textFrequency = new HashMap<>();
            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
                try {
                    for (TextBlock block : readPageBlocks(document, pageIndex + 1)) {
                        String text = normalizeLine(block.getText());
                        if (text.isEmpty()) {
                            continue;
                        }
                        if (block.getX() < maxIndentX) {
                            xCoordinates.add(block.getX());
                        }
                        if (text.length() >= minParagraphLength) {
                            textFrequency.merge(text, 1, Integer::sum);
                        }
                    }
                } catch (Exception e) {
                    // Best effort: a broken page must not abort the whole document.
                    log.warn("处理第{}页时发生错误: {}", pageIndex + 1, e.getMessage(), e);
                }
            }
            if (xCoordinates.isEmpty()) {
                log.warn("未找到有效的X坐标,无法提取段落");
                return paragraphs;
            }

            // A line counts as "frequent" when it occurs more often than half the page count.
            // The threshold is clamped to 1 so that a single-page document (1 / 2 == 0) does not
            // classify every line that occurs once as a repeated header/footer.
            int frequencyThreshold = Math.max(1, totalPages / 2);
            Set<String> frequentTexts = textFrequency.entrySet().stream()
                    .filter(entry -> entry.getValue() > frequencyThreshold)
                    .map(Map.Entry::getKey)
                    .collect(Collectors.toSet());
            log.info("发现{}个高频文本(出现>{}次)", frequentTexts.size(), frequencyThreshold);

            // The two most frequent x values are taken as the body margin and the indent.
            Map<Float, Long> xCounter = xCoordinates.stream()
                    .collect(Collectors.groupingBy(x -> x, Collectors.counting()));
            List<Map.Entry<Float, Long>> mostCommonX = xCounter.entrySet().stream()
                    .sorted(Map.Entry.<Float, Long>comparingByValue().reversed())
                    .limit(2)
                    .collect(Collectors.toList());
            if (mostCommonX.size() < 2) {
                log.warn("未找到足够的X坐标特征,无法区分段落缩进");
                return paragraphs;
            }
            // The smaller x is the unindented margin, the larger x the first-line indent.
            float xNormal = Math.min(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey());
            float xIndent = Math.max(mostCommonX.get(0).getKey(), mostCommonX.get(1).getKey());
            log.info("使用的坐标值:x_normal={}, x_indent={}, tolerance={}", xNormal, xIndent, tolerance);

            // Pass 2: walk the lines page by page and assemble paragraphs.
            // The pending paragraph is carried across page boundaries.
            List<String> currentParagraph = new ArrayList<>();
            for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
                try {
                    List<TextBlock> pageTextBlocks = new ArrayList<>(readPageBlocks(document, pageIndex + 1));
                    pageTextBlocks.sort(Comparator.comparing(TextBlock::getY));
                    for (TextBlock block : pageTextBlocks) {
                        String lineText = normalizeLine(block.getText());
                        if (lineText.isEmpty() || frequentTexts.contains(lineText)) {
                            continue;
                        }
                        float currentX = block.getX();
                        boolean isIndent = Math.abs(currentX - xIndent) <= tolerance;
                        boolean isNormal = Math.abs(currentX - xNormal) <= tolerance;
                        boolean longEnough = lineText.length() >= minParagraphLength;
                        if (isIndent) {
                            // An indented line closes the pending paragraph and may open a new one.
                            flushParagraph(currentParagraph, paragraphs, minParagraphLength);
                            if (longEnough) {
                                currentParagraph.add(lineText);
                            }
                        } else if (isNormal) {
                            // A line at the body margin continues (or implicitly starts) a paragraph.
                            currentParagraph.add(lineText);
                        } else if (longEnough) {
                            // Any other position is a standalone paragraph; short ones are skipped.
                            flushParagraph(currentParagraph, paragraphs, minParagraphLength);
                            paragraphs.add(lineText);
                        }
                    }
                } catch (Exception e) {
                    // Best effort: continue with the next page.
                    log.warn("处理第{}页段落提取时发生错误: {}", pageIndex + 1, e.getMessage(), e);
                }
            }
            // Emit whatever is still pending after the last page.
            flushParagraph(currentParagraph, paragraphs, minParagraphLength);
            log.info("PDF解析完成,提取段落数: {}", paragraphs.size());
        } catch (IOException e) {
            log.error("提取PDF段落失败: {}", e.getMessage(), e);
        }
        return paragraphs;
    }

    /**
     * Checks whether a PDF file can be opened and contains at least one page.
     *
     * @param filePath path of the PDF file; {@code null} yields {@code false}
     * @return {@code true} if the file exists, loads without error and has one or more pages
     */
    public static boolean isValidPdf(String filePath) {
        if (filePath == null) {
            return false;
        }
        File pdfFile = new File(filePath);
        if (!pdfFile.exists() || !pdfFile.isFile()) {
            return false;
        }
        try (PDDocument document = PDDocument.load(pdfFile)) {
            return document.getNumberOfPages() > 0;
        } catch (Exception e) {
            log.error("检查PDF有效性时出错: {}", e.getMessage(), e);
            return false;
        }
    }

    /** Runs a freshly configured stripper over one page (1-based) and returns its line blocks. */
    private static List<TextBlock> readPageBlocks(PDDocument document, int pageNumber) throws IOException {
        TextBlockStripper stripper = new TextBlockStripper();
        stripper.setSortByPosition(true);
        // Kept from the original: duplicate suppression was observed to drop Chinese text.
        stripper.setSuppressDuplicateOverlappingText(false);
        stripper.setAddMoreFormatting(false);
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);
        stripper.getText(document);
        return stripper.getTextBlocks();
    }

    /** Canonical form of a line used both for frequency counting and for filtering. */
    private static String normalizeLine(String raw) {
        return raw.trim().replace('\n', ' ').trim();
    }

    /** Joins the pending lines, keeps the result if it is long enough, and resets the buffer. */
    private static void flushParagraph(List<String> currentParagraph, List<String> paragraphs, int minParagraphLength) {
        if (currentParagraph.isEmpty()) {
            return;
        }
        String paragraphText = String.join("", currentParagraph);
        if (paragraphText.length() >= minParagraphLength) {
            paragraphs.add(paragraphText);
        }
        currentParagraph.clear();
    }

    /**
     * {@link PDFTextStripper} that groups glyphs into lines and records, for each line, the
     * x/y position of its first glyph.
     */
    private static class TextBlockStripper extends PDFTextStripper {
        private final List<TextBlock> textBlocks = new ArrayList<>();
        private final StringBuilder currentLine = new StringBuilder();
        // -1 marks "no line started yet".
        private float lastY = -1;
        private float currentX = 0;

        public TextBlockStripper() throws IOException {
            super();
        }

        @Override
        protected void processTextPosition(TextPosition text) {
            try {
                float textY = text.getYDirAdj();
                // A vertical jump of more than half the font size is treated as a new line.
                if (lastY == -1 || Math.abs(textY - lastY) > text.getFontSizeInPt() * 0.5) {
                    flushCurrentLine();
                    currentLine.append(text.getUnicode());
                    currentX = text.getXDirAdj();
                    lastY = textY;
                } else {
                    currentLine.append(text.getUnicode());
                }
            } catch (Exception e) {
                // A single bad glyph must not abort the page; keep going with the next one.
                log.debug("处理文本位置时出错: {}", e.getMessage());
            }
            super.processTextPosition(text);
        }

        @Override
        protected void startPage(PDPage page) throws IOException {
            // Reset so that blocks from a previously processed page do not accumulate.
            textBlocks.clear();
            currentLine.setLength(0);
            lastY = -1;
            currentX = 0;
            super.startPage(page);
        }

        @Override
        public void endDocument(PDDocument document) throws IOException {
            // The last line has no following glyph to trigger a flush, so emit it here.
            flushCurrentLine();
            super.endDocument(document);
        }

        /** Stores the buffered line unless it is blank, then empties the buffer. */
        private void flushCurrentLine() {
            String line = currentLine.toString();
            if (!line.trim().isEmpty()) {
                textBlocks.add(new TextBlock(currentX, lastY, line));
            }
            currentLine.setLength(0);
        }

        public List<TextBlock> getTextBlocks() {
            return textBlocks;
        }
    }

    /** Immutable line of text together with the position of its first glyph. */
    private static class TextBlock {
        private final float x;
        private final float y;
        private final String text;

        public TextBlock(float x, float y, String text) {
            this.x = x;
            this.y = y;
            this.text = text;
        }

        public float getX() {
            return x;
        }

        public float getY() {
            return y;
        }

        public String getText() {
            return text;
        }
    }
}
Loading…
Cancel
Save