因为水印内容一般是有倾斜度的,所以通过判断文字的倾斜度就可以去掉水印内容了。
PDFTextStripper.getText(document)底层是通过writeString(String text, List<TextPosition> textPositions)方法输出文本的,因此重写该方法并过滤掉倾斜的字符即可。
public class MyPDFTextStripper extends PDFTextStripper {
/**
* Instantiate a new PDFTextStripper object.
*
* @throws IOException If there is an error loading the properties.
*/
public DepPDFTextStripper() throws IOException {
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
Iterator<TextPosition> iterator = textPositions.iterator();
while (iterator.hasNext()) {
TextPosition position = iterator.next();
int angle = getAngle(position);
if (Math.abs(angle) > 10) {
iterator.remove();
}
}
text = textPositions.stream().filter(elm->!Objects.isNull(elm)).map(TextPosition::getUnicode).collect(Collectors.joining());
super.writeString(text, textPositions);
}
/**
* 获取字体倾斜度
*
* @param text 当前字符对象
* @return 倾斜度
*/
public static int getAngle(TextPosition text) {
Matrix m = text.getTextMatrix().clone();
m.concatenate(text.getFont().getFontMatrix());
return (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
}
}
获取pdf内容
// Load test.pdf and extract its text with DepPDFTextStripper.
// try-with-resources closes both the input stream and the document.
try (FileInputStream inputStream = new FileInputStream(new File("test.pdf"));
     PDDocument doc = PDDocument.load(inputStream)) {
    PDFTextStripper textStripper = new DepPDFTextStripper();
    int numberOfPages = doc.getNumberOfPages();
    log.info("Current pdf have {} page", numberOfPages);
    // Text content parsed from the PDF
    String psText = textStripper.getText(doc);
} catch (Exception e) {
    // Report at error level and attach the exception so the stack trace
    // reaches the log instead of being printed to stderr.
    log.error("parse PDF error {}", e.getMessage(), e);
}
标签:iterator,text,水印,new,pdf,textPositions,解析,Math
From: https://www.cnblogs.com/lyuSky/p/17774999.html