import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
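/**
 * Interactive console tool that loads the text of office documents (PDF, Excel, Word) into
 * memory and searches it for a keyword, printing each match with its surrounding context.
 *
 * @author (not specified)
 * @title: aa
 * @projectName (not specified)
 * @description: keyword search over the extracted text of PDF / Excel / Word files
 * @date 2022/10/14 19:27
 */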
public class OfficeTextSearch {
    // Status string returned by getByStartAndEndKey; findKeyContent stops searching when it sees it.
    private static final String SUCCESS = "success";
    // Status string returned by getByStartAndEndKey when the search should move on to the next file.
    private static final String CONTINUE = "continue";
    // Keyword that, when typed at the prompt in main, ends the program.
    private static final String EXIT = "exit";
public static void main(String[] args) {
String pdfFileDir = "D:\用户\luofu704\Desktop\1031";
String excelFileDir = "D:\fintchFile\allexcel";
String wordFileDir = "D:\fintchFile\word";
//默认前后50个字符
int percount = 200;
int aftercount = 200;
//查找关键字
String beginKey = null;
//读取所有文件放到map中 key为path value 为 内容
LinkedHashMap<String, String> fileMap = new LinkedHashMap<>();
//读取pdf文件
//readPDFFileMap(fileMap,pdfFileDir);
//读取excel文件
//readExcelFileMap(fileMap,excelFileDir);
//读取word文件
readWordFileMap(fileMap,wordFileDir);
while(true){
Scanner scanner = new Scanner(System.in);
if(StringUtils.isEmpty(beginKey)){
System.err.println("请输入查找的关键字(或者输入exit退出):");
beginKey = scanner.nextLine();
}
if(StringUtils.isEmpty(beginKey)){
System.err.println("请输入查找的关键字,关键字不能为空:");
beginKey = scanner.nextLine();
}
if(EXIT.equals(beginKey)){
return;
}
if(StringUtils.isNotBlank(beginKey)){
findKeyContent(percount, aftercount, beginKey, fileMap);
}
//清除关键字
beginKey = null;
System.err.println(">>>>>>>>>>>>>>>>>>>>>>>>下一题<<<<<<<<<<<<<<<<<<<<<<<<<<<<");
}
}public static void findKeyContent(int percount, int aftercount, String beginKey, LinkedHashMap<String, String> fileMap) {
Set<Map.Entry<String, String>> entries = fileMap.entrySet();
for (Map.Entry<String, String> entry : entries) {
String message = getByStartAndEndKey(entry.getValue(), beginKey, entry.getKey(),percount,aftercount);
if(SUCCESS.equals(message)){
return;
}else if(CONTINUE.equals(message)) {
continue;
}
}
}
/**
 * Shows the occurrences of {@code beginKey} in {@code content} one at a time, each with its
 * surrounding context, and asks on the console after every match whether to keep searching.
 *
 * @param content    text to search; {@code null} is treated as "no match"
 * @param beginKey   keyword to look for; {@code null} or empty is treated as "no match"
 * @param filePath   path of the file the text came from, printed in the match header
 * @param percount   number of characters of context to show before the match
 * @param aftercount number of characters to show counted from the start of the match
 * @return {@code SUCCESS} when the user answers {@code n} (or standard input is closed), meaning
 *         the whole search should stop; {@code CONTINUE} when there is no (further) match in this
 *         text or the answer is neither {@code y} nor {@code n}, meaning the caller should move on
 *         to the next file
 */
public static String getByStartAndEndKey(String content, String beginKey,String filePath,int percount,int aftercount) {
    // An empty keyword would "match" at every index and prompt forever, so reject it up front.
    if (content == null || beginKey == null || beginKey.isEmpty()) {
        return CONTINUE;
    }
    int before = Math.max(0, percount);
    int after = Math.max(0, aftercount);
    int length = content.length();
    // Created once per call rather than once per match. Deliberately not closed, because
    // closing it would also close System.in for the rest of the program.
    Scanner scanner = new Scanner(System.in);

    // indexOf returns 0 for a match at the very start, so the test must be ">= 0", not "> 0".
    int begin = content.indexOf(beginKey);
    while (begin >= 0) {
        int strBegin = Math.max(0, begin - before);
        // Written as a comparison against the remaining length so begin + after cannot overflow.
        int strEnd = after > length - begin ? length : begin + after;
        String matchContent = content.substring(strBegin, strEnd);

        System.out.println();
        System.err.println("========================= 匹配文件: " + filePath + "================================================== ");
        System.out.println(matchContent);

        System.err.println("是否继续往下查找y/n:");
        if (!scanner.hasNextLine()) {
            // Standard input was closed: nobody is left to answer, so stop the search.
            return SUCCESS;
        }
        String keyborad = scanner.nextLine().trim();
        if (keyborad.isEmpty()) {
            System.err.println("请输入y/n:");
            if (!scanner.hasNextLine()) {
                return SUCCESS;
            }
            keyborad = scanner.nextLine().trim();
        }

        if ("N".equalsIgnoreCase(keyborad)) {
            return SUCCESS;
        }
        if (!"Y".equalsIgnoreCase(keyborad)) {
            // Unrecognised answer: leave this file and continue with the next one.
            return CONTINUE;
        }
        // Search onwards in the full text (not a truncated copy) so the next match keeps
        // its complete leading context.
        begin = content.indexOf(beginKey, begin + 1);
    }
    // No (further) match in this text: continue with the next file.
    return CONTINUE;
}
/**
 * Extracts the text of every file directly inside {@code fileDir} as a PDF and stores it in
 * {@code fileMap} under the file's absolute path. Sub-directories are skipped; files that
 * cannot be parsed are reported on standard error and skipped.
 *
 * @param fileMap target map: key = absolute file path, value = extracted text
 * @param fileDir directory containing the PDF files
 * @return the same {@code fileMap} instance that was passed in
 */
private static Map<String, String> readPDFFileMap(LinkedHashMap<String, String> fileMap, String fileDir) {
    File[] files = new File(fileDir).listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a readable directory.
        System.err.println("##############目录不存在或无法读取:" + fileDir);
        return fileMap;
    }
    for (File item : files) {
        if (item.isDirectory()) {
            continue;
        }
        String filePath = item.getAbsolutePath();
        // try-with-resources closes the document (and its underlying file) even when parsing
        // fails, and never touches a null or previously closed document.
        try (PDDocument doc = PDDocument.load(item)) {
            PDFTextStripper pts = new PDFTextStripper();
            pts.setSortByPosition(true);
            // A fresh buffer per file, so one file's entry never contains another file's text.
            StringBuilder sb = new StringBuilder();
            int numberOfPages = doc.getNumberOfPages();
            // Pages are extracted one by one so each chunk can be labelled with its page number.
            for (int i = 1; i <= numberOfPages; i++) {
                pts.setStartPage(i);
                pts.setEndPage(i);
                String text = pts.getText(doc);
                sb.append("文件:").append(filePath).append(" 第").append(i).append("页:").append(text).append("\n");
            }
            fileMap.put(filePath, sb.toString());
        } catch (Exception e) {
            // Best effort: one unreadable file must not abort loading the rest.
            System.err.println("##############文件解析异常:" + item.getName() + " (" + e + ")");
        }
    }
    return fileMap;
}
/**
 * Extracts the text of every file directly inside {@code fileDir} as an OOXML Excel workbook
 * and stores it in {@code fileMap} under the file's absolute path. Sub-directories are skipped;
 * files that cannot be parsed are reported on standard error and skipped.
 *
 * @param fileMap target map: key = absolute file path, value = extracted text
 * @param fileDir directory containing the Excel files
 * @return the same {@code fileMap} instance that was passed in
 */
private static Map<String, String> readExcelFileMap(LinkedHashMap<String, String> fileMap, String fileDir) {
    File[] files = new File(fileDir).listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a readable directory.
        System.err.println("##############目录不存在或无法读取:" + fileDir);
        return fileMap;
    }
    for (File item : files) {
        if (item.isDirectory()) {
            continue;
        }
        String filePath = item.getAbsolutePath();
        // try-with-resources closes only a package that was actually opened for this file.
        // NOTE(review): openPackage presumably opens the file read-write, in which case close()
        // may write it back to disk - confirm, and consider a read-only open for a search tool.
        try (OPCPackage opcPackage = POIXMLDocument.openPackage(filePath)) {
            XSSFExcelExtractor xe = new XSSFExcelExtractor(opcPackage);
            // Emit formula text instead of cached results, and prefix each sheet with its name.
            xe.setFormulasNotResults(true);
            xe.setIncludeSheetNames(true);
            fileMap.put(filePath, xe.getText());
        } catch (Exception e) {
            // Best effort: one unreadable file must not abort loading the rest.
            System.err.println("##############文件解析异常:" + item.getName() + " (" + e + ")");
        }
    }
    return fileMap;
}
/**
 * Extracts the text of every file directly inside {@code fileDir} as an OOXML Word document
 * and stores it in {@code fileMap} under the file's absolute path. Sub-directories are skipped;
 * files that cannot be parsed are reported on standard error and skipped.
 *
 * @param fileMap target map: key = absolute file path, value = extracted text
 * @param fileDir directory containing the Word files
 * @return the same {@code fileMap} instance that was passed in
 */
private static Map<String, String> readWordFileMap(LinkedHashMap<String, String> fileMap, String fileDir) {
    File[] files = new File(fileDir).listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a readable directory.
        System.err.println("##############目录不存在或无法读取:" + fileDir);
        return fileMap;
    }
    for (File item : files) {
        if (item.isDirectory()) {
            continue;
        }
        String filePath = item.getAbsolutePath();
        // try-with-resources closes only a package that was actually opened for this file.
        // NOTE(review): openPackage presumably opens the file read-write, in which case close()
        // may write it back to disk - confirm, and consider a read-only open for a search tool.
        try (OPCPackage opcPackage = POIXMLDocument.openPackage(filePath)) {
            XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(opcPackage);
            fileMap.put(filePath, xwpfWordExtractor.getText());
        } catch (Exception e) {
            // Best effort: one unreadable file must not abort loading the rest.
            System.err.println("##############文件解析异常:" + item.getName() + " (" + e + ")");
        }
    }
    return fileMap;
}
/*
Maven dependencies used by this class (groupId : artifactId : version):
  com.itextpdf      : itextpdf     : 5.5.13.2 (jar)
  org.apache.pdfbox : pdfbox       : 2.0.9
  org.apache.pdfbox : fontbox      : 2.0.9
  org.apache.poi    : poi          : 4.1.1
  org.apache.poi    : poi-ooxml    : 4.1.1
  commons-lang      : commons-lang : 2.6
*/
}
// Tags: word, String, fileMap, beginKey, Excel, System, import, println, pdf. From: https://www.cnblogs.com/leifonlyone/p/17829050.html