pom文件引入
<repositories>
<repository>
<id>com.e-iceblue</id>
<url>http://repo.e-iceblue.cn/repository/maven-public/</url>
</repository>
</repositories>
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.presentation.free</artifactId>
<version>3.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.23</version>
</dependency>
工具类
package com.soft.mpms.zframe.common;
import com.aspose.words.Document;
import com.aspose.words.HtmlSaveOptions;
import com.aspose.words.SaveFormat;
import com.soft.mpms.basebean.mima.JwtUserObject;
import com.soft.mpms.minio.bean.FileObject;
import com.soft.mpms.minio.common.FileTypeConstant;
import com.soft.mpms.minio.service.SsoFileCommand;
import com.soft.mpms.zframe.config.FilePathConfig;
import com.soft.mpms.zframe.config.PdfConvertHtmlUtil;
import com.spire.presentation.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.Tika;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.util.ObjectUtils;
import org.springframework.web.multipart.MultipartFile;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.Base64;
import java.util.HashMap;
import java.util.UUID;
/**
 * Converts uploaded PDF, PowerPoint and Word documents to HTML for preview, uploads the
 * images produced along the way through {@link SsoFileCommand}, and extracts a short
 * plain-text summary of at most 200 characters.
 *
 * <p>All public methods are best-effort: failures are printed to stderr and surface to the
 * caller as missing map entries or empty strings rather than as exceptions.
 *
 * @author hhs
 */
public class ChangeFileToHtml {

    /** Maximum number of characters kept in an extracted summary. */
    private static final int SUMMARY_LENGTH = 200;

    /**
     * Code passed to the file service with every upload and used as the storage sub-folder.
     * NOTE(review): its business meaning is not visible from this class - confirm with the
     * owner of SsoFileCommand.
     */
    private static final String UPLOAD_CODE = "10036";

    private static final SsoFileCommand sso = new SsoFileCommand();

    /**
     * Renders a PDF as one stitched image wrapped in HTML, uploads the first page as an
     * image and extracts a text summary.
     *
     * @param multipartFile the uploaded PDF
     * @param zjId          business id; also used as the base name of the temporary files
     * @param jwtUserObject the current user, forwarded to the file service
     * @return a map with {@code "remark"} (summary, possibly empty) and, when rendering
     *         succeeded, {@code "html"}
     */
    public static HashMap<String, String> pdfSaveHtml(MultipartFile multipartFile, String zjId, JwtUserObject jwtUserObject) {
        HashMap<String, String> map = new HashMap<>();
        String context = "";
        File firstPageImage = null;
        File pdfFile = null;
        try {
            String basePath = FilePathConfig.gettempfilepath() + zjId;
            // pdfStreamToPng writes the first page to <temp>/<zjId>.jpeg as a side effect.
            firstPageImage = new File(basePath + ".jpeg");
            pdfFile = new File(basePath + ".pdf");

            BufferedImage stitched;
            try (InputStream pdfStream = multipartFile.getInputStream()) {
                stitched = PdfConvertHtmlUtil.pdfStreamToPng(pdfStream, zjId);
            }
            if (stitched != null) {
                String base64Png = PdfConvertHtmlUtil.bufferedImageToBase64(stitched);
                // Build the markup in memory; writing it to disk only to delete it again is wasted I/O.
                map.put("html", PdfConvertHtmlUtil.getHtmlString(base64Png, "pdf"));
            }

            // Tika parses from a file; copyInputStreamToFile closes the source stream itself.
            FileUtils.copyInputStreamToFile(multipartFile.getInputStream(), pdfFile);
            context = truncate(new Tika().parseToString(pdfFile).replaceAll("\\s+", ""));

            if (firstPageImage.exists()) {
                uploadImgs(firstPageImage, zjId, jwtUserObject);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Clean up even when a step above failed.
            FileUtils.deleteQuietly(pdfFile);
            FileUtils.deleteQuietly(firstPageImage);
        }
        map.put("remark", context);
        return map;
    }

    /**
     * Converts a presentation to HTML, uploads the first slide as a JPEG and extracts a
     * text summary from the auto-shapes of all slides.
     *
     * @param file          the uploaded presentation
     * @param jwtUserObject the current user, forwarded to the file service
     * @param zjId          business id the uploaded image is attached to
     * @return a map with {@code "html"} and {@code "remark"}; entries are missing for the
     *         steps that were not reached because of a failure
     */
    public static HashMap<String, String> pptToHtml(MultipartFile file, JwtUserObject jwtUserObject, String zjId) {
        HashMap<String, String> map = new HashMap<>();
        File targetFile = null;
        Presentation ppt = null;
        try {
            // Spire loads from a path, so spool the upload to a uniquely named temp file.
            String uuid = UUID.randomUUID().toString().replaceAll("-", "");
            String ext = "." + FilenameUtils.getExtension(file.getOriginalFilename());
            targetFile = new File(FilePathConfig.gettempfilepath(), uuid + ext);
            FileUtils.writeByteArrayToFile(targetFile, file.getBytes());

            ppt = new Presentation();
            ppt.loadFromFile(targetFile.getPath());

            ByteArrayOutputStream htmlBytes = new ByteArrayOutputStream();
            ppt.saveToFile(htmlBytes, FileFormat.HTML);
            // Let jsoup detect the charset (BOM / meta tag, UTF-8 fallback) instead of
            // decoding with the platform default, which garbles non-ASCII text.
            org.jsoup.nodes.Document htmlDoc =
                    Jsoup.parse(new ByteArrayInputStream(htmlBytes.toByteArray()), null, "");
            map.put("html", htmlDoc.toString());

            uploadFirstSlide(ppt, uuid + ".jpeg", zjId, jwtUserObject);

            StringBuilder buffer = new StringBuilder();
            for (Object slide : ppt.getSlides()) {
                for (Object shape : ((ISlide) slide).getShapes()) {
                    if (shape instanceof IAutoShape && ((IAutoShape) shape).getTextFrame() != null) {
                        for (Object tp : ((IAutoShape) shape).getTextFrame().getParagraphs()) {
                            buffer.append(((ParagraphEx) tp).getText());
                        }
                    }
                }
            }
            map.put("remark", truncate(buffer.toString()));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (ppt != null) {
                ppt.dispose();
            }
            FileUtils.deleteQuietly(targetFile);
        }
        return map;
    }

    /**
     * Converts a Word document to HTML and replaces every embedded (base64) image with the
     * path returned by the file service after uploading it.
     *
     * @param zjId          business id the uploaded images are attached to
     * @param jwtUserObject the current user, forwarded to the file service
     * @param doc           the Aspose document to convert
     * @return the resulting HTML, or an empty string when the conversion itself failed
     */
    public static String docSaveHtml(String zjId, JwtUserObject jwtUserObject, Document doc) {
        try {
            HtmlSaveOptions saveOptions = new HtmlSaveOptions(SaveFormat.HTML);
            saveOptions.setExportTextInputFormFieldAsText(false);
            saveOptions.setExportImagesAsBase64(true);
            saveOptions.setPrettyFormat(true);
            saveOptions.setUseHighQualityRendering(true);

            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
            doc.save(outputStream, saveOptions);
            // Charset is detected by jsoup rather than assumed to be the platform default.
            org.jsoup.nodes.Document htmlDoc =
                    Jsoup.parse(new ByteArrayInputStream(outputStream.toByteArray()), null, "");

            for (Element element : htmlDoc.getElementsByTag("img")) {
                replaceEmbeddedImage(element, zjId, jwtUserObject);
            }
            return htmlDoc.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Uploads one embedded image and points the element at the uploaded copy. A failure
     * only affects this element: its original {@code src} is kept.
     */
    private static void replaceEmbeddedImage(Element element, String zjId, JwtUserObject jwtUserObject) {
        String oldSrc = element.attr("src");
        if (!oldSrc.contains("data:image")) {
            // Linked (non-embedded) image: nothing to decode or upload.
            return;
        }
        try {
            File file = base64ToFile(oldSrc);
            String newSrc = uploadImgs(file, zjId, jwtUserObject);
            if (newSrc != null && !newSrc.isEmpty()) {
                element.attr("src", newSrc);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Renders the first slide (if any) to JPEG in memory and uploads it under the given name. */
    private static void uploadFirstSlide(Presentation ppt, String imageName, String zjId, JwtUserObject jwtUserObject) throws Exception {
        if (ppt.getSlides().getCount() == 0) {
            return;
        }
        BufferedImage image = ppt.getSlides().get(0).saveAsImage();
        ByteArrayOutputStream jpeg = new ByteArrayOutputStream();
        // ImageIO.write returns false when no JPEG writer accepts the image.
        if (ImageIO.write(image, "jpeg", jpeg) && jpeg.size() > 0) {
            upload(new MockMultipartFile(imageName, imageName, "", jpeg.toByteArray()), zjId, jwtUserObject);
        }
    }

    /**
     * Decodes a base64 image, with or without a {@code data:image/...;base64,} prefix, into
     * a new file in the configured temp directory.
     *
     * @param base64 the data URI or raw base64 payload
     * @return the file holding the decoded bytes; the caller is responsible for deleting it
     * @throws Exception if the payload is not valid base64 or the file cannot be written
     */
    private static File base64ToFile(String base64) throws Exception {
        String payload = base64;
        String extension = ".jpeg";
        if (payload.contains("data:image")) {
            int comma = payload.indexOf(',');
            extension = extensionFor(comma >= 0 ? payload.substring(0, comma) : "");
            payload = payload.substring(comma + 1);
        }
        // Decode before touching the disk so that a bad payload leaves no empty file behind.
        byte[] bytes = Base64.getDecoder().decode(payload.replaceAll("\\s+", ""));

        String uid = UUID.randomUUID().toString().replaceAll("-", "");
        File file = File.createTempFile(uid, extension, new File(FilePathConfig.gettempfilepath()));
        try {
            FileUtils.writeByteArrayToFile(file, bytes);
        } catch (IOException e) {
            FileUtils.deleteQuietly(file);
            throw e;
        }
        return file;
    }

    /** Picks a file extension from a data-URI header; anything unrecognised stays {@code .jpeg}. */
    private static String extensionFor(String dataUriHeader) {
        String lower = dataUriHeader.toLowerCase(java.util.Locale.ROOT);
        for (String type : new String[] {"png", "gif", "bmp", "webp"}) {
            if (lower.contains("image/" + type)) {
                return "." + type;
            }
        }
        return ".jpeg";
    }

    /**
     * Uploads a local file through the file service and deletes the local copy afterwards,
     * whether or not the upload succeeded.
     *
     * @return the path reported by the file service, or an empty string when the upload failed
     */
    private static String uploadImgs(File file, String zjId, JwtUserObject jwtUserObject) {
        String imgPath = "";
        try (InputStream in = new FileInputStream(file)) {
            MultipartFile multipartFile = new MockMultipartFile(file.getName(), file.getName(), "", in);
            FileObject fileObject = upload(multipartFile, zjId, jwtUserObject);
            imgPath = sso.queryFileTempFilePathFileByid(fileObject.getFileid());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The stream is already closed here, so the delete also works on Windows.
            FileUtils.deleteQuietly(file);
        }
        return imgPath;
    }

    /** Single place for the file-service upload call shared by all converters. */
    private static FileObject upload(MultipartFile multipartFile, String zjId, JwtUserObject jwtUserObject) throws Exception {
        return sso.uploadFileCommond(multipartFile, jwtUserObject.getUSERID(), jwtUserObject.getUSERNAME(),
                jwtUserObject.getITEMCODE(), jwtUserObject.getORGCODE(), UPLOAD_CODE, jwtUserObject.getUSERIDENTITY(),
                zjId, jwtUserObject.getITEMCODE().toLowerCase(), FileTypeConstant.COMP_10 + "/" + UPLOAD_CODE + "/", "");
    }

    /** Cuts text down to {@link #SUMMARY_LENGTH} characters. */
    private static String truncate(String text) {
        return text.length() > SUMMARY_LENGTH ? text.substring(0, SUMMARY_LENGTH) : text;
    }
}
pdf处理
package com.soft.mpms.zframe.config;
import cn.hutool.core.lang.UUID;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import sun.misc.BASE64Encoder;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.awt.image.ColorModel;
import java.awt.image.WritableRaster;
import java.io.*;
/**
* PDF文档转HTML文档工具类
*/
@Slf4j
public class PdfConvertHtmlUtil {
/**
* PDF文档流转Png
*
* @param input
* @return BufferedImage
*/
public static BufferedImage pdfStreamToPng(byte[] input,String uid) {
PDDocument doc = null;
PDFRenderer renderer = null;
try {
doc = PDDocument.load(input);
renderer = new PDFRenderer(doc);
int pageCount = doc.getNumberOfPages();
BufferedImage image = null;
for (int i = 0; i < pageCount; i++) {
if (image != null) {
image = combineBufferedImages(image, renderer.renderImage(i));
}
if (i == 0) {
// 设置图片的分辨率
image = renderer.renderImage(i); // Windows native DPI
// 如果是PNG图片想要背景透明的话使用下面这个
// BufferedImage image = render.renderImageWithDPI(i, 296, ImageType.ARGB);
}
}
// BufferedImage srcImage = resize(image, 240, 240);//产生缩略图
return combineBufferedImages(image);
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (doc != null) {
doc.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
/**
* PDF文档流转Png
*
* @param pdfFileInputStream
* @return BufferedImage
*/
public static BufferedImage pdfStreamToPng(InputStream pdfFileInputStream,String uid) {
PDDocument doc = null;
PDFRenderer renderer = null;
try {
doc = PDDocument.load(pdfFileInputStream);
renderer = new PDFRenderer(doc);
int pageCount = doc.getNumberOfPages();
BufferedImage image = null;
for (int i = 0; i < pageCount; i++) {
if (image != null) {
image = combineBufferedImages(image, renderer.renderImageWithDPI(i, 144));
}
if (i == 0) {
image = renderer.renderImageWithDPI(i, 144); // Windows native DPI
//保存第一张
String img = FilePathConfig.gettempfilepath()+uid+ ".jpeg";
ImageIO.write(image, "jpeg", new File(img));
}
// BufferedImage srcImage = resize(image, 240, 240);//产生缩略图
}
return combineBufferedImages(image);
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (doc != null) {
doc.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
/**
* 压缩图片
*
* @param source
* @param targetW
* @param targetH
* @return
*/
private static BufferedImage resize(BufferedImage source, int targetW, int targetH) {
int type = source.getType();
BufferedImage target = null;
double sx = (double) targetW / source.getWidth();
double sy = (double) targetH / source.getHeight();
if (sx > sy) {
sx = sy;
targetW = (int) (sx * source.getWidth());
} else {
sy = sx;
targetH = (int) (sy * source.getHeight());
}
if (type == BufferedImage.TYPE_CUSTOM) {
ColorModel cm = source.getColorModel();
WritableRaster raster = cm.createCompatibleWritableRaster(targetW, targetH);
boolean alphaPremultiplied = cm.isAlphaPremultiplied();
target = new BufferedImage(cm, raster, alphaPremultiplied, null);
} else {
target = new BufferedImage(targetW, targetH, type);
}
Graphics2D g = target.createGraphics();
g.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY);
g.drawRenderedImage(source, AffineTransform.getScaleInstance(sx, sy));
g.dispose();
return target;
}
/**
* BufferedImage拼接处理,添加分割线
*
* @param images
* @return BufferedImage
*/
public static BufferedImage combineBufferedImages(BufferedImage... images) {
int height = 0;
int width = 0;
for (BufferedImage image : images) {
//height += Math.max(height, image.getHeight());
height += image.getHeight();
width = image.getWidth();
}
BufferedImage combo = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);
Graphics2D g2 = combo.createGraphics();
int x = 0;
int y = 0;
for (BufferedImage image : images) {
//int y = (height - image.getHeight()) / 2;
g2.setStroke(new BasicStroke(2.0f));// 线条粗细
g2.setColor(new Color(193, 193, 193));// 线条颜色
g2.drawLine(x, y, width, y);// 线条起点及终点位置
g2.drawImage(image, x, y, null);
//x += image.getWidth();
y += image.getHeight();
}
return combo;
}
/**
* 通过Base64创建HTML文件并输出html文件
*
* @param base64
* @param htmlPath html保存路径
* @param title html标题
*/
public static String createHtmlByBase64(String base64, String htmlPath, String title) {
PrintStream printStream = null;
try {
// 打开文件
printStream = new PrintStream(new FileOutputStream(htmlPath));
} catch (FileNotFoundException e) {
log.error("create file error!", e);
return "";
}
try {
// 将HTML文件内容写入文件中
String htmlString = getHtmlString(base64, title);
printStream.println(htmlString);
return htmlString;
} catch (Exception e) {
log.error("createHtmlByBase64 error!", e);
} finally {
printStream.close();
}
return "";
}
/**
* 通过Base64创建HTML文件并输出html文件
*
* @param base64
*/
public static String getHtmlString(String base64, String title) {
StringBuilder stringHtml = new StringBuilder();
// 输入HTML文件内容
stringHtml.append("<html><head>");
stringHtml.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">");
stringHtml.append("<title>").append(title).append("</title>");
stringHtml.append("</head>");
stringHtml.append("<body style=\"" + "text-align: center;\">");
stringHtml.append("<img style=\""+"width: 100%;\""+ "src=\"data:image/png;base64,").append(base64).append("\"/>");
stringHtml.append("</body></html>");
return stringHtml.toString();
}
/**
* bufferedImage 转为 base64编码
*
* @param bufferedImage
* @return
*/
public static String bufferedImageToBase64(BufferedImage bufferedImage) {
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
String png_base64 = "";
try {
ImageIO.write(bufferedImage, "png", byteArrayOutputStream);// 写入流中
byte[] bytes = byteArrayOutputStream.toByteArray();// 转换成字节
BASE64Encoder encoder = new BASE64Encoder();
// 转换成base64串 删除 \r\n
png_base64 = encoder.encodeBuffer(bytes).trim()
.replaceAll("\n", "")
.replaceAll("\r", "");
} catch (IOException e) {
e.printStackTrace();
}
return png_base64;
}
// 测试
public static void main(String[] args) {
File file = new File("E:\\aaa.pdf");
String htmlPath = "E:\\aaaa.html";
InputStream inputStream = null;
BufferedImage bufferedImage = null;
try {
inputStream = new FileInputStream(file);
bufferedImage = pdfStreamToPng(inputStream,"");
String base64_png = bufferedImageToBase64(bufferedImage);
createHtmlByBase64(base64_png, htmlPath, "授权书");
} catch (FileNotFoundException e) {
e.printStackTrace();
} finally {
try {
if (inputStream != null) {
inputStream.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
html转换
package com.soft.mpms.zframe.common;
import org.apache.commons.lang.StringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts the plain text and the image addresses from rich-text editor (HTML) content.
 *
 * @author hhs
 */
public class HtmlToText {

    /** Maximum number of characters returned by {@link #getText(String)}. */
    private static final int SUMMARY_LENGTH = 200;

    /** Comments and script/style elements, including their content. */
    private static final Pattern NON_TEXT_BLOCKS = Pattern.compile(
            "<!--.*?-->|<(script|style)\\b[^>]*>.*?</\\1\\s*>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Any tag; the character class also matches line breaks inside a tag. */
    private static final Pattern TAG = Pattern.compile("<[^>]+>");

    /** Non-breaking-space entities, which rich-text editors insert for spacing. */
    private static final Pattern NBSP_ENTITY =
            Pattern.compile("&(?:nbsp|#160|#xa0);", Pattern.CASE_INSENSITIVE);

    /** ASCII whitespace plus the no-break (U+00A0) and ideographic (U+3000) spaces. */
    private static final Pattern WHITESPACE = Pattern.compile("[\\s\\u00A0\\u3000]+");

    /**
     * The {@code src} attribute of an {@code <img>} tag: double-quoted (group 1),
     * single-quoted (group 2) or unquoted (group 3). The whitespace required before
     * {@code src} keeps attributes such as {@code data-src} from matching.
     */
    private static final Pattern IMG_SRC = Pattern.compile(
            "<img\\b[^>]*?\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
            Pattern.CASE_INSENSITIVE);

    /**
     * Reduces rich text to a compact plain-text summary: comments, script/style blocks and
     * tags are dropped, all whitespace is removed and the result is cut to 200 characters.
     *
     * @param richText HTML content; may be {@code null}
     * @return the summary, never {@code null}
     */
    public static String getText(String richText) {
        if (richText == null || richText.isEmpty()) {
            return "";
        }
        String text = NON_TEXT_BLOCKS.matcher(richText).replaceAll("");
        text = TAG.matcher(text).replaceAll("");
        text = NBSP_ENTITY.matcher(text).replaceAll("");
        text = WHITESPACE.matcher(text).replaceAll("");
        return truncate(text, SUMMARY_LENGTH);
    }

    /**
     * Collects the {@code src} value of every {@code <img>} tag, in document order.
     *
     * @param richText HTML content; may be {@code null}
     * @return the image addresses without surrounding quotes; empty if there are none
     */
    public static List<String> getImgStr(String richText) {
        List<String> sources = new ArrayList<>();
        if (richText == null || richText.isEmpty()) {
            return sources;
        }
        Matcher matcher = IMG_SRC.matcher(richText);
        while (matcher.find()) {
            // Exactly one of the three alternatives took part in the match.
            for (int group = 1; group <= 3; group++) {
                if (matcher.group(group) != null) {
                    sources.add(matcher.group(group));
                    break;
                }
            }
        }
        return sources;
    }

    /** Cuts text to at most {@code max} chars without splitting a surrogate pair. */
    private static String truncate(String text, int max) {
        if (text.length() <= max) {
            return text;
        }
        int end = Character.isHighSurrogate(text.charAt(max - 1)) ? max - 1 : max;
        return text.substring(0, end);
    }
}