目录
word转html
1. maven依赖(示例代码还用到了 hutool 的 ImgUtil 和 slf4j 日志,需另行引入对应依赖)
<!-- Word-to-HTML conversion -->
<!-- Library for .docx (Word 2007 and later) -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- Library for .doc (Word 2003) -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- XWPF-to-XHTML converter used by the .docx code path -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>2.0.2</version>
</dependency>
2. 实例
package com.baidu.cms.utils;
import cn.hutool.core.img.ImgUtil;
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.awt.image.BufferedImage;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Locale;
/**
 * Converts Word documents to standalone HTML files.
 *
 * <p>{@code .doc} files go through POI's {@link WordToHtmlConverter} and
 * {@code .docx} files through the xdocreport {@link XHTMLConverter}. In both
 * code paths pictures are embedded into the page as base64 data, so each
 * result is a single file.
 *
 * <p>None of the public methods throw: failures are logged and reported
 * through the boolean return value.
 */
public class WordUtils {

    private static final Logger log = LoggerFactory.getLogger(WordUtils.class);

    /** Closing tag in front of which the charset declaration is inserted. */
    private static final String HEAD_END = "</head>";

    /** Charset declaration added to the generated page to avoid garbled Chinese text. */
    private static final String CHARSET_META =
            "<meta http-equiv='Content-Type' content='text/html;charset=utf-8'/>";

    /**
     * Converts a Word file to HTML, choosing the converter from the file extension.
     *
     * <p>After a successful conversion the output is patched by
     * {@link #editHtml(String)}. The outcome of that patch is only logged; it
     * does not influence the return value.
     *
     * @param sourcePath path of the source {@code .doc} / {@code .docx} file
     * @param outPath    path of the HTML file to create
     * @return {@code true} if the document was converted, {@code false} if the
     *         source is missing, has an unsupported extension, or conversion failed
     */
    public static boolean word2Html(String sourcePath, String outPath) {
        boolean flag = false;
        try {
            File file = new File(sourcePath);
            if (!file.exists()) {
                return false;
            }
            String fName = file.getName();
            // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
            String suffix = fName.substring(fName.lastIndexOf('.') + 1).toLowerCase(Locale.ROOT);
            if ("doc".equals(suffix)) {
                flag = docToHtml(sourcePath, outPath);
            } else if ("docx".equals(suffix)) {
                flag = docxToHtml(sourcePath, outPath);
            } else {
                log.warn("word2html({}->{}):unsupported file type '{}'", sourcePath, outPath, suffix);
                return false;
            }
            // Patch the page only when this call actually produced it; otherwise
            // outPath is either missing or a stale file from an earlier run.
            boolean editFlag = flag && editHtml(outPath);
            log.info("word2html({}->{}):parser({});edit({})", sourcePath, outPath, flag, editFlag);
        } catch (Exception e) {
            // Trailing throwable makes SLF4J log the stack trace as well.
            log.error("word2htmlError({}->{}):{}", sourcePath, outPath, String.valueOf(e), e);
        }
        return flag;
    }

    /**
     * Converts a Word 2003 ({@code .doc}) file to an HTML file.
     *
     * @param wordPath path of the Word file
     * @param htmlPath path of the HTML file to create; missing parent directories are created
     * @return {@code true} on success, {@code false} if anything went wrong (the error is logged)
     */
    public static boolean docToHtml(String wordPath, String htmlPath) {
        try {
            File htmlFile = new File(htmlPath);

            // Parse the Word document into an in-memory HTML DOM.
            Document htmlDocument;
            try (InputStream input = new FileInputStream(wordPath);
                 HWPFDocument wordDocument = new HWPFDocument(input)) {
                WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                        DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
                // Pictures are inlined as base64 so the page stays a single file.
                wordToHtmlConverter.setPicturesManager(
                        (content, pictureType, suggestedName, widthInches, heightInches) -> {
                            BufferedImage bufferedImage = ImgUtil.toImage(content);
                            String base64Img = ImgUtil.toBase64(bufferedImage, pictureType.getExtension());
                            return "data:;base64," + base64Img;
                        });
                wordToHtmlConverter.processDocument(wordDocument);
                htmlDocument = wordToHtmlConverter.getDocument();
            }

            ensureParentDirectory(htmlFile);

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            try (OutputStream outStream = new FileOutputStream(htmlFile)) {
                serializer.transform(new DOMSource(htmlDocument), new StreamResult(outStream));
            }
            return true;
        } catch (Exception e) {
            log.error("Doc解析异常({}->{}):{}", wordPath, htmlPath, String.valueOf(e), e);
            return false;
        }
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) file to an HTML file.
     *
     * @param wordPath path of the Word file
     * @param htmlPath path of the HTML file to create; missing parent directories are created
     * @return {@code true} on success, {@code false} if anything went wrong (the error is logged)
     */
    public static boolean docxToHtml(String wordPath, String htmlPath) {
        try {
            // NOTE(review): this is a JVM-wide setting that switches off POI's
            // zip-bomb detection for every subsequent OOXML read. Kept because
            // existing documents may rely on it, but it should not be used on
            // untrusted uploads.
            ZipSecureFile.setMinInflateRatio(-1.0d);

            File htmlFile = new File(htmlPath);
            ensureParentDirectory(htmlFile);

            // Resources open in declaration order, so the output file is only
            // created once the Word document has been loaded successfully.
            try (InputStream in = new FileInputStream(wordPath);
                 XWPFDocument document = new XWPFDocument(in);
                 OutputStream out = new FileOutputStream(htmlFile)) {
                // Pictures are inlined as base64 so the page stays a single file.
                XHTMLOptions options = XHTMLOptions.create()
                        .indent(4)
                        .setImageManager(new Base64EmbedImgManager());
                XHTMLConverter.getInstance().convert(document, out, options);
            }
            return true;
        } catch (Exception e) {
            log.error("Docx解析异常({}->{}):{}", wordPath, htmlPath, String.valueOf(e), e);
            return false;
        }
    }

    /**
     * Inserts a UTF-8 charset {@code <meta>} tag right before {@code </head>} in
     * an existing HTML file, to avoid occasionally garbled Chinese content.
     *
     * <p>Apart from the inserted tag the file is left byte-for-byte unchanged,
     * including its line breaks. A file that already contains the tag is not
     * modified again.
     *
     * @param htmlPath path of the HTML file to patch in place
     * @return {@code true} if the tag is present after the call, {@code false}
     *         if the file has no {@code </head>} or could not be read or written
     */
    public static boolean editHtml(String htmlPath) {
        try {
            Path htmlFile = Paths.get(htmlPath);
            // ISO-8859-1 maps every byte to exactly one char and back, so this
            // round trip cannot damage multi-byte text whatever encoding the
            // converter used. The marker and the inserted tag are pure ASCII.
            String content = new String(Files.readAllBytes(htmlFile), StandardCharsets.ISO_8859_1);
            if (content.contains(CHARSET_META)) {
                return true;
            }
            int headEnd = content.indexOf(HEAD_END);
            if (headEnd < 0) {
                log.warn("editHtml({}):no {} tag found, file left unchanged", htmlPath, HEAD_END);
                return false;
            }
            String patched = content.substring(0, headEnd) + CHARSET_META + content.substring(headEnd);
            Files.write(htmlFile, patched.getBytes(StandardCharsets.ISO_8859_1));
            return true;
        } catch (Exception e) {
            log.error("editHtml({}) failed", htmlPath, e);
            return false;
        }
    }

    /**
     * Creates the parent directory of {@code target} when it does not exist yet.
     * A target given as a bare file name resolves against the working directory.
     */
    private static void ensureParentDirectory(File target) throws IOException {
        File parent = target.getAbsoluteFile().getParentFile();
        // The second isDirectory() covers a concurrent creation between check and mkdirs().
        if (parent != null && !parent.isDirectory() && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }
    }

    public static void main(String[] args) {
        // A file must be converted to .docx by real office software. Merely
        // renaming the extension does not change the format and the conversion fails.
        word2Html("G:\\test\\download\\test.docx", "G:\\test\\download\\1.html");
        word2Html("G:\\test\\download\\test.doc", "G:\\test\\download\\2.html");
    }
}
标签:word, String, import, flag, html, poi, new
来源:https://www.cnblogs.com/fsh19991001/p/17815861.html