简介
-
在现代办公环境中,Word文档和HTML页面都是常见的信息表达方式。有时,我们需要将Word文档转换为HTML格式,以便在网页上展示或进行进一步的处理。本文将介绍如何使用Apache POI库和Jsoup库来实现Word文档到HTML的转换,并处理文档中的图片资源。
-
环境准备
-
Java开发环境
-
Apache POI库
-
Jsoup库
-
Hutool工具库(用于简化文件和字符串操作)
1. 步骤1:创建项目和依赖
- 首先,我们需要创建一个Java项目,并添加Apache POI和Jsoup库的依赖。可以通过Maven或Gradle来管理这些依赖。
<poi.version>4.1.2</poi.version>
<!--注意版本保持一致 poi poi-ooxml poi-scratchpad-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
</dependency>
<!-- 操作doc ppt xls -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>${poi.version}</version>
</dependency>
<!-- 操作docx pptx xlsx -->
<!--word S-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${poi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>2.0.2</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
<version>2.0.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
步骤2:读取Word文档
使用Apache POI库读取Word文档。对于.docx文件,使用XWPFDocument类;对于.doc文件,使用HWPFDocument类。
docx:
/**
 * Converts an uploaded .docx file into an HTML string.
 *
 * <p>Heading paragraphs are tagged by {@code setParagraphs} before the conversion and are
 * rewritten as {@code <h2>} elements by {@code setDocXStyle} afterwards.
 *
 * @param file uploaded Word file; its original name must end with ".docx" or ".DOCX"
 * @return the generated HTML, or {@code null} when the file is missing, empty or not a .docx
 * @throws IOException if the document cannot be read or converted
 */
public static String Word2007ToHtml(MultipartFile file) throws IOException {
    if (file == null || file.isEmpty() || file.getSize() <= 0) {
        log.error("Sorry File does not Exists!");
        return null;
    }
    // The original file name is not guaranteed to be present; treat a missing name like an
    // unsupported type instead of failing with a NullPointerException.
    String fileName = file.getOriginalFilename();
    if (fileName == null || !(fileName.endsWith(".docx") || fileName.endsWith(".DOCX"))) {
        log.error("Enter only MS Office 2007+ files");
        return null;
    }
    // try-with-resources releases both the upload stream and the document, even on failure.
    try (InputStream in = file.getInputStream();
         XWPFDocument document = new XWPFDocument(in)) {
        // Tag heading paragraphs so they can be recognised in the generated HTML.
        setParagraphs(document);
        XHTMLOptions xhtmlOptions = XHTMLOptions.create();
        // NOTE(review): the original comment claimed this inlines styles; setFragment presumably
        // restricts the output to an HTML fragment - confirm against the XHTMLOptions docs.
        xhtmlOptions.setFragment(true);
        xhtmlOptions.setIgnoreStylesIfUnused(false);
        // Collect the converter output in memory.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(document, baos, xhtmlOptions);
        // Decoded with the platform default charset, exactly as before.
        // NOTE(review): confirm which charset XHTMLConverter writes before changing this.
        String content = baos.toString();
        return setDocXStyle(content).html();
    }
}
doc读取代码
/**
 * Converts an uploaded .doc file into an HTML string.
 *
 * <p>Embedded pictures are handed to {@code getImgUrl} and the returned URL is used as the
 * picture link. Paragraphs detected as headings are prefixed with a random marker before the
 * conversion; after the conversion every {@code <p>} containing a marker is replaced by an
 * {@code <h2>} without the marker.
 *
 * @param file uploaded Word file in the binary .doc format
 * @return the generated HTML
 * @throws IOException declared for compatibility; failures inside the conversion are logged and
 *                     rethrown as {@code MyException}
 */
public static String Word2003ToHtmlAndSaveImage(MultipartFile file) throws IOException {
    // try-with-resources closes the upload stream and the document, which were leaked before.
    try (InputStream in = file.getInputStream();
         HWPFDocument wordDocument = new HWPFDocument(in)) {
        // Prefix heading paragraphs with a marker so they can be found again in the HTML.
        Range range = wordDocument.getRange();
        Set<String> markers = new HashSet<>();
        for (int i = 0; i < range.numParagraphs(); i++) {
            Paragraph paragraph = range.getParagraph(i);
            if (StrUtil.isEmpty(paragraph.text().trim())) {
                continue;
            }
            int lvl = paragraph.getLvl();
            // NOTE(review): levels 0, 1, 2 and 4 are treated as headings while 3 is skipped -
            // confirm that this is intentional.
            if (lvl == 0 || lvl == 1 || lvl == 2 || lvl == 4) {
                String marker = HEADLINE_DJ_ + RandomUtil.randomNumbers(8) + "_";
                paragraph.insertBefore(marker);
                markers.add(marker);
            }
        }

        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
        // Upload every embedded picture and use the returned URL as its link.
        wordToHtmlConverter.setPicturesManager(
                (content, pictureType, suggestedName, widthInches, heightInches)
                        -> getImgUrl(suggestedName, content));
        wordToHtmlConverter.processDocument(wordDocument);

        // Serialize straight into the byte buffer: the former BufferedOutputStream wrapper added
        // nothing and made the result depend on the transformer flushing it before the read.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(wordToHtmlConverter.getDocument()), new StreamResult(baos));
        // Decode with the same charset the serializer was told to write; the platform default
        // corrupts non-ASCII text on systems whose default is not UTF-8.
        String content = baos.toString("UTF-8");

        org.jsoup.nodes.Document doc = Jsoup.parse(content);
        // Turn every marked paragraph into an <h2> and strip the marker text.
        for (String marker : markers) {
            String cleanMarker = getReplaceAllNR(marker);
            for (Element el : doc.select("p")) {
                if (el.text().contains(cleanMarker)) {
                    Element h2Element = new Element("h2");
                    // Literal replace: the marker is plain text, not a regular expression.
                    h2Element.text(el.text().replace(cleanMarker, ""));
                    h2Element.attr("style", "font-weight: bold;font-size: 22px;");
                    el.replaceWith(h2Element);
                }
            }
        }
        // Apply the span font size once for the whole document. It used to run inside the marker
        // loop, so it was repeated per heading and skipped when the document had no heading.
        doc.select("span").forEach(fs -> fs.attr("style", "font-size: 19px; "));
        return doc.html();
    } catch (Exception e) {
        log.error("docToHtmlText 异常", e);
        throw new MyException("docToHtmlText 异常!", e.getMessage());
    }
}
步骤3:转换文档内容为HTML
对于.docx文件,使用XWPF库提供的XHTMLConverter类将文档内容转换为HTML。对于.doc文件,使用WordToHtmlConverter类进行转换。
步骤4:处理文档中的图片
在转换过程中,需要特别处理文档中的图片。对于.docx文件,可以通过遍历段落及其中的文本片段(XWPFRun)来获取图片数据,并使用MinioUtil(或其他存储服务)保存图片并获取图片的URL。对于.doc文件,可以通过PicturesManager接口来处理图片的保存和链接更新。
步骤5:样式调整和优化
使用Jsoup库对生成的HTML内容进行样式调整和优化。例如,可以通过CSS选择器来定位并修改特定的元素样式。
处理html,添加h2标签到富文本,显示目录用。
/**
 * Walks the paragraphs of the document, tags heading paragraphs and normalises font sizes.
 *
 * <p>Paragraphs without a style id are left untouched. A paragraph whose style id is "1", "2",
 * "3" or "4" (per the original comment, the style id denotes the heading level) gets a new
 * style id starting with {@code HEADLINE_DJ_}; its runs are set to size 60 and bold. Runs of
 * every other styled paragraph are set to size 40.
 *
 * @param document the document to modify in place
 */
private static void setParagraphs(XWPFDocument document) {
    for (XWPFParagraph para : document.getParagraphs()) {
        String styleId = para.getStyle();
        if (StrUtil.isEmpty(styleId)) {
            continue;
        }
        boolean heading;
        switch (styleId) {
            case "1":
            case "2":
            case "3":
            case "4":
                heading = true;
                break;
            default:
                heading = false;
                break;
        }
        if (heading) {
            // The new style id is the marker prefix, five random digits and the current time.
            long timestamp = System.currentTimeMillis();
            para.setStyle(HEADLINE_DJ_ + RandomUtil.randomNumbers(5) + "_" + timestamp);
        }
        int fontSize = heading ? 60 : 40;
        for (XWPFRun textRun : para.getRuns()) {
            textRun.setFontSize(fontSize);
            if (heading) {
                textRun.setBold(true);
            }
        }
    }
}
步骤6:完整代码示例
点击查看代码
package com.csot.kms.common.poi;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.IdUtil;
import cn.hutool.core.util.RandomUtil;
import cn.hutool.core.util.StrUtil;
import com.csot.kms.common.utill.MinioUtil;
import com.csot.kms.common.valid.MyException;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.*;
@Slf4j
public class WordToHtmlPoiUtil {
public static final String HEADLINE_DJ_ = "HEADLINE_DJ_";
/**
 * Converts an uploaded Word file (.doc or .docx) into a single-line HTML string.
 *
 * <p>The file type is chosen from the original file name. After the conversion the picture
 * links are replaced with the URLs returned by {@code getImageMaps}, and all line breaks are
 * removed from the result.
 *
 * @param file uploaded Word file
 * @return the HTML, or an empty string when the file name is missing or has neither a
 *         .doc/.DOC nor a .docx/.DOCX suffix
 * @throws IllegalArgumentException if the .docx converter rejects the file (it returns
 *                                  {@code null} for an empty upload)
 * @throws Exception if reading or converting the document fails
 */
public static String docImport(MultipartFile file) throws Exception {
    // A missing name cannot be classified; handle it like any other unsupported type.
    String fileName = file.getOriginalFilename();
    if (fileName == null) {
        return "";
    }
    boolean isDocx = fileName.endsWith(".docx") || fileName.endsWith(".DOCX");
    // "wordType" in replaceImgToLocal: true means the legacy .doc format.
    boolean isDoc = !isDocx && (fileName.endsWith(".doc") || fileName.endsWith(".DOC"));
    if (!isDocx && !isDoc) {
        return "";
    }

    String html = isDocx ? Word2007ToHtml(file) : Word2003ToHtmlAndSaveImage(file);
    if (html == null) {
        // Fail here with a clear message instead of failing later while re-reading the file.
        throw new IllegalArgumentException("Uploaded Word file is empty: " + fileName);
    }

    // Temporary picture folder for the .doc flow. Built with File so that a separator is always
    // placed between the temp directory and the random folder name.
    String docsTempImages =
            new File(System.getProperty("java.io.tmpdir"), IdUtil.simpleUUID()).getPath() + File.separator;
    // Picture name -> stored URL.
    Map<String, String> imageMaps = getImageMaps(docsTempImages, file);
    // NOTE(review): replaceImgToLocal is also where .doc styles are inlined, so that step is
    // skipped whenever the picture map is empty - confirm this is intended.
    if (!imageMaps.isEmpty()) {
        html = replaceImgToLocal(html, imageMaps, isDoc);
    }
    return getReplaceAllNR(html);
}
/**
 * Converts an uploaded .docx file into an HTML string.
 *
 * <p>Heading paragraphs are tagged by {@code setParagraphs} before the conversion and are
 * rewritten as {@code <h2>} elements by {@code setDocXStyle} afterwards.
 *
 * @param file uploaded Word file; its original name must end with ".docx" or ".DOCX"
 * @return the generated HTML, or {@code null} when the file is missing, empty or not a .docx
 * @throws IOException if the document cannot be read or converted
 */
public static String Word2007ToHtml(MultipartFile file) throws IOException {
    if (file == null || file.isEmpty() || file.getSize() <= 0) {
        log.error("Sorry File does not Exists!");
        return null;
    }
    // The original file name is not guaranteed to be present; treat a missing name like an
    // unsupported type instead of failing with a NullPointerException.
    String fileName = file.getOriginalFilename();
    if (fileName == null || !(fileName.endsWith(".docx") || fileName.endsWith(".DOCX"))) {
        log.error("Enter only MS Office 2007+ files");
        return null;
    }
    // try-with-resources releases both the upload stream and the document, even on failure.
    try (InputStream in = file.getInputStream();
         XWPFDocument document = new XWPFDocument(in)) {
        // Tag heading paragraphs so they can be recognised in the generated HTML.
        setParagraphs(document);
        XHTMLOptions xhtmlOptions = XHTMLOptions.create();
        // NOTE(review): the original comment claimed this inlines styles; setFragment presumably
        // restricts the output to an HTML fragment - confirm against the XHTMLOptions docs.
        xhtmlOptions.setFragment(true);
        xhtmlOptions.setIgnoreStylesIfUnused(false);
        // Collect the converter output in memory.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(document, baos, xhtmlOptions);
        // Decoded with the platform default charset, exactly as before.
        // NOTE(review): confirm which charset XHTMLConverter writes before changing this.
        String content = baos.toString();
        return setDocXStyle(content).html();
    }
}
/**
 * Post-processes the HTML produced for a .docx file.
 *
 * <p>Every {@code <p>} whose class starts with {@code HEADLINE_DJ_} is replaced by an
 * {@code <h2>} carrying the same text, and the style attribute of every {@code <span>} is set
 * to a fixed font size.
 *
 * @param content HTML produced by the converter
 * @return the parsed and adjusted Jsoup document
 */
private static org.jsoup.nodes.Document setDocXStyle(String content) {
    org.jsoup.nodes.Document parsed = Jsoup.parse(content);
    // Swap each marked paragraph for a heading element with identical text.
    parsed.select("p[class^=HEADLINE_DJ_]").forEach(marked -> {
        Element heading = new Element("h2");
        heading.attr("style", "font-weight: bold;font-size: 22px;");
        heading.text(marked.text());
        marked.replaceWith(heading);
    });
    // Overwrite the style attribute of all spans with the common font size.
    for (Element span : parsed.select("span")) {
        span.attr("style", "font-size: 19px; ");
    }
    return parsed;
}
/**
 * Walks the paragraphs of the document, tags heading paragraphs and normalises font sizes.
 *
 * <p>Paragraphs without a style id are left untouched. A paragraph whose style id is "1", "2",
 * "3" or "4" gets a new style id starting with {@code HEADLINE_DJ_}; its runs are set to size
 * 60 and bold. Runs of every other styled paragraph are set to size 40.
 *
 * @param document the document to modify in place
 */
private static void setParagraphs(XWPFDocument document) {
    for (XWPFParagraph para : document.getParagraphs()) {
        String styleId = para.getStyle();
        if (StrUtil.isEmpty(styleId)) {
            continue;
        }
        boolean heading;
        switch (styleId) {
            case "1":
            case "2":
            case "3":
            case "4":
                heading = true;
                break;
            default:
                heading = false;
                break;
        }
        if (heading) {
            // The new style id is the marker prefix, five random digits and the current time.
            long timestamp = System.currentTimeMillis();
            para.setStyle(HEADLINE_DJ_ + RandomUtil.randomNumbers(5) + "_" + timestamp);
        }
        int fontSize = heading ? 60 : 40;
        for (XWPFRun textRun : para.getRuns()) {
            textRun.setFontSize(fontSize);
            if (heading) {
                textRun.setBold(true);
            }
        }
    }
}
/**
 * Converts an uploaded .doc file into an HTML string.
 *
 * <p>Embedded pictures are handed to {@code getImgUrl} and the returned URL is used as the
 * picture link. Paragraphs detected as headings are prefixed with a random marker before the
 * conversion; after the conversion every {@code <p>} containing a marker is replaced by an
 * {@code <h2>} without the marker.
 *
 * @param file uploaded Word file in the binary .doc format
 * @return the generated HTML
 * @throws IOException declared for compatibility; failures inside the conversion are logged and
 *                     rethrown as {@code MyException}
 */
public static String Word2003ToHtmlAndSaveImage(MultipartFile file) throws IOException {
    // try-with-resources closes the upload stream and the document, which were leaked before.
    try (InputStream in = file.getInputStream();
         HWPFDocument wordDocument = new HWPFDocument(in)) {
        // Prefix heading paragraphs with a marker so they can be found again in the HTML.
        Range range = wordDocument.getRange();
        Set<String> markers = new HashSet<>();
        for (int i = 0; i < range.numParagraphs(); i++) {
            Paragraph paragraph = range.getParagraph(i);
            if (StrUtil.isEmpty(paragraph.text().trim())) {
                continue;
            }
            int lvl = paragraph.getLvl();
            // NOTE(review): levels 0, 1, 2 and 4 are treated as headings while 3 is skipped -
            // confirm that this is intentional.
            if (lvl == 0 || lvl == 1 || lvl == 2 || lvl == 4) {
                String marker = HEADLINE_DJ_ + RandomUtil.randomNumbers(8) + "_";
                paragraph.insertBefore(marker);
                markers.add(marker);
            }
        }

        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
        // Upload every embedded picture and use the returned URL as its link.
        wordToHtmlConverter.setPicturesManager(
                (content, pictureType, suggestedName, widthInches, heightInches)
                        -> getImgUrl(suggestedName, content));
        wordToHtmlConverter.processDocument(wordDocument);

        // Serialize straight into the byte buffer: the former BufferedOutputStream wrapper added
        // nothing and made the result depend on the transformer flushing it before the read.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(wordToHtmlConverter.getDocument()), new StreamResult(baos));
        // Decode with the same charset the serializer was told to write; the platform default
        // corrupts non-ASCII text on systems whose default is not UTF-8.
        String content = baos.toString("UTF-8");

        org.jsoup.nodes.Document doc = Jsoup.parse(content);
        // Turn every marked paragraph into an <h2> and strip the marker text.
        for (String marker : markers) {
            String cleanMarker = getReplaceAllNR(marker);
            for (Element el : doc.select("p")) {
                if (el.text().contains(cleanMarker)) {
                    Element h2Element = new Element("h2");
                    // Literal replace: the marker is plain text, not a regular expression.
                    h2Element.text(el.text().replace(cleanMarker, ""));
                    h2Element.attr("style", "font-weight: bold;font-size: 22px;");
                    el.replaceWith(h2Element);
                }
            }
        }
        // Apply the span font size once for the whole document. It used to run inside the marker
        // loop, so it was repeated per heading and skipped when the document had no heading.
        doc.select("span").forEach(fs -> fs.attr("style", "font-size: 19px; "));
        return doc.html();
    } catch (Exception e) {
        log.error("docToHtmlText 异常", e);
        throw new MyException("docToHtmlText 异常!", e.getMessage());
    }
}
/**
 * Removes every carriage return and line feed from the given text.
 *
 * @param text text to clean; must not be {@code null}
 * @return the text without any "\r" or "\n" characters
 */
private static String getReplaceAllNR(String text) {
    // Dropping each CR and each LF individually also removes every CRLF pair.
    String withoutCarriageReturns = text.replace("\r", "");
    return withoutCarriageReturns.replace("\n", "");
}
/**
 * Uploads the pictures belonging to a Word file and returns their URLs keyed by picture name.
 *
 * <p>For .docx files every picture referenced from the document is uploaded once through
 * {@code getImgUrl}. For .doc files the files found in {@code docsTempImages} are uploaded
 * through {@code getOssImgUrl}, and that directory is deleted afterwards.
 *
 * @param docsTempImages temporary picture directory used by the .doc flow; always deleted when
 *                       a .doc file is processed.
 *                       NOTE(review): nothing visible in this class writes into it - confirm.
 * @param file           uploaded Word file; the type is chosen from its original file name
 * @return map of picture name to stored URL; empty when there is no picture or the file type
 *         is not supported
 * @throws Exception if the document or a picture file cannot be read
 */
public static Map<String, String> getImageMaps(String docsTempImages, MultipartFile file) throws Exception {
    Map<String, String> map = new HashMap<>();
    String fileName = file.getOriginalFilename();
    if (fileName == null) {
        return map;
    }
    if (fileName.endsWith(".docx") || fileName.endsWith(".DOCX")) {
        // Close the stream and the document once the pictures have been read.
        try (InputStream in = file.getInputStream();
             XWPFDocument document = new XWPFDocument(in)) {
            // getAllPictures() also covers pictures outside top-level paragraphs (for example
            // inside tables) and lists each picture once, so nothing is uploaded twice.
            for (XWPFPictureData pictureData : document.getAllPictures()) {
                String pictureName = pictureData.getFileName();
                if (map.containsKey(pictureName)) {
                    continue;
                }
                map.put(pictureName, getImgUrl(pictureName, pictureData.getData()));
            }
        }
    } else if (fileName.endsWith(".doc") || fileName.endsWith(".DOC")) {
        try {
            File dir = new File(docsTempImages);
            if (!FileUtil.isEmpty(dir)) {
                for (File picture : FileUtil.ls(docsTempImages)) {
                    // Close each stream before the directory is deleted below.
                    try (BufferedInputStream inputStream = FileUtil.getInputStream(picture)) {
                        map.put(picture.getName(), getOssImgUrl(picture.getName(), inputStream));
                    }
                }
            }
        } finally {
            // Always remove the temporary directory.
            FileUtil.del(docsTempImages);
        }
    }
    return map;
}
/**
 * Uploads a picture stream through {@code MinioUtil} under a newly generated file name.
 *
 * @param name        original picture file name; only its extension (if any) is reused
 * @param inputStream picture content
 * @return the value returned by {@code MinioUtil.upload}, or {@code null} when the upload fails
 */
private static String getOssImgUrl(String name, BufferedInputStream inputStream) {
    // A name without a dot simply yields no extension instead of throwing.
    int dotIndex = name.lastIndexOf('.');
    String extension = dotIndex >= 0 ? name.substring(dotIndex) : "";
    // A millisecond timestamp alone repeats when several pictures are uploaded within the same
    // millisecond, so a random id is appended to keep the stored names unique.
    String newFileName = System.currentTimeMillis() + "_" + IdUtil.simpleUUID() + extension;
    try {
        return MinioUtil.upload(inputStream, newFileName);
    } catch (Exception e) {
        // Best effort as before: report the failure through the logger and return null.
        log.error("Failed to upload image {}", name, e);
        return null;
    }
}
/**
 * Uploads picture bytes through {@code MinioUtil} under a newly generated file name.
 *
 * @param fileName original picture file name; only its extension (if any) is reused
 * @param data     picture content
 * @return an empty string when there is no content, otherwise the value returned by
 *         {@code MinioUtil.upload}, or {@code null} when the upload fails
 */
private static String getImgUrl(String fileName, byte[] data) {
    if (data == null || data.length == 0) {
        return "";
    }
    // A name without a dot simply yields no extension instead of throwing.
    int dotIndex = fileName.lastIndexOf('.');
    String extension = dotIndex >= 0 ? fileName.substring(dotIndex) : "";
    // A millisecond timestamp alone repeats when several pictures are uploaded within the same
    // millisecond, so a random id is appended to keep the stored names unique.
    String newFileName = System.currentTimeMillis() + "_" + IdUtil.simpleUUID() + extension;
    try (InputStream inputStream = new ByteArrayInputStream(data)) {
        return MinioUtil.upload(inputStream, newFileName);
    } catch (Exception e) {
        // Best effort as before: report the failure through the logger and return null.
        log.error("Failed to upload image {}", fileName, e);
        return null;
    }
}
/**
 * Replaces the {@code src} of the {@code <img>} elements in the HTML with stored picture URLs.
 *
 * <p>The lookup key is the part of the current {@code src} after its last "/" (or the whole
 * {@code src} when it contains no "/"). Pictures without a usable mapping keep their current
 * {@code src}.
 *
 * @param html      HTML whose picture links should be replaced
 * @param imageMaps picture name to stored URL
 * @param wordType  {@code true} for the .doc format; the result is then passed through
 *                  {@code JsoupUtils.changeHtmlCssLineStyle} (per the original comment, to
 *                  move external styles inline)
 * @return the rewritten HTML
 */
public static String replaceImgToLocal(String html, Map<String, String> imageMaps, boolean wordType) {
    org.jsoup.nodes.Document doc = Jsoup.parse(html);
    for (Element img : doc.select("img[src]")) {
        String src = img.attr("src");
        // lastIndexOf returns -1 when there is no "/", which makes this the whole src.
        String imageName = src.substring(src.lastIndexOf('/') + 1);
        String storedUrl = imageMaps.get(imageName);
        // Only overwrite when a URL is known; previously a missing mapping wiped the link.
        if (StrUtil.isNotEmpty(storedUrl)) {
            img.attr("src", storedUrl);
        }
    }
    String rendered = doc.toString();
    return wordType ? JsoupUtils.changeHtmlCssLineStyle(rendered) : rendered;
}
}