本文使用 poi 和 xdocreport 组件,并在其基础上自定义实现某些功能
最近有个需求,文档的转换,需要把Word文档转换为编辑器可识别支持的HTML格式类型,Apache的开源组件poi可以解析docx和doc类型的文档,于是使用该组件实现需求
关于 Word 文档的两种格式:docx 格式是一种压缩文件,由 xml 格式文件组成
docx:Office 2007 及之后版本的格式
doc:Office 2003 及之前版本的格式
使用 xdocreport 的默认实现,效果如下。由此可见,默认实现的效果是靠样式渲染出来的:一级标题并非真正的一级标题,而只是具有一级标题样式的普通文本,因此我们需要自定义实现来解决这个问题
- Word 的原格式
- 转换效果
- HTML 格式源代码
- 引入maven依赖
<!-- poi start: base dependencies -->
<!-- ${poi-version} is a Maven property reference; it must be defined in the POM's properties section -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi-version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${poi-version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>${poi-version}</version>
</dependency>
<dependency>
<groupId>com.deepoove</groupId>
<artifactId>poi-ooxml-schemas-extra</artifactId>
<version>${poi-version}</version>
</dependency>
<dependency>
<groupId>com.deepoove</groupId>
<artifactId>poi-tl</artifactId>
<version>1.10.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.4</version>
</dependency>
<!-- poi end -->
<!-- xdocreport start -->
<!-- NOTE(review): the two xdocreport artifacts below use different versions (2.0.2 and 2.0.1); confirm this is intended -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>2.0.2</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
<version>2.0.1</version>
</dependency>
<!-- xdocreport end -->
自定义解析类继承 xdocreport 提供的默认实现类,重写我们需要自定义实现的部分,其他部分可以调用父类方法即可,先贴上代码
import fr.opensagres.poi.xwpf.converter.core.utils.StringUtils;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import fr.opensagres.poi.xwpf.converter.xhtml.internal.XHTMLMapper;
import fr.opensagres.poi.xwpf.converter.xhtml.internal.styles.CSSStyle;
import fr.opensagres.poi.xwpf.converter.xhtml.internal.styles.CSSStylePropertyConstants;
import fr.opensagres.poi.xwpf.converter.xhtml.internal.utils.SAXHelper;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.impl.values.XmlValueDisconnectedException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import java.util.*;
import static fr.opensagres.poi.xwpf.converter.xhtml.internal.XHTMLConstants.*;
/**
* @author hujh
* @date 2023/4/4 15:21
*/
@Slf4j
public class CustomHTMLMapper extends XHTMLMapper {
// SAX handler that receives the generated HTML elements (see startElement/endElement/characters).
private final ContentHandler contentHandler;
// Checked before each paragraph to emit a page break; within this class it is only ever reset to false.
private boolean pageBreakOnNextParagraph;
// Attributes built for the run currently being visited; cleared at the end of visitRun.
private AttributesImpl currentRunAttributes;
// Paragraph that owns the run currently being visited; cleared at the end of visitRun.
private XWPFParagraph currentParagraph;
// Sibling body elements used to look up a list paragraph's neighbours (assigned in start()).
private List<IBodyElement> allBodyElements;
// Style table supplied to the constructor; used to resolve heading style names.
private XWPFStyles styles;
// Divisor applied to a paragraph's "spacing before lines" value to obtain an em offset.
private static final int EM_NUM = 100;
// CSS property prefix for the generated line-height style.
public static final String EM_STYLE = "line-height:";
// Number of body elements, assigned in visitBodyElements.
private int size;
/**
 * Creates a mapper that writes the HTML for {@code document} to {@code contentHandler}.
 *
 * <p>The constructor must carry the name of the enclosing class ({@code CustomHTMLMapper});
 * any other name is parsed as a method without a return type and does not compile.
 *
 * @param document       the Word document to convert
 * @param contentHandler SAX handler that receives the generated HTML
 * @param options        conversion options handed to the parent mapper
 * @param styles         style table used to resolve heading style names
 * @throws Exception if the parent mapper cannot be initialised
 */
public CustomHTMLMapper(XWPFDocument document, ContentHandler contentHandler, XHTMLOptions options, XWPFStyles styles) throws Exception {
    super(document, contentHandler, options);
    this.contentHandler = contentHandler;
    this.styles = styles;
}
/**
 * Runs the conversion: opens the document, visits every body element, then closes the document.
 *
 * @throws Exception if any visiting step fails
 */
@Override
public void start() throws Exception {
    Object documentContainer = startVisitDocument();
    // Keep the document's body elements so list paragraphs can inspect their neighbours.
    this.allBodyElements = document.getBodyElements();
    visitBodyElements(this.allBodyElements, documentContainer);
    endVisitDocument();
}
/**
 * Visits the given body elements in order and dispatches on their type: list paragraphs go to
 * {@link #visitMultilevelParagraph}, other paragraphs to {@link #visitParagraph}, tables are
 * wrapped in a {@code div} marked as a table component, and content controls go to
 * {@code visitSDT}.
 *
 * <p>The sibling list and its size are scoped to this invocation and restored afterwards.
 * The parent visitor is expected to call this method again for nested bodies such as table
 * cells; without the save/restore a nested call would overwrite {@code size} and list
 * paragraphs would look up their neighbours in the wrong element list.
 *
 * @param bodyElements elements to visit
 * @param container    parent container handed through to the visit methods
 * @throws Exception if a visit method fails with anything other than a disconnected XML value
 */
@Override
protected void visitBodyElements(List<IBodyElement> bodyElements, Object container) throws Exception {
    List<IBodyElement> outerElements = this.allBodyElements;
    int outerSize = this.size;
    this.allBodyElements = bodyElements;
    this.size = bodyElements.size();
    try {
        int count = bodyElements.size();
        for (int i = 0; i < count; i++) {
            IBodyElement bodyElement = bodyElements.get(i);
            switch (bodyElement.getElementType()) {
                // Regular paragraph
                case PARAGRAPH:
                    XWPFParagraph paragraph = (XWPFParagraph) bodyElement;
                    try {
                        if (containMultilevel(paragraph)) {
                            // Numbered/bulleted paragraph: custom list handling
                            visitMultilevelParagraph(paragraph, i);
                        } else {
                            // Plain paragraph: default handling
                            visitParagraph(paragraph, i, container);
                        }
                    } catch (XmlValueDisconnectedException e) {
                        // Best effort: skip the broken paragraph, but keep the stack trace in the log.
                        log.error("解析段落失败,跳过该段落继续执行,段落位置:{}", i, e);
                    }
                    break;
                // Table
                case TABLE:
                    AttributesImpl divLevel = new AttributesImpl();
                    SAXHelper.addAttrValue(divLevel, "data-component", "TableComponent");
                    startElement("div", divLevel);
                    visitTable((XWPFTable) bodyElement, i, container);
                    endElement("div");
                    break;
                case CONTENTCONTROL:
                    visitSDT((XWPFSDT) bodyElement, i, container);
                    break;
                // Unknown element type: log only
                default:
                    log.info("匹配到无法处理的段落");
            }
        }
    } finally {
        this.allBodyElements = outerElements;
        this.size = outerSize;
    }
}
/**
 * Emits a regular (non-list) paragraph: an optional page break, the paragraph start,
 * its body, and the paragraph end.
 *
 * @param paragraph paragraph to emit
 * @param index     position of the paragraph among its siblings
 * @param container parent container
 * @throws Exception if any visiting step fails
 */
@Override
protected void visitParagraph(XWPFParagraph paragraph, int index, Object container) throws Exception {
    if (this.pageBreakOnNextParagraph) {
        pageBreak();
    }
    this.pageBreakOnNextParagraph = false;
    // No list-item context is passed: list paragraphs are handled by visitMultilevelParagraph.
    Object openedParagraph = startVisitParagraph(paragraph, null, container);
    visitParagraphBody(paragraph, index, openedParagraph);
    endVisitParagraph(paragraph, container, openedParagraph);
}
/**
 * Emits one list paragraph and opens/closes the surrounding {@code ul}/{@code ol} elements
 * by comparing the paragraph's numbering with that of its previous and next siblings.
 *
 * <p>The previous sibling is only considered when it exists and is a paragraph. A list
 * paragraph that directly follows a table (or any other non-paragraph element), or that is
 * the first element, is treated as having no list predecessor, so a new list is opened.
 * Casting the previous element unconditionally would throw {@code ClassCastException} after
 * a table, and treating the first paragraph as its own predecessor would close a list that
 * was never opened.
 *
 * @param curParagraph the list paragraph to emit
 * @param index        position of {@code curParagraph} in the sibling list
 * @throws Exception if emitting the paragraph fails
 */
protected void visitMultilevelParagraph(XWPFParagraph curParagraph, int index) throws Exception {
    // A "bullet" number format means an unordered list; every other format is treated as ordered.
    String numFmt = curParagraph.getNumFmt();
    String label = EnvConstant.BULLET.equals(numFmt) ? WtXHTMLConstants.UN_ORDER_ELEMENT : WtXHTMLConstants.ORDER_LIST_ELEMENT;
    log.info("numFmt:{}", numFmt);
    // Previous sibling, or null when there is none or it is not a paragraph.
    XWPFParagraph preParagraph = null;
    if (index > 0) {
        IBodyElement previousElement = allBodyElements.get(index - 1);
        if (previousElement instanceof XWPFParagraph) {
            preParagraph = (XWPFParagraph) previousElement;
        }
    }
    // Next sibling; for the last element this resolves to the element itself.
    IBodyElement iBodyElement = index + 1 == size ? allBodyElements.get(size - 1) : allBodyElements.get(index + 1);
    XWPFParagraph nextParagraph = null;
    if (iBodyElement instanceof XWPFParagraph) {
        nextParagraph = (XWPFParagraph) iBodyElement;
    }
    // Next sibling is not a paragraph (e.g. a table): emit the content and close one list element.
    if (nextParagraph == null) {
        visitParagraphBody(curParagraph, index, null);
        endVisitMultiParagraph(true, 1, false, 0, label);
        return;
    }
    if (pageBreakOnNextParagraph) {
        pageBreak();
    }
    this.pageBreakOnNextParagraph = false;
    // Numbering information of the current, next and previous paragraphs.
    CTNumPr curNumPr = getNumPr(curParagraph);
    CTNumPr nextNumPr = getNumPr(nextParagraph);
    CTNumPr preNumPr = preParagraph == null ? null : getNumPr(preParagraph);
    // Whether list elements have to be closed after this paragraph.
    boolean isNeedClose = false;
    // Whether this paragraph starts a new list.
    boolean firstOpen = false;
    int curLevel = curNumPr.getIlvl().getVal().intValue();
    int curNumId = curNumPr.getNumId().getVal().intValue();
    // Next paragraph is not a list item: this is the last item, so the list must be closed.
    if (!containMultilevel(nextParagraph)) {
        // No list predecessor: the list has to be opened here as well.
        if (preParagraph == null || !containMultilevel(preParagraph)) {
            startOrderElement(curLevel + 1, label, true);
        }
        visitParagraphBody(curParagraph, index, null);
        endVisitMultiParagraph(true, curLevel + 1, false, 0, label);
        return;
    }
    int nextLevel = nextNumPr.getIlvl().getVal().intValue();
    int nextNumId = nextNumPr.getNumId().getVal().intValue();
    // True when the previous paragraph carries both a numbering id and a level.
    boolean preParagraphNotIsLevel = false;
    if (preNumPr != null) {
        CTDecimalNumber preNumId = preNumPr.getNumId();
        CTDecimalNumber preIlvl = preNumPr.getIlvl();
        preParagraphNotIsLevel = preNumId != null && preIlvl != null;
    }
    // First item of a list: open the outer ul/ol element.
    if (!preParagraphNotIsLevel || curParagraph.equals(allBodyElements.get(0))) {
        startOrderElement(curLevel + 1, label, true);
        firstOpen = true;
    }
    // Last element of the sibling list: mark the list for closing.
    if (curParagraph.equals(allBodyElements.get(size - 1))) {
        isNeedClose = true;
    }
    // Emit the paragraph content.
    visitParagraphBody(curParagraph, index, null);
    int closeNum = 0;
    // Current and next paragraph belong to the same list definition.
    if (curNumId == nextNumId) {
        // Next item is nested deeper: open the nested list now; it is closed later.
        if (nextLevel > curLevel && !firstOpen) {
            startOrderElement(nextLevel + 1, label, false);
        }
        // Next item is shallower: close one list element per level stepped back.
        if (nextLevel < curLevel) {
            closeNum = curLevel - nextLevel;
            isNeedClose = true;
        }
    }
    // Next item is on the same level: no nested list is opened after this item.
    if (curLevel == nextLevel) {
        firstOpen = false;
    }
    endVisitMultiParagraph(isNeedClose, closeNum, firstOpen, curLevel + 1, label);
}
/**
 * Opens a {@code ul}/{@code ol} element carrying the level and type attributes.
 *
 * @param level         value written to the level attribute
 * @param label         element name to open
 * @param componentName whether to add the component-name attribute with value "List"
 * @throws SAXException if the content handler rejects the element
 */
protected void startOrderElement(int level, String label, boolean componentName) throws SAXException {
    AttributesImpl listAttributes = new AttributesImpl();
    SAXHelper.addAttrValue(listAttributes, WtXHTMLConstants.LEVEL, level);
    if (componentName) {
        SAXHelper.addAttrValue(listAttributes, WtXHTMLConstants.COMPONENT_NAME, "List");
    }
    SAXHelper.addAttrValue(listAttributes, WtXHTMLConstants.TYPE, ConvertHtmlUtils.getSequence(level));
    SAXHelper.startElement(contentHandler, label, listAttributes);
}
/**
 * Returns the numbering properties of a paragraph, resolved through the styles document
 * and any numbering style link.
 *
 * @param paragraph paragraph to inspect
 * @return the resolved numbering properties, possibly {@code null}
 */
protected CTNumPr getNumPr(XWPFParagraph paragraph) {
    return getNumPr(stylesDocument.getParagraphNumPr(paragraph));
}
/**
 * Finishes a list paragraph: optionally opens a nested list element, then optionally
 * closes {@code closeNum} list elements.
 *
 * @param isNeedClose whether list elements are closed
 * @param closeNum    number of list elements to close
 * @param firstOpen   whether a nested list element (level + 1) is opened first
 * @param level       level of the current paragraph
 * @param label       element name to open or close
 * @throws SAXException if the content handler rejects an element
 */
protected void endVisitMultiParagraph(boolean isNeedClose, int closeNum, boolean firstOpen, int level, String label) throws SAXException {
    if (firstOpen) {
        startOrderElement(level + 1, label, false);
    }
    if (!isNeedClose) {
        return;
    }
    int remaining = closeNum;
    while (remaining-- > 0) {
        endElement(label);
    }
}
/**
 * Closes the paragraph element; the arguments are not used.
 *
 * @throws Exception if the content handler rejects the end tag
 */
@Override
protected void endVisitParagraph(XWPFParagraph paragraph, Object parentContainer, Object paragraphContainer) throws Exception {
    SAXHelper.endElement(this.contentHandler, P_ELEMENT);
}
/**
 * Emits one run, wrapping the parent's output in custom elements: a spacing paragraph,
 * an underline element, a heading element (h1-h6) and a list-item element, as applicable.
 *
 * <p>The wrappers are closed in the exact reverse of the order in which they were opened
 * (list item, heading, underline, spacing paragraph) so the generated markup is properly
 * nested. Runs whose parent is not a paragraph are passed straight to the parent
 * implementation, because every customisation here reads paragraph-level properties.
 *
 * @param run                the run to emit
 * @param pageNumber         passed through to the parent implementation
 * @param url                hyperlink target of the run, or {@code null}; headings are only
 *                           emitted when it is {@code null}
 * @param paragraphContainer passed through to the parent implementation
 * @throws Exception if emitting the run fails
 */
@Override
protected void visitRun(XWPFRun run, boolean pageNumber, String url, Object paragraphContainer)
        throws Exception {
    if (!(run.getParent() instanceof XWPFParagraph)) {
        // No owning paragraph: nothing to customise, and dereferencing it would fail.
        super.visitRun(run, pageNumber, url, paragraphContainer);
        return;
    }
    this.currentParagraph = (XWPFParagraph) run.getParent();
    // 1) create attributes
    // 1.1) Create "class" attributes.
    this.currentRunAttributes = createClassAttribute(this.currentParagraph.getStyleID());
    // 1.2) Create "style" attributes.
    CTRPr rPr = run.getCTR().getRPr();
    CSSStyle cssStyle = getStylesDocument().createCSSStyle(rPr);
    if (cssStyle != null) {
        cssStyle.addProperty(CSSStylePropertyConstants.WHITE_SPACE, "pre-wrap");
    }
    this.currentRunAttributes = createStyleAttribute(cssStyle, currentRunAttributes);
    String numFmt = this.currentParagraph.getNumFmt();
    List<XWPFRun> runs = currentParagraph.getRuns();
    if (containMultilevel(this.currentParagraph)) {
        // List paragraphs without hyperlinks are collapsed into a single run.
        boolean hasHyperlink = runs.stream().anyMatch(XWPFHyperlinkRun.class::isInstance);
        if (!hasHyperlink) {
            setRunsText(this.currentParagraph, runs, run);
        }
    }
    // Heading detection (h1-h6).
    boolean isHeading = isHeading(this.currentParagraph);
    String styleId = getHeadingLevel(this.currentParagraph);
    boolean headingRun = StringUtils.isNotEmpty(styleId) && isHeading && url == null;
    boolean listItem = StringUtils.isNotEmpty(numFmt);
    // Spacing before the paragraph is rendered as a paragraph with a line-height style.
    int beforeLine = this.currentParagraph.getSpacingBeforeLines();
    boolean spaced = beforeLine != -1;
    if (spaced) {
        double tmpEm = (double) beforeLine / EM_NUM;
        AttributesImpl attributes = new AttributesImpl();
        SAXHelper.addAttrValue(attributes, WtXHTMLConstants.STYLE, EM_STYLE + (tmpEm + 1) + "em");
        startElement(WtXHTMLConstants.PARAGRAPH, attributes);
    }
    // Evaluated once so the open and close decisions cannot diverge.
    boolean underlined = isUnderLine(this.currentParagraph);
    if (underlined) {
        startElement(WtXHTMLConstants.U);
    }
    if (headingRun) {
        startElement(WtXHTMLConstants.HEADING + styleId);
        setRunsText(this.currentParagraph, runs, run);
    }
    if (listItem) {
        startElement(WtXHTMLConstants.LIST_ELEMENT);
    }
    // Text extraction by the parent implementation.
    super.visitRun(run, pageNumber, url, paragraphContainer);
    // Close in reverse opening order: li, hN, u, p.
    if (listItem) {
        endElement(WtXHTMLConstants.LIST_ELEMENT);
    }
    if (headingRun) {
        endElement(WtXHTMLConstants.HEADING + styleId);
    }
    if (underlined) {
        endElement(WtXHTMLConstants.U);
    }
    if (spaced) {
        endElement(WtXHTMLConstants.PARAGRAPH);
    }
    this.currentRunAttributes = null;
    this.currentParagraph = null;
}
/**
 * Builds a "class" attribute from the class names registered for a style id.
 *
 * @param styleID paragraph style id
 * @return attributes holding the class attribute, or {@code null} when there are no class names
 */
private AttributesImpl createClassAttribute(String styleID) {
    String cssClasses = getStylesDocument().getClassNames(styleID);
    return StringUtils.isNotEmpty(cssClasses)
            ? SAXHelper.addAttrValue(null, CLASS_ATTR, cssClasses)
            : null;
}
/**
 * Adds a "style" attribute holding the inline styles of {@code cssStyle}, if there are any.
 *
 * @param cssStyle   style to render inline, may be {@code null}
 * @param attributes attributes to extend, may be {@code null}
 * @return the extended attributes, or {@code attributes} unchanged when there is nothing to add
 */
private AttributesImpl createStyleAttribute(CSSStyle cssStyle, AttributesImpl attributes) {
    if (cssStyle == null) {
        return attributes;
    }
    String inline = cssStyle.getInlineStyles();
    if (!StringUtils.isNotEmpty(inline)) {
        return attributes;
    }
    return SAXHelper.addAttrValue(attributes, STYLE_ATTR, inline);
}
/** Opens an element without attributes on the content handler. */
private void startElement(String name) throws SAXException {
    final Attributes noAttributes = null;
    startElement(name, noAttributes);
}
/** Opens an element with the given attributes on the content handler. */
private void startElement(String name, Attributes attributes) throws SAXException {
    SAXHelper.startElement(this.contentHandler, name, attributes);
}
/** Closes an element on the content handler. */
private void endElement(String name) throws SAXException {
    SAXHelper.endElement(this.contentHandler, name);
}
/** Writes character content to the content handler. */
private void characters(String content) throws SAXException {
    SAXHelper.characters(this.contentHandler, content);
}
/**
 * Resolves numbering properties through a numbering style link, if one is present.
 *
 * @param numPr numbering properties to resolve, may be {@code null}
 * @return {@code numPr} itself when there is no numbering instance or style link; otherwise
 *         the resolved numbering properties of the linked style, or {@code null} when that
 *         style has no paragraph properties
 */
private CTNumPr getNumPr(CTNumPr numPr) {
    if (numPr == null) {
        return null;
    }
    XWPFNum numbering = getXWPFNum(numPr);
    if (numbering == null) {
        return numPr;
    }
    XWPFAbstractNum abstractNumbering = getXWPFAbstractNum(numbering);
    CTString styleLink = abstractNumbering.getAbstractNum().getNumStyleLink();
    if (styleLink == null) {
        return numPr;
    }
    String linkedStyleId = styleLink.getVal();
    if (linkedStyleId == null) {
        return numPr;
    }
    // The abstract numbering points at another style: follow the link.
    CTStyle linkedStyle = stylesDocument.getStyle(linkedStyleId);
    CTPPr linkedPPr = linkedStyle.getPPr();
    return linkedPPr == null ? null : getNumPr(linkedPPr.getNumPr());
}
/**
 * Tells whether a paragraph is a list item, decided by whether it has a non-empty
 * number format.
 * TODO: check that this test covers the various Word list formats.
 */
private boolean containMultilevel(XWPFParagraph paragraph) {
    return StringUtils.isNotEmpty(paragraph.getNumFmt());
}
/**
 * Tells whether the paragraph uses a heading style, i.e. a style whose name starts with
 * "heading".
 *
 * <p>Returns {@code false} instead of failing when the paragraph has no style id, when no
 * style table is available, when the style id is not in the table, or when the style has
 * no name.
 *
 * @param paragraph paragraph to inspect
 * @return {@code true} if the paragraph's style name starts with "heading"
 */
private boolean isHeading(XWPFParagraph paragraph) {
    String styleId = paragraph.getStyleID();
    if (StringUtils.isEmpty(styleId) || styles == null) {
        return false;
    }
    // Look the style up in the style table; unknown ids yield null.
    XWPFStyle style = styles.getStyle(styleId);
    if (style == null) {
        return false;
    }
    String name = style.getName();
    return name != null && name.startsWith("heading");
}
/**
 * Returns the heading level of a paragraph as a string suitable for building an
 * {@code h1}-{@code h6} element name.
 *
 * <p>The level is the numeric remainder of the style name after removing "heading" and
 * spaces. Levels above 6 are capped at "6". An empty string is returned when the paragraph
 * is not a heading, when the remainder is not a number (e.g. a style named "heading" or
 * "heading title"), or when the number is below 1, so callers simply skip the heading
 * element instead of the conversion failing with {@code NumberFormatException}.
 *
 * @param paragraph paragraph to inspect
 * @return "1" to "6", or an empty string when no valid heading level applies
 */
private String getHeadingLevel(XWPFParagraph paragraph) {
    if (!isHeading(paragraph)) {
        return "";
    }
    XWPFStyle style = styles.getStyle(paragraph.getStyleID());
    String suffix = style.getName().replace("heading", "").replace(" ", "");
    int level;
    try {
        level = Integer.parseInt(suffix);
    } catch (NumberFormatException ignored) {
        // The style name starts with "heading" but carries no numeric level.
        return "";
    }
    if (level < 1) {
        return "";
    }
    return String.valueOf(Math.min(level, 6));
}
/**
 * Tells whether the paragraph-level run properties request an underline.
 *
 * <p>An underline element without a {@code val} attribute is treated as "no underline"
 * rather than dereferenced, which would throw {@code NullPointerException}.
 *
 * @param paragraph paragraph to inspect
 * @return {@code true} if an underline value other than "none" is set
 */
private boolean isUnderLine(XWPFParagraph paragraph) {
    CTPPr pPr = paragraph.getCTP().getPPr();
    if (pPr == null) {
        return false;
    }
    CTParaRPr paraRpr = pPr.getRPr();
    if (paraRpr == null) {
        return false;
    }
    CTUnderline underline = paraRpr.getU();
    if (underline == null || underline.getVal() == null) {
        return false;
    }
    return !"none".equals(underline.getVal().toString());
}
/**
 * Collapses a paragraph into a single run: the text of the runs at positions 1 and up is
 * appended to the text of {@code currentRun}, the result is written to {@code currentRun},
 * and the runs at positions 1 and up are removed from the paragraph.
 * Does nothing when the paragraph has at most one run.
 *
 * @param paragraph  paragraph whose runs are merged
 * @param runs       the paragraph's runs
 * @param currentRun run that receives the merged text
 */
private void setRunsText(XWPFParagraph paragraph, List<XWPFRun> runs, XWPFRun currentRun) {
    StringBuilder merged = new StringBuilder(currentRun.text());
    int runCount = runs.size();
    if (runCount <= 1) {
        return;
    }
    for (XWPFRun trailing : runs.subList(1, runCount)) {
        merged.append(trailing.text());
    }
    currentRun.setText(merged.toString(), 0);
    // Remove from the back so the remaining positions stay valid.
    for (int position = runCount - 1; position >= 1; position--) {
        paragraph.removeRun(position);
    }
}
}
使用自定义实现,效果如下
- 转换 HTML 效果
- 转换 HTML 源码
转换完成
一些其他知识
转换时,Word 文档中的图片上传 oss ,并且替换成 URL 地址
import fr.opensagres.poi.xwpf.converter.core.ImageManager;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.multipart.MultipartFile;
import java.io.ByteArrayInputStream;
import java.io.IOException;
/**
 * Image manager that replaces the path of each extracted image with a URL.
 * The upload itself is left as a placeholder in {@link #extract}.
 *
 * @author hujh
 * @date 2023/3/20 16:19
 */
@Slf4j
public class CustomImageManager extends ImageManager {
    private static final String SUFFIX_URL = "baseUrl";

    /** Identifier of the most recently extracted image, set by {@link #extract}. */
    private String fileId;

    /**
     * Passes {@code null} for both parent constructor arguments; per the original note the
     * base path is arbitrary because images are uploaded instead of written to disk.
     */
    public CustomImageManager() {
        super(null, null);
    }

    /**
     * Handles one extracted image. The actual upload is a placeholder: a fixed value is
     * stored as the image identifier.
     *
     * @param imagePath path of the image inside the document
     * @param imageData raw image bytes
     * @throws IOException if closing the image stream fails
     */
    @Override
    public void extract(String imagePath, byte[] imageData) throws IOException {
        FileServerService uploader = SpringUtil.getBean(FileServerService.class);
        try (ByteArrayInputStream imageStream = new ByteArrayInputStream(imageData)) {
            // Upload to object storage goes here; the implementation depends on the project.
            this.fileId = "this is new picture url";
        }
        log.info("图片地址赋值:{}", this.fileId);
    }

    /**
     * Replaces an image path with the base URL followed by the stored image identifier.
     *
     * @param uri original image path
     * @return the replacement URL
     */
    @Override
    public String resolve(String uri) {
        log.info("原地址:{},替换为:{}", uri, this.fileId);
        String replacement = SUFFIX_URL + this.fileId;
        return replacement;
    }
}
标签:段落,String,自定义,poi,paragraph,private,HTML,POI,null
From: https://www.cnblogs.com/hujh2022/p/17376851.html
关于这些标签和内容是如何生成 HTML 的:在写入标签时,其实是对 outputStream 输出流写入内容,具体写入的核心方法如下