方案总结:
1. poi(支持html属性):存在一个bug,table的同一个cell中既有文本又有图片时,转换后图片会丢失
2. tika(主要用于提取内容,转换出来的html效果不太好)
3. openoffice(依赖本地安装,转出的html效果不太好)
4. aspose(功能强大但需付费,也可以免费使用);缺点:代码不开源,因此不支持扩展
5. mammoth(与poi相比缺少标签的属性,比如颜色、字体):不存在同一个cell中既有文本又有图片时转换后图片丢失的问题,样式方面支持扩展,样例很多
本文基于poi实现docx to html
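1. maven依赖(注意:下面列出的 poi、poi-scratchpad 为 3.17,poi-ooxml-schemas 为 3.16,poi-ooxml 为 3.14,版本并不一致;POI 各模块建议统一使用同一版本)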
<dependency> <groupId>docx4j</groupId> <artifactId>docx4j</artifactId> <version>3.3.7</version> <scope>system</scope> <systemPath>${project.basedir}/src/main/resources/lib/docx4j-3.3.7.jar</systemPath> </dependency> <dependency> <groupId>org.jdom</groupId> <artifactId>jdom</artifactId> <version>2.0.6</version> <scope>system</scope> <systemPath>${project.basedir}/src/main/resources/lib/jdom-2.0.6.jar</systemPath> </dependency> <dependency> <groupId>fmath</groupId> <artifactId>fmath-mathml-java</artifactId> <version>3.1</version> <scope>system</scope> <systemPath>${project.basedir}/src/main/resources/lib/fmath-mathml-java-3.1.jar</systemPath> </dependency> <dependency> <groupId>org.scilab.forge</groupId> <artifactId>jlatexmath</artifactId> <version>1.0.8-SNAPSHOT</version> <scope>system</scope> <systemPath>${project.basedir}/src/main/resources/lib/jlatexmath-1.0.8-SNAPSHOT.jar</systemPath> </dependency> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.11.0</version> </dependency> <dependency> <groupId>net.sf.saxon</groupId> <artifactId>Saxon-HE</artifactId> <version>9.8.0-12</version> </dependency> <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.core --> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.core</artifactId> <version>1.0.6</version> </dependency> <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.xhtml --> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId> <version>1.0.6</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.17</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 --> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-collections4</artifactId> 
<version>4.0</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.17</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml-schemas</artifactId> <version>3.16</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.xmlbeans/xmlbeans --> <dependency> <groupId>org.apache.xmlbeans</groupId> <artifactId>xmlbeans</artifactId> <version>2.6.0</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.14</version> </dependency> <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 --> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> <version>3.4</version> </dependency>
2. 代码实现
2.1 docxToHtml方法
// docx转换为html public static void docxToHtml(String sourceFile, String outDir) throws Exception { if (!sourceFile.contains(".docx")) { throw new IllegalArgumentException("you should use docx file"); } FileInputStream fis = new FileInputStream(sourceFile); ZipSecureFile.setMinInflateRatio(0.004); XWPFDocument document = new XWPFDocument(fis); try { for (XWPFParagraph paragraph : document.getParagraphs()) { handle(paragraph); } String sourcePrefix = sourceFile.split("\\.")[0]; File sourceDirFile = new File(sourcePrefix); String parentDir = sourceDirFile.getParent(); String name = sourceDirFile.getName(); UUID uuid = UUID.randomUUID(); String uuidStr = uuid.toString(); uuidStr = uuidStr.replace("-", "").substring(0, 16); sourcePrefix = parentDir + File.separator + "html" + File.separator; File tempFileDir = new File(sourcePrefix); if (!tempFileDir.exists()) { tempFileDir.mkdirs(); } String tempFile = sourcePrefix + name + "-temp" + uuidStr + ".docx"; // FileOutputStream out = new FileOutputStream(tempFile); // document.write(out); // out.close(); for (XWPFTable table : document.getTables()) { for (XWPFTableRow row : table.getRows()) { for (XWPFTableCell cell : row.getTableCells()) { for (XWPFParagraph paragraph : cell.getParagraphs()) { handle(paragraph); } } } } FileOutputStream out1 = new FileOutputStream(tempFile); document.write(out1); out1.close(); fis.close(); FileOutputStream fileOutputStream = null; OutputStreamWriter outputStreamWriter = null; try { XWPFDocument documentNew = new XWPFDocument(Files.newInputStream(Paths.get(tempFile))); XHTMLOptions options = XHTMLOptions.create(); File dirFile = new File(outDir); if (!dirFile.exists()) { dirFile.mkdirs(); } StringBuilder imageDir = new StringBuilder(); imageDir.append(outDir).append("/image"); File imagrDirFile = new File(imageDir.toString()); if (!imagrDirFile.exists()) { imagrDirFile.mkdirs(); } // 存放图片的文件夹 options.setExtractor(new FileImageExtractor(new File(imageDir.toString()))); // html中图片的路径 
options.URIResolver(new BasicURIResolver("image")); // XHTMLOptions options = XHTMLOptions.create(); // options.setIgnoreStylesIfUnused(false); // options.getStyleSheet().addStyle("table", "border-collapse:collapse;width:100%;"); // options.getStyleSheet().addStyle("td", "border:1px solid black;"); // options.getStyleSheet().addStyle("th", "border:1px solid black;"); // options.setStyleSheet("h3 { display: none; }"); // 自定义内容处理工厂 options.setContentHandlerFactory(new IContentHandlerFactory() { @Override public ContentHandler create(OutputStream outputStream, Writer writer, XHTMLOptions xhtmlOptions) { return outputStream != null ? new TableAwareContentHandler(outputStream, options.getIndent()) : new TableAwareContentHandler(writer, options.getIndent()); } }); String targetFilePath=outDir+File.separator + name+".html"; fileOutputStream = new FileOutputStream(targetFilePath); outputStreamWriter = new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8); XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance(); xhtmlConverter.convert(documentNew, outputStreamWriter, options); } catch (Exception e) { e.printStackTrace(); } finally { if (outputStreamWriter != null) { outputStreamWriter.close(); } if (fileOutputStream != null) { fileOutputStream.close(); } } } catch (Exception e) { e.printStackTrace(); } }
2.2 handle方法
private static void handle(XWPFParagraph paragraph) throws InvalidFormatException, IOException { CTP ctp = paragraph.getCTP(); XmlObject[] xmlObjects = ctp.selectPath( "declare namespace m='http://schemas.openxmlformats.org/officeDocument/2006/math' .//m:oMath"); for (XmlObject xmlObject : xmlObjects) { // System.out.println("在文档中发现公式: " + xmlObject.xmlText()); String mml = MathmlUtils.convertOMML2MML(xmlObject.xmlText()); String latex = MathmlUtils.convertMML2Latex(mml); latex = LatexUtil.latexFilter(latex); XmlCursor cursor = xmlObject.newCursor(); cursor.removeXml(); XmlCursor startCursor = cursor.newCursor(); startCursor.toPrevToken(); XWPFRun run = paragraph.createRun(); TeXFormula formula = new TeXFormula(latex); TeXIcon icon = formula.createTeXIcon(TeXConstants.STYLE_DISPLAY, 20); BufferedImage image = new BufferedImage(icon.getIconWidth(), icon.getIconHeight(), BufferedImage.TYPE_INT_ARGB); Graphics2D g2 = image.createGraphics(); g2.setColor(Color.WHITE); g2.fillRect(0, 0, icon.getIconWidth(), icon.getIconHeight()); JLabel jl = new JLabel(); jl.setForeground(new Color(0, 0, 0)); icon.paintIcon(jl, g2, 0, 0); // 创建一个用来存放图像字节的输出流 java.io.ByteArrayOutputStream baos = new java.io.ByteArrayOutputStream(); try { // 将图像编码为png ImageIO.write(image, "png", baos); } catch (IOException e) { e.printStackTrace(); } InputStream isa = new ByteArrayInputStream(baos.toByteArray()); // 设置样式为普通文本,可以根据需要调整 run.setBold(false); run.setItalic(false); run.setFontSize(14); run.addPicture(isa, XWPFDocument.PICTURE_TYPE_PNG, "image.png", Units.toEMU(200), Units.toEMU(100)); cursor.toPrevToken(); XmlObject newRunXmlObject = run.getCTR(); if (cursor.toPrevSibling()) { cursor.toEndToken(); XmlCursor newCursor = newRunXmlObject.newCursor(); newCursor.moveXml(startCursor); } else { // 否则,把新Run插入到公式所在位置的开头 cursor.toParent(); cursor.toNextToken(); XmlCursor newCursor = newRunXmlObject.newCursor(); newCursor.moveXml(cursor); } } }
2.3 TableAwareContentHandler 类
/**
 * SAX content handler that serializes the received events as markup to either an
 * {@link OutputStream} (encoded as UTF-8) or a {@link Writer}, with two table-specific rules:
 * {@code table}, {@code tr} and {@code td} elements always carry a
 * {@code border: 1px solid black;} style, and {@code th} start/end tags are dropped while the
 * content inside them is still written.
 *
 * <p>Text content is escaped ({@code &}, {@code <}, {@code >}) when
 * {@link #mustEncodeCharachers()} returns true, and attribute values are always escaped.
 */
static class TableAwareContentHandler extends DefaultHandler {
    /** Style forced onto table, row and cell elements. */
    private static final String BORDER_STYLE = "border: 1px solid black;";

    private final OutputStream out;
    private final Writer writer;
    /** True while a start tag has been written but not yet closed with {@code >} or {@code />}. */
    private boolean startingElement;
    private StringBuilder currentCharacters;
    /** Number of spaces per nesting level, or null to disable indentation. */
    private final Integer indent;
    private int nbElements;
    private boolean firstElement;
    /** Created by the public constructors; not read by this class. */
    SimpleContentHandler simpleContentHandler;

    public TableAwareContentHandler(OutputStream out) {
        this((OutputStream) out, (Integer) null);
    }

    public TableAwareContentHandler(OutputStream out, Integer indent) {
        this(out, (Writer) null, indent);
        simpleContentHandler = new SimpleContentHandler(out, indent);
    }

    public TableAwareContentHandler(Writer writer, Integer indent) {
        this((OutputStream) null, writer, indent);
        simpleContentHandler = new SimpleContentHandler(writer, indent);
    }

    private TableAwareContentHandler(OutputStream out, Writer writer, Integer indent) {
        this.out = out;
        this.writer = writer;
        this.currentCharacters = new StringBuilder();
        this.indent = indent;
        this.firstElement = true;
    }

    /** Writes a line break plus indentation for the current depth, except before the first element. */
    private void doIndentIfNeeded() throws SAXException {
        if (this.indent != null && !this.firstElement) {
            StringBuilder content = new StringBuilder("\n");
            for (int i = 0; i < this.nbElements; ++i) {
                for (int j = 0; j < this.indent; ++j) {
                    content.append(' ');
                }
            }
            this.write(content.toString());
        }
    }

    /** Buffers text content, escaping markup characters when {@link #mustEncodeCharachers()} is true. */
    @Override
    public final void characters(char[] ch, int start, int length) throws SAXException {
        if (this.startingElement) {
            this.write(">");
        }
        this.startingElement = false;
        boolean encode = this.mustEncodeCharachers();
        for (int i = start; i < start + length; ++i) {
            char c = ch[i];
            if (!encode) {
                this.currentCharacters.append(c);
            } else if (c == '&') {
                this.currentCharacters.append("&amp;");
            } else if (c == '<') {
                this.currentCharacters.append("&lt;");
            } else if (c == '>') {
                this.currentCharacters.append("&gt;");
            } else {
                this.currentCharacters.append(c);
            }
        }
    }

    /** Whether text content must be escaped before it is written; subclasses may override. */
    protected boolean mustEncodeCharachers() {
        return true;
    }

    protected void flushCharacters(String characters) throws SAXException {
        this.write(characters);
    }

    protected void resetCharacters() {
        this.currentCharacters.setLength(0);
    }

    /** Writes {@code content} to the stream (as UTF-8) or to the writer, wrapping I/O failures. */
    private void write(String content) throws SAXException {
        try {
            if (this.out != null) {
                // Explicit charset: the platform default would corrupt non-ASCII text.
                this.out.write(content.getBytes(StandardCharsets.UTF_8));
            } else {
                this.writer.write(content);
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** Flushes buffered text content, if any. */
    private void flushPendingCharacters() throws SAXException {
        if (this.currentCharacters.length() > 0) {
            this.flushCharacters(this.currentCharacters.toString());
            this.resetCharacters();
        }
    }

    private static boolean needsBorder(String localName) {
        return "table".equals(localName) || "tr".equals(localName) || "td".equals(localName);
    }

    /** Appends the border style to an existing style value so the border declaration comes last. */
    private static String withBorderStyle(String existing) {
        String trimmed = existing == null ? "" : existing.trim();
        if (trimmed.isEmpty()) {
            return BORDER_STYLE;
        }
        return trimmed.endsWith(";") ? trimmed + BORDER_STYLE : trimmed + ";" + BORDER_STYLE;
    }

    private static String escapeAttribute(String value) {
        if (value == null) {
            return "";
        }
        return value.replace("&", "&amp;").replace("<", "&lt;").replace("\"", "&quot;");
    }

    private void writeAttribute(String attrName, String attrValue) throws SAXException {
        this.write(" ");
        this.write(attrName);
        this.write("=\"");
        this.write(escapeAttribute(attrValue));
        this.write("\"");
    }

    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes)
            throws SAXException {
        if ("th".equals(localName)) {
            return;
        }
        if (this.startingElement) {
            this.write(">");
        }
        this.flushPendingCharacters();
        this.doIndentIfNeeded();
        this.write("<");
        this.write(localName);

        boolean bordered = needsBorder(localName);
        boolean styleWritten = false;
        int length = attributes.getLength();
        for (int i = 0; i < length; ++i) {
            String attrName = attributes.getLocalName(i);
            String attrValue = attributes.getValue(i);
            if (bordered && "style".equals(attrName)) {
                if (styleWritten) {
                    continue;
                }
                // Merge instead of emitting a second style attribute, which parsers would ignore.
                attrValue = withBorderStyle(attrValue);
                styleWritten = true;
            }
            this.writeAttribute(attrName, attrValue);
        }
        if (bordered && !styleWritten) {
            this.writeAttribute("style", BORDER_STYLE);
        }

        this.startingElement = true;
        this.firstElement = false;
        ++this.nbElements;
    }

    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if ("th".equals(localName)) {
            return;
        }
        --this.nbElements;
        this.flushPendingCharacters();
        if (this.startingElement) {
            // No content was written since the start tag: emit a self-closing element.
            this.write("/>");
            this.startingElement = false;
        } else {
            this.doIndentIfNeeded();
            this.write("</");
            this.write(localName);
            this.write(">");
        }
    }
}
标签:latex,docx,String,write,html,poi,new,options,out From: https://www.cnblogs.com/QAZLIU/p/18283742