Java：word替换关键词和转pdf

标签：Java word pos runText paragraph params pdf run 替换

最近有个word替换关键词并转为pdf的需求，网上找了一下午，找到的方案不是依赖版本冲突严重，就是替换关键词时结果时不时多一点或少一点。于是愤而找了个还算能运行的框架，自己修修改改，总算可以满足需求了。

替换关键词的难点主要在word的一个机制上：

word会把一段话拆分成很多个run组件，拆分位置无法预期，如“地址：杭州上城区婺江路aaa号bb层cccc室”可能被分为[“地址：”，“杭州上城区婺江路aaa号b”，“b”，“层c”，“ccc”，“室”]。显而易见，关键词可能会被分割到好几个run组件里，因此不能逐个遍历run来替换关键词。

假如获取一整个段落里的文字，替换关键词后放进新的run里，又会丢失原来的文字格式，所以也不能直接对一整个段落进行简单的字符串替换。

我的方法是先获取整个段落里的文字，判断是否有关键词存在，若存在，则获取关键词位置，特殊处理关键词开头和结尾所在的run，中间的可以直接删除，这样就达到了替换关键词的目的。

关键代码

/**
 * Replaces every occurrence of {@code searchText} in a paragraph, including
 * occurrences that are split across several runs, while leaving the runs
 * themselves (and therefore their formatting) in place.
 *
 * <p>Offsets are computed over the concatenation of the first text element of
 * each run, so the position used for searching always agrees with the position
 * used for editing. NOTE(review): runs that carry more than one text element
 * only have their first element inspected - confirm this is acceptable for the
 * templates in use.
 *
 * @param paragraph   paragraph to edit; a {@code null} paragraph is ignored
 * @param searchText  keyword to look for; {@code null} or empty means no-op
 * @param replacement value whose {@code toString()} replaces the keyword;
 *                    {@code null} is treated as the empty string
 */
public static <V> void replace(XWPFParagraph paragraph, String searchText, V replacement) {
        if (paragraph == null || searchText == null || searchText.length() == 0) {
            // An empty keyword matches at every offset and would never terminate.
            return;
        }
        String newText = replacement == null ? "" : replacement.toString();
        // Searching resumes after the text just inserted, so a replacement that
        // itself contains the keyword cannot cause an endless loop.
        int searchFrom = 0;
        while (true) {
            StringBuilder joined = new StringBuilder();
            for (XWPFRun run : paragraph.getRuns()) {
                String text = run.getText(0);
                if (text != null) {
                    joined.append(text);
                }
            }
            int firstPos = joined.indexOf(searchText, searchFrom);
            if (firstPos < 0) {
                return;
            }
            int lastPos = firstPos + searchText.length();

            // The first and last runs touched by the keyword are trimmed;
            // runs lying entirely inside the keyword are blanked.
            int pos = 0;
            boolean found = false;
            for (XWPFRun run : paragraph.getRuns()) {
                // The int argument of getText/setText is the index of the text
                // element inside the run, not a character offset; 0 is the first one.
                String runText = run.getText(0);
                if (runText == null) {
                    continue;
                }
                int runStart = pos;
                int runEnd = runStart + runText.length();
                pos = runEnd;
                if (!found) {
                    if (runEnd <= firstPos) {
                        continue;
                    }
                    String head = runText.substring(0, firstPos - runStart);
                    // Keep whatever follows the keyword when it ends inside this same run.
                    String tail = runEnd > lastPos ? runText.substring(lastPos - runStart) : "";
                    run.setText(head + newText + tail, 0);
                    found = true;
                } else if (runEnd <= lastPos) {
                    // Blank the run instead of removing it, so the run list is not
                    // modified while iterating (the original author reported that
                    // paragraph.removeRun(i) had no effect here).
                    run.setText("", 0);
                } else {
                    run.setText(runText.substring(lastPos - runStart), 0);
                }
                if (runEnd >= lastPos) {
                    break;
                }
            }
            searchFrom = firstPos + newText.length();
        }
    }

依赖

<dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.8</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.lowagie/itext -->
    <dependency>
      <groupId>com.lowagie</groupId>
      <artifactId>itext</artifactId>
      <version>2.1.7</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-collections/commons-collections -->
    <dependency>
      <groupId>commons-collections</groupId>
      <artifactId>commons-collections</artifactId>
      <version>3.2.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/fr.opensagres.xdocreport.itext.extension -->
    <dependency>
      <groupId>fr.opensagres.xdocreport</groupId>
      <artifactId>fr.opensagres.xdocreport.itext.extension</artifactId>
      <version>1.0.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.poi/ooxml-schemas -->
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>ooxml-schemas</artifactId>
      <version>1.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.core -->
    <dependency>
      <groupId>fr.opensagres.xdocreport</groupId>
      <artifactId>org.apache.poi.xwpf.converter.core</artifactId>
      <version>1.0.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/org.apache.poi.xwpf.converter.pdf -->
    <dependency>
      <groupId>fr.opensagres.xdocreport</groupId>
      <artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
      <version>1.0.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-api -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.25</version>
    </dependency>

  </dependencies>

核心代码

/**
     * Loads a Word document, substitutes the given keywords, and writes the result as PDF.
     *
     * <p>This method does not itself close either stream.
     *
     * @param source  input stream of the Word document; must be a .docx file
     *                (a .doc file can be saved as .docx first)
     * @param target  output stream that receives the PDF
     * @param options PDF conversion options, e.g.
     *                {@code PdfOptions.create().fontEncoding("windows-1250")}
     * @param params  keyword-to-replacement map; when {@code null} or empty the
     *                document is converted without any substitution
     * @throws Exception propagated from reading the document or from the PDF converter
     */
    public static void wordConverterToPdf(InputStream source, OutputStream target, PdfOptions options, Map<String, String> params) throws Exception {
        XWPFDocument doc = new XWPFDocument(source);
        // Skip the substitution pass when there is nothing to substitute.
        if (params != null && !params.isEmpty()) {
            replace(doc, params);
        }
        PdfConverter.getInstance().convert(doc, target, options);
    }

    /**
     * Applies every keyword substitution in {@code params} to the document:
     * first to the paragraphs returned by {@code document.getParagraphs()},
     * then to the paragraphs of every cell of every table returned by
     * {@code document.getTables()}.
     *
     * @param document document to edit in place
     * @param params   keyword-to-replacement map
     */
    public static <V> void replace(XWPFDocument document, Map<String, V> params) {
        // Pass 1: paragraphs of the document body.
        for (XWPFParagraph bodyParagraph : document.getParagraphs()) {
            for (Map.Entry<String, V> substitution : params.entrySet()) {
                replace(bodyParagraph, substitution.getKey(), substitution.getValue());
            }
        }

        // Pass 2: paragraphs inside table cells.
        for (XWPFTable tbl : document.getTables()) {
            for (XWPFTableRow tblRow : tbl.getRows()) {
                for (XWPFTableCell tblCell : tblRow.getTableCells()) {
                    for (XWPFParagraph cellParagraph : tblCell.getParagraphs()) {
                        for (Map.Entry<String, V> substitution : params.entrySet()) {
                            replace(cellParagraph, substitution.getKey(), substitution.getValue());
                        }
                    }
                }
            }
        }
    }

    /**
     * Replaces a single keyword throughout the document, covering the same
     * places as the map-based overload: the paragraphs returned by
     * {@code document.getParagraphs()} and the paragraphs of every table cell.
     *
     * @param document    document to edit in place
     * @param searchText  keyword to look for
     * @param replacement value whose string form replaces the keyword
     */
    public static <V> void replace(XWPFDocument document, String searchText, V replacement) {
        for (XWPFParagraph paragraph : document.getParagraphs()) {
            replace(paragraph, searchText, replacement);
        }
        // Table cells hold their own paragraphs, which are visited separately
        // from the document-level paragraphs above.
        for (XWPFTable table : document.getTables()) {
            for (XWPFTableRow row : table.getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    for (XWPFParagraph paragraph : cell.getParagraphs()) {
                        replace(paragraph, searchText, replacement);
                    }
                }
            }
        }
    }

    /**
     * Runs one keyword substitution on the paragraph for each entry of
     * {@code map}, in the map's iteration order.
     *
     * @param paragraph paragraph to edit in place
     * @param map       keyword-to-replacement map
     */
    private static <V> void replace(XWPFParagraph paragraph, Map<String, V> map) {
        for (Map.Entry<String, V> substitution : map.entrySet()) {
            String keyword = substitution.getKey();
            V value = substitution.getValue();
            replace(paragraph, keyword, value);
        }
    }

    /**
     * Replaces every occurrence of {@code searchText} in a paragraph, including
     * occurrences that are split across several runs, while leaving the runs
     * themselves (and therefore their formatting) in place.
     *
     * <p>Offsets are computed over the concatenation of the first text element of
     * each run, so the position used for searching always agrees with the position
     * used for editing. NOTE(review): runs that carry more than one text element
     * only have their first element inspected - confirm this is acceptable for the
     * templates in use.
     *
     * @param paragraph   paragraph to edit; a {@code null} paragraph is ignored
     * @param searchText  keyword to look for; {@code null} or empty means no-op
     * @param replacement value whose {@code toString()} replaces the keyword;
     *                    {@code null} is treated as the empty string
     */
    public static <V> void replace(XWPFParagraph paragraph, String searchText, V replacement) {
        if (paragraph == null || searchText == null || searchText.length() == 0) {
            // An empty keyword matches at every offset and would never terminate.
            return;
        }
        String newText = replacement == null ? "" : replacement.toString();
        // Searching resumes after the text just inserted, so a replacement that
        // itself contains the keyword cannot cause an endless loop.
        int searchFrom = 0;
        while (true) {
            StringBuilder joined = new StringBuilder();
            for (XWPFRun run : paragraph.getRuns()) {
                String text = run.getText(0);
                if (text != null) {
                    joined.append(text);
                }
            }
            int firstPos = joined.indexOf(searchText, searchFrom);
            if (firstPos < 0) {
                return;
            }
            int lastPos = firstPos + searchText.length();

            // The first and last runs touched by the keyword are trimmed;
            // runs lying entirely inside the keyword are blanked.
            int pos = 0;
            boolean found = false;
            for (XWPFRun run : paragraph.getRuns()) {
                // The int argument of getText/setText is the index of the text
                // element inside the run, not a character offset; 0 is the first one.
                String runText = run.getText(0);
                if (runText == null) {
                    continue;
                }
                int runStart = pos;
                int runEnd = runStart + runText.length();
                pos = runEnd;
                if (!found) {
                    if (runEnd <= firstPos) {
                        continue;
                    }
                    String head = runText.substring(0, firstPos - runStart);
                    // Keep whatever follows the keyword when it ends inside this same run.
                    String tail = runEnd > lastPos ? runText.substring(lastPos - runStart) : "";
                    run.setText(head + newText + tail, 0);
                    found = true;
                } else if (runEnd <= lastPos) {
                    // Blank the run instead of removing it, so the run list is not
                    // modified while iterating (the original author reported that
                    // paragraph.removeRun(i) had no effect here).
                    run.setText("", 0);
                } else {
                    run.setText(runText.substring(lastPos - runStart), 0);
                }
                if (runEnd >= lastPos) {
                    break;
                }
            }
            searchFrom = firstPos + newText.length();
        }
    }

    /**
     * Debug helper: maps each character offset in the paragraph to the run that
     * holds that character, prints the map to standard output, and returns it.
     * Offsets count only the first text element of each run; runs without text
     * are skipped.
     *
     * @param paragraph paragraph whose runs are indexed
     * @return map from character offset to the run containing that character
     */
    private static Map<Integer, XWPFRun> getPosToRuns(XWPFParagraph paragraph) {
        int pos = 0;
        Map<Integer, XWPFRun> map = new HashMap<Integer, XWPFRun>(10);
        for (XWPFRun run : paragraph.getRuns()) {
            // Index 0 selects the run's first text element; getTextPosition() is the
            // vertical offset of the text and must not be used as an index here.
            String runText = run.getText(0);
            if (runText != null) {
                for (int i = 0; i < runText.length(); i++) {
                    map.put(pos + i, run);
                }
                pos += runText.length();
            }
        }
        System.out.println(map);
        return map;
    }

运行函数

    /**
     * Demo entry point: reads test.docx under the configured directory, replaces
     * four sample keywords, and writes the result to test.pdf.
     *
     * @param args unused
     * @throws Exception propagated from file access or from the conversion
     */
    public static void main(String[] args) throws Exception {
        final String url = "D:/";
        String filePath = url + "/test.docx";
        String outPath = url + "test.pdf";
        Map<String, String> params = new HashMap<String, String>();
        params.put("替换区1", "测试替换变量1a");
        params.put("替换区2", "测试替换变量2a");
        params.put("替换区3", "测试替换变量3a");
        params.put("替换区4", "测试替换变量4a");

        PdfOptions options = PdfOptions.create();
        // Close both streams even when the conversion throws, so the file handles
        // are released and the PDF output is flushed to disk.
        InputStream source = new FileInputStream(filePath);
        try {
            OutputStream target = new FileOutputStream(outPath);
            try {
                wordConverterToPdf(source, target, options, params);
            } finally {
                target.close();
            }
        } finally {
            source.close();
        }
    }

参考

https://github.com/comradeWong/WordToPDFDemo.git

替换Apache POI XWPF中的文本-Java 学习之路 (javaroad.cn)

标签：Java,word,pos,runText,paragraph,params,pdf,run,替换
From： https://www.cnblogs.com/yiyuzi/p/16714860.html

Java：word替换关键词和转pdf

关键代码

依赖

核心代码

运行函数

参考

相关文章

赞助商

阅读排行