要比较 Word 文档的内容,需要先读取文档中的文本,这里使用 Apache POI 库;文本内容的比较则可以使用 Apache 的 commons-text 库。
引入依赖
<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>4.1.1</version>
</dependency>
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>4.1.1</version>
</dependency>
<dependency><groupId>org.apache.commons</groupId><artifactId>commons-text</artifactId><version>1.11.0</version>
</dependency>
这边要注意下你使用的commons-text的版本,它的api有很大的调整,我使用的版本为1.11.0
实现输出新增和删除内容
你可以使用 StringsComparator 类来实现文本内容的比较。它采用了访问者模式:StringsComparator 负责计算出哪些字符被保留、哪些被新增、哪些被删除,而由你提供访问者(CommandVisitor)来实现想要的效果,比如下面这个例子就是输出新增和删除的内容。
效果:
import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.EditScript;
import org.apache.commons.text.diff.StringsComparator;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.jupiter.api.Test;import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;public class DocTest {@Testpublic void testCompare() {try {// 读取word文档XWPFDocument doc1 = new XWPFDocument(new FileInputStream("D:\\doc\\1.docx"));XWPFDocument doc2 = new XWPFDocument(new FileInputStream("D:\\doc\\2.docx"));// 获取文档文本内容XWPFWordExtractor extractor1 = new XWPFWordExtractor(doc1);String content1 = extractor1.getText();XWPFWordExtractor extractor2 = new XWPFWordExtractor(doc2);String content2 = extractor2.getText();// 关闭流doc1.close();doc2.close();// commons-text api有很大调整,请注意你使用的版本,我使用的版本为1.11.0StringsComparator comparator = new StringsComparator(content1, content2);EditScript<Character> script = comparator.getScript();ChangedCommandVisitor commandVisitor = new ChangedCommandVisitor();script.visit(commandVisitor);commandVisitor.finish();List<ChangedWords> changedWordsList = commandVisitor.getChangedWordsList();System.out.println("******变更内容******");for (int i = 0; i < changedWordsList.size(); i++) {ChangedWords changedWords = changedWordsList.get(i);String operator = changedWords.getType() == 0 ? 
"新增" : "删除";System.out.println("#" + (i + 1) + operator + ": " + changedWords.getWords());}} catch (Exception e) {e.printStackTrace();}}@Data@AllArgsConstructorstatic class ChangedWords {private String words;private int type;//0:insert,1:delete}// 获取变化内容static class ChangedCommandVisitor implements CommandVisitor<Character> {private List<ChangedWords> changedWordsList = new ArrayList<>();private StringBuilder temp = new StringBuilder();private int lastTag = 0; //0:keep,1:insert,2:delete@Overridepublic void visitDeleteCommand(Character object) {if (lastTag == 1) {changedWordsList.add(new ChangedWords(temp.toString(), 0));temp.setLength(0);}lastTag = 2;temp.append(object);}@Overridepublic void visitInsertCommand(Character object) {if (lastTag == 2) {changedWordsList.add(new ChangedWords(temp.toString(), 1));temp.setLength(0);}lastTag = 1;temp.append(object);}@Overridepublic void visitKeepCommand(Character object) {finish();}public void finish() {if (lastTag == 1) {changedWordsList.add(new ChangedWords(temp.toString(), 0));temp.setLength(0);} else if (lastTag == 2) {changedWordsList.add(new ChangedWords(temp.toString(), 1));temp.setLength(0);}lastTag = 0;}public List<ChangedWords> getChangedWordsList() {return changedWordsList;}}
}
实现在源文本上标记修改
输出的内容是html,可直接在网页里面显示,自己加点样式就可以实现不同的显示效果
效果:
package com.wkt.server;import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.EditScript;
import org.apache.commons.text.diff.StringsComparator;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.jupiter.api.Test;import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;public class DocTest {@Testpublic void testCompare() {try {// 读取word文档XWPFDocument doc1 = new XWPFDocument(new FileInputStream("D:\\doc\\1.docx"));XWPFDocument doc2 = new XWPFDocument(new FileInputStream("D:\\doc\\2.docx"));// 获取文档文本内容XWPFWordExtractor extractor1 = new XWPFWordExtractor(doc1);String content1 = extractor1.getText();XWPFWordExtractor extractor2 = new XWPFWordExtractor(doc2);String content2 = extractor2.getText();// 关闭流doc1.close();doc2.close();// commons-text api有很大调整,请注意你使用的版本,我使用的版本为1.11.0StringsComparator comparator = new StringsComparator(content1, content2);EditScript<Character> script = comparator.getScript();TextChangedCommandVisitor commandVisitor = new TextChangedCommandVisitor();script.visit(commandVisitor);commandVisitor.finish();System.out.println(commandVisitor.getContent());} catch (Exception e) {e.printStackTrace();}}// 源文本上显示变化内容static class TextChangedCommandVisitor implements CommandVisitor<Character> {private StringBuilder content = new StringBuilder();private int lastTag = 0; //0:keep,1:insert,2:deleteprivate String insertStart = "<em>";private String insertEnd = "</em>";private String deleteStart = "<del>";private String deleteEnd = "</del>";@Overridepublic void visitDeleteCommand(Character object) {if (lastTag == 1) {content.append(insertEnd);content.append(deleteStart);} else if (lastTag == 0) {content.append(deleteStart);}content.append(object);lastTag = 2;}@Overridepublic void visitInsertCommand(Character object) {if (lastTag == 2) {content.append(deleteEnd);content.append(insertStart);} else if (lastTag == 0) {content.append(insertStart);}content.append(object);lastTag = 1;}@Overridepublic void visitKeepCommand(Character object) {finish();content.append(object);}public void finish() {if (lastTag == 1) {content.append(insertEnd);} else if (lastTag == 2) {content.append(deleteEnd);}lastTag = 0;}public StringBuilder getContent() {return content;}}
}