关键词匹配实体类:
/**
 * Keyword-matching configuration used to locate a span of paragraphs in a document.
 *
 * <p>Accessors, {@code equals}/{@code hashCode} and {@code toString} are generated by Lombok.
 */
@Data
@EqualsAndHashCode(callSuper = false)
public class TextConfig implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * Start keyword(s); documented as comma-separated when there are several.
     * NOTE(review): WordResolve.findBetweenIndex in this file matches this value as one
     * literal substring and does not split it on commas - confirm the intended contract.
     */
    private String textStart;

    /**
     * End keyword(s); documented as comma-separated when there are several.
     * NOTE(review): matched as one literal substring by WordResolve.findBetweenIndex - confirm.
     */
    private String textEnd;

    /** Keywords that must be present; comma-separated when there are several. */
    private String textInclude;

    /** Keywords that must be absent; comma-separated when there are several. */
    private String textExclude;
}
import com.aspose.words.*;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.*;@EqualsAndHashCode(callSuper = false)
@Slf4j
@Data
public class WordResolve extends Document {/*** 查找文本类型另存为word* @param filePathName 文件保存路径* @param copyFirst 是否复制关键词开始节点* @param copyLast 是否复制关键词结束节点*/@SneakyThrowspublic File findBetweenFile(TextConfig textConfig, String filePathName, boolean copyFirst, boolean copyLast) {List<Paragraph> paragraphs = getAllParagraph();Integer[] sec = findBetweenIndex(textConfig,getAllText());if(sec == null){return null;}Document doc = new Document();Body body = doc.getFirstSection().getBody();body.removeAllChildren();NodeImporter importer = new NodeImporter(this, doc, ImportFormatMode.KEEP_SOURCE_FORMATTING);Paragraph first = paragraphs.get(sec[0]);Paragraph last = paragraphs.get(sec[1]);List<CompositeNode<?>> parentNodes = new ArrayList<>(20);boolean startCopying = false;//读取文档的所有节点NodeCollection<?> allNodeList = this.getChildNodes(NodeType.ANY, true);for (int i = 0, j = allNodeList.getCount(); i < j; i++) {Node node = allNodeList.get(i);try{if (node == first) {startCopying = true;if(!copyFirst){continue;}}if (node == last && !copyLast) {// 到达结束节点后停止复制break;}if (startCopying) {boolean append = true;for (CompositeNode<?> parentNode : parentNodes) {NodeCollection<?> childNodes = parentNode.getChildNodes(node.getNodeType(), true);if(childNodes.contains(node)){append = false;break;}}if(append){try{body.appendChild(importer.importNode(node, true));}catch (Exception e){log.error("插入节点出错:{}",e.getMessage());//ignore}}}if (node == last) {// 到达结束节点后停止复制break;}}finally {if(startCopying && node.isComposite()){CompositeNode<?> compositeNode = (CompositeNode<?>) node;if(!parentNodes.contains(compositeNode)){parentNodes.add(compositeNode);}}}}File file = FileUtils.getFile(filePathName);doc.save(filePathName);return file;}/*** 查找文本类型解析规则的开始结束段落索引* @param strings word全部段落,每个段落的文本*/public Integer[] findBetweenIndex(TextConfig textConfig, List<String> strings){String textStart = textConfig.getTextStart();String textEnd = textConfig.getTextEnd();//规定开始关键词必须包含哪些文字,多个逗号分隔(作为附加判断,可为空)textInclude = 
StringUtil.defaultString(textConfig.getTextInclude(), "").replace(",", "");//规定结束关键词必须不包含哪些文字,多个逗号分隔(作为附加判断,可为空)textExclude = StringUtil.defaultString(textConfig.getTextExclude(), "").replace(",", "");String[] in = StringUtil.isBlank(textInclude) ? null : textInclude.split(",");String[] out = StringUtil.isBlank(textExclude) ? null : textExclude.split(",");//满足开始位置和结束位置的全部关键词索引List<Integer> startArr = new ArrayList<>();List<Integer> endArr = new ArrayList<>();for (int i = 0; i < strings.size(); i++) {String text = strings.get(i);if (text.contains(textStart)) {startArr.add(i);}if (text.contains(textEnd)) {endArr.add(i);}}//进行包含和非包含的判断过滤if(!startArr.isEmpty() && !endArr.isEmpty()){for (Integer start : startArr) {for (Integer end : endArr) {//中间至少隔了一个段落if(start + 1 < end){StringJoiner jo = new StringJoiner("\n");for (int i = start + 1; i < end; i++) {jo.add(strings.get(i));}String word = jo.toString();boolean match = true;if(in != null){for (String s : in) {if(!word.contains(s)){match = false;break;}}}if(match && out != null){for (String s : out) {if(word.contains(s)){match = false;break;}}}if(match){return new Integer[]{start,end};}}}}}return null;}/*** 拿到文档全部段落文本*/public List<String> getAllText(){List<String> strings = new ArrayList<>();getAllParagraph().forEach(a-> strings.add(StringTool.safeToString(a.getText(), "")));return strings;}/*** 拿到文档全部段落*/public List<Paragraph> getAllParagraph(){if(allParagraph == null){allParagraph = findNodeByType(NodeType.PARAGRAPH, Paragraph.class);}return allParagraph;}
}
使用方式:
word文档内容如下:
要截取“标题三”和“标题四”之间的内容,并生成新的 Word 文档,代码如下:
public class Test {public static void main(String[] args) throws Exception{//验证license//new AsposeLicense().validate();//读取word文件WordResolve word = new WordResolve(new File("C:\\Users\\zhou\\Desktop\\测试.docx"));TextConfig detail = new TextConfig();detail.setTextStart("poi导出大数据量问题、写入速度慢");detail.setTextEnd("国密验签失败");File betweenFile = word.findBetweenFile(detail, "C:\\Users\\zhou\\Desktop\\betweenFile.docx", false, false);System.out.println(betweenFile);}}
截取保存的文件如下: