|
@@ -2,6 +2,8 @@ package com.qizhen.healsphere;
|
|
|
|
|
|
import com.qizhen.healsphere.util.FileCommonUtils;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
+import org.apache.poi.hwpf.HWPFDocument;
|
|
|
+import org.apache.poi.hwpf.usermodel.Range;
|
|
|
import org.apache.poi.xwpf.usermodel.*;
|
|
|
|
|
|
import java.io.*;
|
|
@@ -13,8 +15,67 @@ import java.util.regex.Pattern;
|
|
|
public class WordSplitter {
|
|
|
|
|
|
// Chapter-title regex: at the start of the input, "第" + one or more Chinese or ASCII
// numerals + "章", then whitespace, then the rest of the line (one or more non-newline chars).
private static final String ZHANG_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+章\\s+[^\\n]+";
|
|
|
- public void splitWordDocument(String inputFilePath, String outputDirectory,String pattern) throws IOException {
|
|
|
- System.out.println(inputFilePath+"==="+outputDirectory+"==="+pattern);
|
|
|
+
|
|
|
+ public void splitWordDocument(String inputFilePath, String outputDirectory, String pattern) throws IOException {
|
|
|
+ System.out.println(inputFilePath + "===" + outputDirectory + "===" + pattern);
|
|
|
+ if (inputFilePath.endsWith(".doc")) {
|
|
|
+ processDocFile(inputFilePath, outputDirectory, pattern);
|
|
|
+ } else if (inputFilePath.endsWith(".docx")) {
|
|
|
+ processDocxFile(inputFilePath, outputDirectory, pattern);
|
|
|
+ } else {
|
|
|
+ throw new IllegalArgumentException("Unsupported file format: " + inputFilePath);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void processDocFile(String inputFilePath, String outputDirectory, String pattern) throws IOException {
|
|
|
+ HWPFDocument document = new HWPFDocument(new FileInputStream(inputFilePath));
|
|
|
+ Range range = document.getRange();
|
|
|
+ String text = range.text();
|
|
|
+
|
|
|
+ StringBuilder currentChapter = new StringBuilder();
|
|
|
+ String currentChapterTitle = null;
|
|
|
+ Pattern compile = Pattern.compile(pattern);
|
|
|
+ String[] paragraphs = text.split("\n");
|
|
|
+ for (String paragraph : paragraphs) {
|
|
|
+ Matcher matcher = compile.matcher(paragraph);
|
|
|
+ boolean isAppendTitle = false;
|
|
|
+ if (matcher.find()) {
|
|
|
+ String tempTitle = matcher.group();
|
|
|
+ // 标题一样的话,不保存
|
|
|
+ if (currentChapterTitle == null) {
|
|
|
+ currentChapterTitle = tempTitle;
|
|
|
+ isAppendTitle = true;
|
|
|
+ } else if (currentChapterTitle != null && !trim(currentChapterTitle).equals(trim(tempTitle))) {
|
|
|
+ save(currentChapterTitle, currentChapter.toString(), outputDirectory);
|
|
|
+ currentChapter.setLength(0); // Clear the StringBuilder
|
|
|
+ currentChapterTitle = tempTitle;
|
|
|
+ isAppendTitle = true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ String trimText = trim(paragraph);
|
|
|
+ if (StringUtils.isNotEmpty(paragraph) && (isAppendTitle || !trim(currentChapterTitle).equals(trimText))
|
|
|
+ && !"本章数字资源".equals(trimText)) {
|
|
|
+ currentChapter.append(paragraph).append("\n");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if(StringUtils.isBlank(currentChapterTitle)){
|
|
|
+ currentChapterTitle = getFileName(inputFilePath);
|
|
|
+ }
|
|
|
+ save(currentChapterTitle, currentChapter.toString(), outputDirectory);
|
|
|
+ document.close();
|
|
|
+ }
|
|
|
+
|
|
|
+ private static String getFileName(String inputFilePath) {
|
|
|
+ File inputFile = new File(inputFilePath);
|
|
|
+ String fileNameWithoutExtension = inputFile.getName();
|
|
|
+ int lastDotIndex = fileNameWithoutExtension.lastIndexOf('.');
|
|
|
+ if (lastDotIndex > 0) {
|
|
|
+ return fileNameWithoutExtension.substring(0, lastDotIndex);
|
|
|
+ }
|
|
|
+ return fileNameWithoutExtension;
|
|
|
+ }
|
|
|
+
|
|
|
+ private void processDocxFile(String inputFilePath, String outputDirectory, String pattern) throws IOException {
|
|
|
XWPFDocument document = new XWPFDocument(new FileInputStream(inputFilePath));
|
|
|
StringBuilder currentChapter = new StringBuilder();
|
|
|
String currentChapterTitle = null;
|
|
@@ -43,16 +104,16 @@ public class WordSplitter {
|
|
|
&& !"本章数字资源".equals(trimText)) {
|
|
|
currentChapter.append(text).append("\n");
|
|
|
}
|
|
|
-
|
|
|
- } /*else if (element instanceof XWPFTable) {
|
|
|
- // Handle tables if necessary
|
|
|
- currentChapter.append(element.toString()).append("\n");
|
|
|
- }*/
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if(StringUtils.isBlank(currentChapterTitle)){
|
|
|
+ currentChapterTitle = getFileName(inputFilePath);
|
|
|
}
|
|
|
save(currentChapterTitle, currentChapter.toString(), outputDirectory);
|
|
|
document.close();
|
|
|
}
|
|
|
|
|
|
+
|
|
|
private void save(String chapterTitle, String chapterContents, String outputDirectory) throws IOException {
|
|
|
if (StringUtils.isEmpty(chapterTitle)) {
|
|
|
return;
|
|
@@ -78,6 +139,9 @@ public class WordSplitter {
|
|
|
}
|
|
|
|
|
|
private String trim(String text) {
|
|
|
+ if (StringUtils.isEmpty(text)) {
|
|
|
+ return "";
|
|
|
+ }
|
|
|
return text.replaceAll("\\s+", "");
|
|
|
}
|
|
|
|
|
@@ -114,11 +178,68 @@ public class WordSplitter {
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
WordSplitter wordSplitter1 = new WordSplitter();
|
|
|
- String inputFilePath = "E:\\project\\vscode\\急诊与灾难医学(第4版).docx";
|
|
|
- String outputDirectory = "E:\\project\\vscode\\《急诊与灾难医学(第4版)》\\";
|
|
|
+ String rootDirectory = "E:\\急诊科资料\\中华医学期刊数据库"; // 指定根目录
|
|
|
+
|
|
|
+ // 递归查找所有 .doc 和 .docx 文件
|
|
|
+ File rootDir = new File(rootDirectory);
|
|
|
+ if (rootDir.exists() && rootDir.isDirectory()) {
|
|
|
+ findAndProcessFiles(rootDir, wordSplitter1);
|
|
|
+ } else {
|
|
|
+ System.out.println("指定的根目录不存在或不是一个目录: " + rootDirectory);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void findAndProcessFiles(File directory, WordSplitter wordSplitter) {
|
|
|
+ File[] files = directory.listFiles();
|
|
|
+ if (files != null) {
|
|
|
+ for (File file : files) {
|
|
|
+ if (file.isDirectory()) {
|
|
|
+ // 递归处理子目录
|
|
|
+ findAndProcessFiles(file, wordSplitter);
|
|
|
+ } else if (file.isFile() && (file.getName().endsWith(".doc") || file.getName().endsWith(".docx"))) {
|
|
|
+ // 处理 .doc 和 .docx 文件
|
|
|
+ String inputFilePath = file.getAbsolutePath();
|
|
|
+ try {
|
|
|
+ // 获取文件的上一级目录
|
|
|
+ File parentDir = file.getParentFile().getParentFile();
|
|
|
+ if (parentDir == null) {
|
|
|
+ throw new IOException("无法获取文件的上一级目录: " + inputFilePath);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 在上一级目录下创建 trunk 目录
|
|
|
+ File trunkDir = new File(parentDir, "trunk");
|
|
|
+ /* if (trunkDir.exists()) {
|
|
|
+ throw new IOException("目录已存在,无法创建: " + trunkDir.getAbsolutePath());
|
|
|
+ }*/
|
|
|
+
|
|
|
+ // 创建 trunk 目录
|
|
|
+ trunkDir.mkdirs();
|
|
|
+ /* boolean isCreated = trunkDir.mkdirs();
|
|
|
+ if (!isCreated) {
|
|
|
+ throw new IOException("目录创建失败: " + trunkDir.getAbsolutePath());
|
|
|
+ }*/
|
|
|
+
|
|
|
+ // 设置 outputDirectory 为 trunk 目录
|
|
|
+ String outputDirectory = trunkDir.getAbsolutePath() + File.separator;
|
|
|
+
|
|
|
+ // 调用 splitWordDocument 方法处理文件
|
|
|
+ wordSplitter.splitWordDocument(inputFilePath, outputDirectory, ZHANG_PATTERN);
|
|
|
+ } catch (IOException e) {
|
|
|
+ System.err.println("处理文件时发生错误: " + inputFilePath);
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void main1(String[] args) {
|
|
|
+ WordSplitter wordSplitter1 = new WordSplitter();
|
|
|
+ String inputFilePath = "E:\\project\\vscode\\急诊医学(第2版)\\09. 急诊医学(第2版)_701-718.docx";
|
|
|
+ String outputDirectory = "E:\\project\\vscode\\《急诊医学(第2版)》\\";
|
|
|
try {
|
|
|
//removeHeadersAndFooters(inputFilePath, outputFilePath);
|
|
|
- wordSplitter1.splitWordDocument(inputFilePath, outputDirectory,ZHANG_PATTERN);
|
|
|
+ wordSplitter1.splitWordDocument(inputFilePath, outputDirectory, ZHANG_PATTERN);
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
}
|