2 tuần trước cách đây · 654c125ce5
--- a/pom.xml
+++ b/pom.xml
@@ -215,6 +215,11 @@
 
				             <artifactId>commons-collections4</artifactId>
			
 
				             <version>4.4</version>
			
 
				         </dependency>
			
 
				+        <dependency>
			
 
				+            <groupId>org.apache.poi</groupId>
			
 
				+            <artifactId>poi-scratchpad</artifactId>
			
 
				+            <version>5.2.3</version>
			
 
				+        </dependency>
			
 
				 
			
 
				     </dependencies>
			
 
				     <dependencyManagement>
			
--- a/src/main/java/com/qizhen/healsphere/TxtSplitter.java
+++ b/src/main/java/com/qizhen/healsphere/TxtSplitter.java
@@ -14,7 +14,8 @@ import java.util.regex.Pattern;
 
				 
			
 
				 public class TxtSplitter {
			
 
				     private static final int max_leng = 500;
			
 
				-    private static final String JIE_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+节\\s+[^\n]+";
			
 
				+    private static final String ZHANG_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+章\\s*[^\\n]+";
			
 
				+    private static final String JIE_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+节\\s*\\|?\\s*[^\n]+";
			
 
				     private static final String TITLE1_PATTERN = "^[零一二三四五六七八九十百千万]+\\s*、\\s*(.*)";
			
 
				     private static final String TITLE2_PATTERN = "^\\（([零一二三四五六七八九十百千万]+)\\）(.*)";
			
 
				     Pattern pageNumberCompile = Pattern.compile("^\\s*(\\.\\s+)*[0-9]+(\\s+\\.\\s*)?\\s*$");
			
@@ -43,6 +44,8 @@ public class TxtSplitter {
 
				         try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
			
 
				             String line;
			
 
				             while ((line = reader.readLine()) != null) {
			
 
				+                line = trim( line);
			
 
				+
			
 
				                 if (StringUtils.isNotEmpty(line) &&  pageNumberCompile.matcher(line).find()) {
			
 
				                     continue;
			
 
				                 }
			
@@ -71,11 +74,13 @@ public class TxtSplitter {
 
				         File outputFile = new File(outputDirectory, fileName);
			
 
				 
			
 
				         try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
			
 
				-            String s = inputFile.getParent().replace("E:\\project\\vscode\\", "").replaceAll("\\\\", " ，");
			
 
				+            String s = inputFile.getParent().replace(inputDirectoryPath, "").replaceAll("\\\\", "，");
			
 
				             if(StringUtils.isNotEmpty(title)){
			
 
				-                writer.write("本切片内容来自:"+s+"，"+title+"\n");
			
 
				+                //writer.write("本切片内容来自:"+s+"，"+title+"\n");
			
 
				+                writer.write("本切片内容来自:"+s+"，"+title+"。");
			
 
				             }else{
			
 
				-                writer.write("本切片内容来自:"+s+"\n");
			
 
				+                //writer.write("本切片内容来自:"+s+"\n");
			
 
				+                writer.write("本切片内容来自:"+s+"。");
			
 
				             }
			
 
				             writer.write(merge.toString());
			
 
				 
			
@@ -116,16 +121,18 @@ public class TxtSplitter {
 
				     }
			
 
				 
			
 
				     private String trim(String text) {
				+        // Removes ALL whitespace, including whitespace inside the text (not only at the ends). A null argument throws NullPointerException.
				         return text.replaceAll("\\s+", "");
				     }
			
 
				 
			
 
				     public static void main(String[] args) {
				+        // Passes run in order: "第N节" headings, then "一、" headings, then "（一）" headings; only the last uses split=true. The chapter pass is disabled: split(false,ZHANG_PATTERN);
				         split(false,JIE_PATTERN);
				         split(false,TITLE1_PATTERN);
				         split(true,TITLE2_PATTERN);
				     }
			
 
				+    public static final  String inputDirectoryPath = "E:\\打标资料\\"; // 修改为目录路径
			
 
				     private static void split(boolean split,String pattern) {
			
 
				-        String inputDirectoryPath = "E:\\project\\vscode\\《急诊与灾难医学（第4版）》\\"; // 修改为目录路径
			
 
				         //String inputDirectoryPath = "E:\\project\\vscode\\《急诊与灾难医学（第4版）》\\"; // 修改为目录路径
			
 
				         File inputDirectory = new File(inputDirectoryPath);
			
 
				         traverse(inputDirectory, split,pattern);
			
@@ -200,64 +207,112 @@ public class TxtSplitter {
 
				         try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
			
 
				             String line;
			
 
				             while ((line = reader.readLine()) != null) {
			
 
				+                line = trim(line);
			
 
				                 Matcher matcher = compile.matcher(line);
			
 
				                 if (matcher.find()) {
			
 
				-                    //找到了新的一段，且上一段内容超过了最大长度的1/4则保存，并清空
			
 
				+                    // 找到了新的一段，且上一段内容超过了最大长度的1/4则保存，并清空
			
 
				                     if (currentParagraph.length() > 0) {
			
 
				                         paragraphs.add(currentParagraph.toString());
			
 
				                         currentParagraph.setLength(0); // Clear the StringBuilder
			
 
				                     }
			
 
				                 }
			
 
				-                currentParagraph.append(line).append("\n");
			
 
				+                currentParagraph.append(line);
			
 
				+                currentParagraph.append("\n");
			
 
				             }
			
 
				             if (currentParagraph.length() > 0) {
			
 
				                 paragraphs.add(currentParagraph.toString());
			
 
				             }
			
 
				         }
			
 
				+
			
 
				+        // 对paragraphs中的每个段落进行处理，确保长度不超过max_leng
			
 
				+        List<String> processedParagraphs = new ArrayList<>();
			
 
				+        for (String paragraph : paragraphs) {
			
 
				+            if (paragraph.length() > max_leng) {
			
 
				+                // 如果段落长度超过max_leng，则分割并添加到processedParagraphs
			
 
				+                processedParagraphs.addAll(splitParagraph(paragraph, max_leng));
			
 
				+            } else {
			
 
				+                // 如果段落长度未超过max_leng，则直接添加
			
 
				+                processedParagraphs.add(paragraph);
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				         StringBuilder merge = new StringBuilder();
			
 
				         int i = 0;
			
 
				         String lastTitle = "";
			
 
				         String title = "";
			
 
				-        for (int j=0;j< paragraphs.size();j++) {
			
 
				-            String temp = paragraphs.get(j);
			
 
				+        for (int j = 0; j < processedParagraphs.size(); j++) {
			
 
				+            String temp = processedParagraphs.get(j);
			
 
				             Matcher matcher = compile.matcher(temp);
			
 
				 
			
 
				             boolean finded = matcher.find();
			
 
				-            if(finded){
			
 
				+            if (finded) {
			
 
				                 lastTitle = title;
			
 
				                 title = matcher.group();
			
 
				             }
			
 
				+
			
 
				+            // 如果merge+temp会超过max_leng，则先保存merge
			
 
				             if (merge.length() + temp.length() > max_leng) {
			
 
				-                //因为merge没有超过最大长度的1/3，说明是temp超过最大长度的2/3，所以切割temp
			
 
				-                //System.out.println(temp);
			
 
				-                List<String> sentences = SentenceSplitter.splitSentences(temp);
			
 
				-                for (String sentence : sentences) {
			
 
				-                    if ((merge.length() + sentence.length()) > max_leng) {
			
 
				-                        saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,title);
			
 
				-                        merge.append(sentence);
			
 
				-                    } else {
			
 
				-                        merge.append(sentence);
			
 
				-                    }
			
 
				-                }
			
 
				-                merge.append("\n");
			
 
				-            } else {
			
 
				-                boolean onlyTitle = finded && trim(temp).equals(title);
			
 
				-                //如果temp是标题，且merge+temp超过最大长度的1/3，则merge保存，再追加temp，以保证标题不会出现在切片的末尾
			
 
				-                if (onlyTitle && (merge.length() + temp.length()) > max_leng / 3) {
			
 
				-                    saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
			
 
				-                }
			
 
				-                if(temp.length()>merge.length()){
			
 
				-                    lastTitle = title;
			
 
				+
			
 
				+                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge, StringUtils.isNotEmpty(title) ? title : lastTitle);
			
 
				+                merge.setLength(0); // 清空merge
			
 
				+            }
			
 
				+
			
 
				+            // 添加当前段落到merge
			
 
				+            merge.append(temp);
			
 
				+
			
 
				+            // 如果merge接近max_leng的一半，考虑提前保存以保持均衡
			
 
				+            if (merge.length() > max_leng / 2 && !temp.equals(title)) {
			
 
				+
			
 
				+                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge, StringUtils.isNotEmpty(title) ? title : lastTitle);
			
 
				+                merge.setLength(0); // 清空merge
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+        // 最后一次保存剩余内容
			
 
				+        if (merge.length() > 0) {
			
 
				+
			
 
				+            saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge, StringUtils.isNotEmpty(title) ? title : lastTitle);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    // 优化后的辅助方法：将长段落按指定长度分割，基于句子进行处理
			
 
				+    private List<String> splitParagraph(String paragraph, int maxLength) {
			
 
				+        List<String> result = new ArrayList<>();
			
 
				+        List<String> sentences = SentenceSplitter.splitSentences(paragraph); // 将段落分割为句子
			
 
				+        StringBuilder currentSegment = new StringBuilder();
			
 
				+
			
 
				+        for (String sentence : sentences) {
			
 
				+            if (currentSegment.length() + sentence.length() > maxLength) {
			
 
				+                // 如果添加当前句子会超出最大长度，则保存当前段落并清空
			
 
				+                if (currentSegment.length() > 0) {
			
 
				+                    result.add(currentSegment.toString());
			
 
				                 }
			
 
				-                merge.append(temp);
			
 
				+                currentSegment.setLength(0);
			
 
				             }
			
 
				+            currentSegment.append(sentence);
			
 
				+        }
			
 
				+
			
 
				+        // 最后一次保存剩余内容
			
 
				+        if (currentSegment.length() > 0) {
			
 
				+            result.add(currentSegment.toString());
			
 
				+        }
			
 
				 
			
 
				-            //超过最大长度的1/3则直接保存或是最后的片段
			
 
				-            if (merge.length() > max_leng / 3) {
			
 
				-                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
			
 
				-            }else if(j==paragraphs.size()-1){
			
 
				-                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,StringUtils.isNotEmpty(title)?title:lastTitle);
			
 
				+        // 检查结果列表中的每个段落是否超过maxLength，若超过则强制分割
			
 
				+        List<String> finalResult = new ArrayList<>();
			
 
				+        for (String segment : result) {
			
 
				+            if (segment.length() > maxLength) {
			
 
				+                // 强制分割超长段落
			
 
				+                int start = 0;
			
 
				+                while (start < segment.length()) {
			
 
				+                    finalResult.add(segment.substring(start, Math.min(start + maxLength, segment.length())));
			
 
				+                    start += maxLength;
			
 
				+                }
			
 
				+            } else {
			
 
				+                finalResult.add(segment);
			
 
				             }
			
 
				         }
			
 
				+
			
 
				+        return finalResult;
			
 
				     }
			
 
				+
			
 
				 }
			
--- a/src/main/java/com/qizhen/healsphere/WordSplitter.java
+++ b/src/main/java/com/qizhen/healsphere/WordSplitter.java
@@ -2,6 +2,8 @@ package com.qizhen.healsphere;
 
				 
			
 
				 import com.qizhen.healsphere.util.FileCommonUtils;
			
 
				 import org.apache.commons.lang3.StringUtils;
			
 
				+import org.apache.poi.hwpf.HWPFDocument;
			
 
				+import org.apache.poi.hwpf.usermodel.Range;
			
 
				 import org.apache.poi.xwpf.usermodel.*;
			
 
				 
			
 
				 import java.io.*;
			
@@ -13,9 +15,23 @@ import java.util.regex.Pattern;
 
				 public class WordSplitter {
			
 
				 
			
 
				     private static final String ZHANG_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+章\\s+[^\\n]+";
			
 
				-    public void splitWordDocument(String inputFilePath, String outputDirectory,String pattern) throws IOException {
			
 
				-        System.out.println(inputFilePath+"==="+outputDirectory+"==="+pattern);
			
 
				-        XWPFDocument document = new XWPFDocument(new FileInputStream(inputFilePath));
			
 
				+
			
 
				+    public void splitWordDocument(String inputFilePath, String outputDirectory, String pattern) throws IOException {
			
 
				+        System.out.println(inputFilePath + "===" + outputDirectory + "===" + pattern);
			
 
				+        
			
 
				+        // 根据文件扩展名选择合适的处理方式
			
 
				+        if (inputFilePath.toLowerCase().endsWith(".docx")) {
			
 
				+            XWPFDocument document = new XWPFDocument(new FileInputStream(inputFilePath));
			
 
				+            processXWPFDocument(document, outputDirectory, pattern);
			
 
				+        } else if (inputFilePath.toLowerCase().endsWith(".doc")) {
			
 
				+            HWPFDocument document = new HWPFDocument(new FileInputStream(inputFilePath));
			
 
				+            processHWPFDocument(document, outputDirectory, pattern);
			
 
				+        } else {
			
 
				+            throw new IllegalArgumentException("Unsupported file format: " + inputFilePath);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    private void processXWPFDocument(XWPFDocument document, String outputDirectory, String pattern) throws IOException {
			
 
				         StringBuilder currentChapter = new StringBuilder();
			
 
				         String currentChapterTitle = null;
			
 
				         Pattern compile = Pattern.compile(pattern);
			
@@ -43,23 +59,53 @@ public class WordSplitter {
 
				                         && !"本章数字资源".equals(trimText)) {
			
 
				                     currentChapter.append(text).append("\n");
			
 
				                 }
			
 
				-
			
 
				-            } /*else if (element instanceof XWPFTable) {
			
 
				-                // Handle tables if necessary
			
 
				-                currentChapter.append(element.toString()).append("\n");
			
 
				-            }*/
			
 
				+            } 
			
 
				         }
			
 
				         save(currentChapterTitle, currentChapter.toString(), outputDirectory);
			
 
				         document.close();
			
 
				     }
			
 
				 
			
 
				+    /**
				+     * Splits a legacy {@code .doc} document into chapters and saves each chapter once the next heading is reached.
				+     * Paragraphs before the first heading are kept in the buffer and saved together with the first chapter.
				+     *
				+     * @param document        opened HWPF document; this method does not close it
				+     * @param outputDirectory directory passed to {@code save}
				+     * @param pattern         regular expression that recognises a chapter heading
				+     * @throws IOException if writing a chapter fails
				+     */
				+    private void processHWPFDocument(HWPFDocument document, String outputDirectory, String pattern) throws IOException {
				+        Range range = document.getRange();
				+        StringBuilder currentChapter = new StringBuilder();
				+        String currentChapterTitle = null;
				+        Pattern compile = Pattern.compile(pattern);
				+
				+        for (int i = 0; i < range.numParagraphs(); i++) {
				+            // HWPF returns paragraph text together with its terminator (\r, or \u0007 for table cells).
				+            // Strip it so empty paragraphs are really empty and titles carry no control characters.
				+            String text = range.getParagraph(i).text().replaceAll("[\\r\\u0007]+$", "");
				+            Matcher matcher = compile.matcher(text);
				+            boolean isAppendTitle = false;
				+            if (matcher.find()) {
				+                String tempTitle = matcher.group();
				+                if (currentChapterTitle == null) {
				+                    // First heading: nothing to flush yet, just remember the title.
				+                    currentChapterTitle = tempTitle;
				+                    isAppendTitle = true;
				+                } else if (!trim(currentChapterTitle).equals(trim(tempTitle))) {
				+                    // A different heading closes the current chapter; a repeat of the same heading does not.
				+                    save(currentChapterTitle, currentChapter.toString(), outputDirectory);
				+                    currentChapter.setLength(0);
				+                    currentChapterTitle = tempTitle;
				+                    isAppendTitle = true;
				+                }
				+            }
				+            String trimText = trim(text);
				+            // Skip empty paragraphs, repeats of the current title and the "本章数字资源" marker line.
				+            if (StringUtils.isNotEmpty(text) && (isAppendTitle || !trim(currentChapterTitle).equals(trimText))
				+                    && !"本章数字资源".equals(trimText)) {
				+                currentChapter.append(text).append("\n");
				+            }
				+        }
				+        save(currentChapterTitle, currentChapter.toString(), outputDirectory);
				+    }
			
 
				+
			
 
				     private void save(String chapterTitle, String chapterContents, String outputDirectory) throws IOException {
			
 
				         if (StringUtils.isEmpty(chapterTitle)) {
			
 
				-            return;
			
 
				+            File outputDir = new File(outputDirectory);
			
 
				+            chapterTitle = outputDir.getName();
			
 
				+            //return;
			
 
				         }
			
 
				         String fileName = FileCommonUtils.sanitizeFileName(chapterTitle);
			
 
				         String fileNameEXt = fileName + ".txt";
			
 
				-        outputDirectory += fileName + "\\";
			
 
				+        //outputDirectory += fileName + "\\";
			
 
				         File outputFile = new File(outputDirectory, fileNameEXt);
			
 
				         // 检查目录是否存在，如果不存在则创建目录
			
 
				         File directory = new File(outputDirectory);
			
@@ -78,6 +124,9 @@ public class WordSplitter {
 
				     }
			
 
				 
			
 
				     private String trim(String text) {
				+        if (StringUtils.isBlank(text)) { // blank input (StringUtils.isBlank) collapses to ""
				+            return "";
				+        }
				         return text.replaceAll("\\s+", ""); // removes every whitespace run, including whitespace inside the text
				     }
			
 
				 
			
@@ -114,13 +163,40 @@ public class WordSplitter {
 
				 
			
 
				     public static void main(String[] args) {
			
 
				         WordSplitter wordSplitter1 = new WordSplitter();
			
 
				-        String inputFilePath = "E:\\project\\vscode\\急诊与灾难医学（第4版）.docx";
			
 
				-        String outputDirectory = "E:\\project\\vscode\\《急诊与灾难医学（第4版）》\\";
			
 
				-        try {
			
 
				-            //removeHeadersAndFooters(inputFilePath, outputFilePath);
			
 
				-            wordSplitter1.splitWordDocument(inputFilePath, outputDirectory,ZHANG_PATTERN);
			
 
				-        } catch (IOException e) {
			
 
				-            e.printStackTrace();
			
 
				+        String baseDirectory = "E:\\打标资料\\"; // 指定要遍历的目录
			
 
				+        File directory = new File(baseDirectory);
			
 
				+
			
 
				+        if (directory.exists() && directory.isDirectory()) {
			
 
				+            processDirectory(directory, wordSplitter1);
			
 
				+        } else {
			
 
				+            System.out.println("指定的目录不存在或不是一个有效目录: " + baseDirectory);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    /**
				+     * Recursively walks {@code directory} and splits every {@code .docx}/{@code .doc} file it finds.
				+     * Output goes to a sub-directory beside the file, named after the file without its extension.
				+     * A file that cannot be processed is reported and skipped so the rest of the tree is still handled.
				+     *
				+     * @param directory    directory to scan
				+     * @param wordSplitter splitter used for each Word file
				+     */
				+    private static void processDirectory(File directory, WordSplitter wordSplitter) {
				+        File[] files = directory.listFiles();
				+        if (files == null) {
				+            // listFiles() returns null when the path is not a directory or cannot be read.
				+            return;
				+        }
				+
				+        for (File file : files) {
				+            if (file.isDirectory()) {
				+                // Recurse into sub-directories.
				+                processDirectory(file, wordSplitter);
				+                continue;
				+            }
				+            String lowerCaseName = file.getName().toLowerCase();
				+            boolean isWordFile = lowerCaseName.endsWith(".docx") || lowerCaseName.endsWith(".doc");
				+            // "~$..." files are the lock files Word keeps beside an open document, not real documents.
				+            if (!file.isFile() || !isWordFile || lowerCaseName.startsWith("~$")) {
				+                continue;
				+            }
				+
				+            String inputFilePath = file.getAbsolutePath();
				+            String fileNameWithoutExtension = file.getName().replaceFirst("[.][^.]+$", "");
				+            String outputDirectory = directory.getAbsolutePath() + "\\" + fileNameWithoutExtension + "\\";
				+
				+            try {
				+                wordSplitter.splitWordDocument(inputFilePath, outputDirectory, ZHANG_PATTERN);
				+                System.out.println("处理完成: " + inputFilePath);
				+            } catch (IOException | RuntimeException e) {
				+                // POI reports corrupt, encrypted or mislabelled files with unchecked exceptions;
				+                // catching them here keeps one bad file from aborting the whole traversal.
				+                System.err.println("处理文件时出错: " + inputFilePath);
				+                e.printStackTrace();
				+            }
				         }
				     }
			
 
				 }
			
--- a/src/main/java/com/qizhen/healsphere/zhinanSplitter.java
+++ b/src/main/java/com/qizhen/healsphere/zhinanSplitter.java
@@ -0,0 +1,292 @@
 
				+package com.qizhen.healsphere;
			
 
				+
			
 
				+import com.qizhen.healsphere.util.FileCommonUtils;
			
 
				+import com.qizhen.healsphere.util.SentenceSplitter;
			
 
				+import org.apache.commons.lang3.StringUtils;
			
 
				+
			
 
				+import java.io.*;
			
 
				+import java.util.ArrayList;
			
 
				+import java.util.List;
			
 
				+import java.util.regex.Matcher;
			
 
				+import java.util.regex.Pattern;
			
 
				+
			
 
				+public class zhinanSplitter {
			
 
				+    private static final int max_leng = 500; // length limit (in chars) that splitTxtFileByPattern compares merged text against
				+    private static final String ZHANG_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+章\\s*[^\\n]+"; // "第N章 ..." chapter heading; in this class only referenced from a commented-out call in main
				+    private static final String JIE_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+节\\s*\\|?\\s*[^\n]+"; // "第N节 ..." section heading, optionally with a "|" after 节
				+    private static final String TITLE1_PATTERN = "^[零一二三四五六七八九十百千万]+\\s*、\\s*(.*)"; // "一、..." heading: Chinese numeral followed by 、
				+    private static final String TITLE2_PATTERN = "^\\（([零一二三四五六七八九十百千万]+)\\）(.*)"; // "（一）..." heading: Chinese numeral in full-width parentheses
				+    Pattern pageNumberCompile = Pattern.compile("^\\s*(\\.\\s+)*[0-9]+(\\s+\\.\\s*)?\\s*$"); // line holding only digits, optionally flanked by dots; splitTxtFile skips such lines
			
 
				+    /**
				+     * Cuts a text file into chapters at lines matching {@code pattern} and hands each finished chapter to {@code save}.
				+     * Every line is normalised with {@code trim} first; lines matched by {@code pageNumberCompile} are dropped.
				+     * Text before the first heading stays in the buffer and is saved with the first chapter.
				+     *
				+     * @param inputFilePath   text file to read; a missing file is reported on stdout and ignored
				+     * @param outputDirectory directory passed to {@code save}; created if absent (returns silently if that fails)
				+     * @param pattern         regular expression identifying a chapter heading line
				+     * @throws IOException if reading the input or writing a chapter fails
				+     */
				+    public void splitTxtFile(String inputFilePath, String outputDirectory, String pattern) throws IOException {
				+        File inputFile = new File(inputFilePath);
				+        if (!inputFile.exists()) {
				+            System.out.println("输入文件不存在: " + inputFilePath);
				+            return;
				+        }
				+
				+        // Make sure the output directory exists; give up quietly when it cannot be created.
				+        File outputDir = new File(outputDirectory);
				+        if (!outputDir.exists() && !outputDir.mkdirs()) {
				+            return;
				+        }
				+
				+        StringBuilder chapterBody = new StringBuilder();
				+        String chapterTitle = null;
				+        Pattern headingPattern = Pattern.compile(pattern);
				+        try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
				+            for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
				+                String line = trim(raw);
				+                boolean hasText = StringUtils.isNotEmpty(line);
				+                if (hasText && pageNumberCompile.matcher(line).find()) {
				+                    continue;
				+                }
				+
				+                Matcher headingMatcher = headingPattern.matcher(line);
				+                if (headingMatcher.find()) {
				+                    String foundTitle = headingMatcher.group();
				+                    if (chapterTitle == null) {
				+                        // First heading: keep what was buffered so far and just record the title.
				+                        chapterTitle = foundTitle;
				+                    } else if (!trim(chapterTitle).equals(trim(foundTitle))) {
				+                        // A different heading closes the running chapter; an identical one is ignored.
				+                        save(chapterTitle, chapterBody.toString(), outputDirectory);
				+                        chapterBody.setLength(0);
				+                        chapterTitle = foundTitle;
				+                    }
				+                }
				+
				+                if (hasText) {
				+                    chapterBody.append(line).append("\n");
				+                }
				+            }
				+            // Flush the last chapter (save ignores the call when no heading was ever found).
				+            save(chapterTitle, chapterBody.toString(), outputDirectory);
				+        }
				+    }
			
 
				+    public static final String appId = "e1de7dfc-afdb-48e4-b0f8-4fcecea09643"; // NOTE(review): in this class only used by the commented-out QizhenAssistant call in saveTxt; if this id is a credential it should not live in source control
			
 
				+    /**
				+     * Writes the text collected in {@code merge} to one slice file and then clears the buffer.
				+     * The file name is {@code FileCommonUtils.getFileNameWithoutExtension(inputFilePath) + "_split_" + i + ".txt"}.
				+     * Each slice starts with a provenance sentence built from the input file's parent path (with the
				+     * {@code inputDirectoryPath} prefix removed and backslashes replaced by "，") plus the title, if any.
				+     * An empty buffer is skipped, so no slice holding only the provenance sentence is produced; callers
				+     * that advance their counter anyway will simply leave a gap in the numbering.
				+     *
				+     * @param inputFilePath   path of the file being split, used for the slice file name
				+     * @param outputDirectory directory the slice is written to
				+     * @param i               slice index used in the file name
				+     * @param inputFile       file being split, used for the provenance sentence
				+     * @param merge           buffer holding the slice text; emptied after a successful write
				+     * @param title           heading the slice belongs to; may be empty
				+     * @return {@code i}, unchanged
				+     * @throws IOException if the slice cannot be written
				+     */
				+    private static int saveTxt(String inputFilePath, String outputDirectory, int i, File inputFile, StringBuilder merge,String title) throws IOException {
				+        if (merge.length() == 0) {
				+            // Nothing collected (e.g. the caller flushed before appending an over-long sentence).
				+            return i;
				+        }
				+        String fileName = FileCommonUtils.getFileNameWithoutExtension(inputFilePath) + "_split_" + i + ".txt";
				+        File outputFile = new File(outputDirectory, fileName);
				+
				+        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
				+            String s = inputFile.getParent().replace(inputDirectoryPath, "").replaceAll("\\\\", "，");
				+            if(StringUtils.isNotEmpty(title)){
				+                writer.write("本切片内容来自:"+s+"，"+title+"。");
				+            }else{
				+                writer.write("本切片内容来自:"+s+"。");
				+            }
				+            writer.write(merge.toString());
				+            merge.setLength(0);
				+        }
				+        return i;
				+    }
			
 
				+
			
 
				+    /**
				+     * Writes one chapter to {@code <outputDirectory>\<name>\<name>.txt}, where {@code name} is the chapter
				+     * title passed through {@code FileCommonUtils.sanitizeFileName}. Nothing is written when the title is
				+     * empty or the chapter directory cannot be created.
				+     *
				+     * @param chapterTitle    heading of the chapter; also the source of the directory and file name
				+     * @param chapterContents full text of the chapter
				+     * @param outputDirectory parent directory; a trailing backslash is added when missing
				+     * @throws IOException if the chapter file cannot be written
				+     */
				+    private void save(String chapterTitle, String chapterContents, String outputDirectory) throws IOException {
				+        if (StringUtils.isEmpty(chapterTitle)) {
				+            return;
				+        }
				+
				+        String baseName = FileCommonUtils.sanitizeFileName(chapterTitle);
				+        String parentPath = outputDirectory.endsWith("\\") ? outputDirectory : outputDirectory + "\\";
				+        String chapterDirPath = parentPath + baseName + "\\";
				+
				+        // Create the per-chapter directory on demand; give up quietly when that fails.
				+        File chapterDir = new File(chapterDirPath);
				+        if (!chapterDir.exists() && !chapterDir.mkdirs()) {
				+            return;
				+        }
				+
				+        File target = new File(chapterDirPath, baseName + ".txt");
				+        try (BufferedWriter out = new BufferedWriter(new FileWriter(target))) {
				+            out.write(chapterContents);
				+        }
				+    }
			
 
				+
			
 
				+    /**
				+     * Removes all whitespace from {@code text}. Returns "" instead when the result, reduced to CJK
				+     * characters (U+4E00..U+9FA5) and ASCII letters, consists of more than 10 ASCII letters and no CJK
				+     * character.
				+     *
				+     * @param text line to normalise; must not be null
				+     * @return the whitespace-free text, or "" for a long English-only line
				+     */
				+    private String trim(String text) {
				+        String compact = text.replaceAll("\\s+", "");
				+        // Digits, punctuation and every other symbol are ignored for the language check only.
				+        String lettersOnly = compact.replaceAll("[^\\u4e00-\\u9fa5a-zA-Z]", "");
				+        boolean longEnglishOnly = lettersOnly.length() > 10 && lettersOnly.matches("^[a-zA-Z]+$");
				+        return longEnglishOnly ? "" : compact;
				+    }
			
 
				+
			
 
				+    public static void main(String[] args) { // runs the passes below, in order, over inputDirectoryPath
				+        // Chapter-level pass, currently disabled: split(false,ZHANG_PATTERN);
				+        split(false,JIE_PATTERN); // false -> traverse() calls splitTxtFile
				+        split(false,TITLE1_PATTERN);
				+        split(true,TITLE2_PATTERN); // true -> traverse() calls splitTxtFileByPattern
				+    }
			
 
				+    public static final  String inputDirectoryPath = "E:\\打标资料\\内科学\\"; // root directory to process (change to the local path); saveTxt also removes this prefix from parent paths
				+    private static void split(boolean split,String pattern) { // walks inputDirectoryPath, passing the flag and heading pattern to traverse()
				+        // Earlier hard-coded root, kept for reference: String inputDirectoryPath = "E:\\project\\vscode\\《急诊与灾难医学（第4版）》\\";
				+        File inputDirectory = new File(inputDirectoryPath);
				+        traverse(inputDirectory, split,pattern);
				+    }
			
 
				+
			
 
				+    /**
				+     * Walks the directory tree below {@code dir} and processes the {@code .txt} files of leaf directories only:
				+     * a directory that contains at least one sub-directory is just recursed into, and its own files are ignored.
				+     *
				+     * @param dir     directory to inspect
				+     * @param split   {@code true} to call {@code splitTxtFileByPattern}, {@code false} to call {@code splitTxtFile}
				+     * @param pattern heading regular expression handed to the chosen method
				+     */
				+    private static void traverse(File dir, boolean split,String pattern) {
				+        File[] entries = dir.listFiles();
				+        if (entries == null) {
				+            return;
				+        }
				+
				+        // Collect the sub-directories first; their presence decides how this directory is handled.
				+        List<File> subDirectories = new ArrayList<>();
				+        for (File entry : entries) {
				+            if (entry.isDirectory()) {
				+                subDirectories.add(entry);
				+            }
				+        }
				+
				+        if (!subDirectories.isEmpty()) {
				+            for (File subDirectory : subDirectories) {
				+                traverse(subDirectory, split, pattern);
				+            }
				+            return;
				+        }
				+
				+        // Leaf directory: split every text file, writing the output next to it.
				+        zhinanSplitter splitter = new zhinanSplitter();
				+        for (File entry : entries) {
				+            if (!entry.getName().toLowerCase().endsWith(".txt")) {
				+                continue;
				+            }
				+            String inputFilePath = entry.getAbsolutePath();
				+            try {
				+                if (split) {
				+                    splitter.splitTxtFileByPattern(inputFilePath, entry.getParent(), pattern);
				+                } else {
				+                    splitter.splitTxtFile(inputFilePath, entry.getParent(), pattern);
				+                }
				+            } catch (IOException e) {
				+                e.printStackTrace();
				+            }
				+        }
				+    }
			
 
				+
			
 
				+    public void splitTxtFileByPattern(String inputFilePath, String outputDirectory, String pattern) throws IOException { // groups the file into heading-delimited paragraphs, then merges/cuts them into slices written by saveTxt
				+        File inputFile = new File(inputFilePath);
				+        if (!inputFile.exists()) {
				+            System.out.println("输入文件不存在: " + inputFilePath);
				+            return;
				+        }
				+
				+        // Create the output directory if it does not exist yet; return silently when creation fails.
				+        File outputDir = new File(outputDirectory);
				+        if (!outputDir.exists()) {
				+            boolean isCreated = outputDir.mkdirs();
				+            if (isCreated) {
				+                //System.out.println("目录已创建: " + outputDirectory);
				+            } else {
				+                //System.out.println("目录创建失败: " + outputDirectory);
				+                return;
				+            }
				+        }
				+
				+        List<String> paragraphs = new ArrayList<>();
				+        StringBuilder currentParagraph = new StringBuilder();
				+        Pattern compile = Pattern.compile(pattern);
				+        try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
				+            String line;
				+            while ((line = reader.readLine()) != null) {
				+                line = trim(line);
				+                Matcher matcher = compile.matcher(line);
				+                if (matcher.find()) {
				+                    // A heading line starts a new paragraph: flush whatever has been collected so far (any non-empty buffer).
				+                    if (currentParagraph.length() > 0) {
				+                        paragraphs.add(currentParagraph.toString());
				+                        currentParagraph.setLength(0); // Clear the StringBuilder
				+                    }
				+                }
				+                currentParagraph.append(line);
				+                // Lines are joined without a separator; the newline append is disabled: currentParagraph.append("\n");
				+            }
				+            if (currentParagraph.length() > 0) {
				+                paragraphs.add(currentParagraph.toString());
				+            }
				+        }
				+        StringBuilder merge = new StringBuilder(); // text of the slice being built; saveTxt empties it after writing
				+        int i = 0; // running slice index used in the output file name
				+        String lastTitle = "";
				+        String title = "";
				+        for (int j=0;j< paragraphs.size();j++) {
				+            String temp = paragraphs.get(j);
				+            Matcher matcher = compile.matcher(temp);
				+
				+            boolean finded = matcher.find();
				+            if(finded){
				+                lastTitle = title;
				+                title = matcher.group();
				+            }
				+            if (merge.length() + temp.length() > max_leng) {
				+                // merge + temp would exceed max_leng: cut temp into sentences and flush merge whenever the next sentence would not fit.
				+                //System.out.println(temp);
				+                List<String> sentences = SentenceSplitter.splitSentences(temp);
				+                for (String sentence : sentences) {
				+                    if ((merge.length() + sentence.length()) > max_leng) {
				+                        if(merge.length()>max_leng){
				+                            System.out.println(merge.length());
				+                        }
				+                        saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,title); // NOTE(review): merge is empty here when it was empty before a sentence longer than max_leng; such a sentence is never cut further
				+                        merge.append(sentence);
				+                    } else {
				+                        merge.append(sentence);
				+                    }
				+                }
				+                //merge.append("\n");
				+            } else {
				+                boolean onlyTitle = finded && trim(temp).equals(title);
				+                // If temp is only a heading and merge + temp exceeds a third of max_leng, save merge first and append temp afterwards, so that a heading does not end up at the tail of a slice.
				+                if (onlyTitle && (merge.length() + temp.length()) > max_leng / 3) {
				+                    if(merge.length()>max_leng){
				+                        System.out.println(merge.length());
				+                    }
				+                    saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
				+                }
				+                if(temp.length()>merge.length()){
				+                    lastTitle = title;
				+                }
				+                merge.append(temp);
				+            }
				+
				+            // Save as soon as merge is longer than a third of max_leng; otherwise save only if this is the last paragraph.
				+            if (merge.length() > max_leng / 3) {
				+                if(merge.length()>max_leng){
				+                    System.out.println(merge.length());
				+                }
				+                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
				+            }else if(j==paragraphs.size()-1){
				+                if(merge.length()>max_leng){
				+                    System.out.println(merge.length());
				+                }
				+                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,StringUtils.isNotEmpty(title)?title:lastTitle);
				+            }
				+        }
				+    }
			
 
				+}