Browse Source

代码提交

SGTY cách đây 2 tuần
mục cha
commit
654c125ce5

+ 5 - 0
pom.xml

@@ -215,6 +215,11 @@
             <artifactId>commons-collections4</artifactId>
             <version>4.4</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-scratchpad</artifactId>
+            <version>5.2.3</version>
+        </dependency>
 
     </dependencies>
     <dependencyManagement>

+ 91 - 36
src/main/java/com/qizhen/healsphere/TxtSplitter.java

@@ -14,7 +14,8 @@ import java.util.regex.Pattern;
 
 public class TxtSplitter {
     private static final int max_leng = 500;
-    private static final String JIE_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+节\\s+[^\n]+";
+    private static final String ZHANG_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+章\\s*[^\\n]+";
+    private static final String JIE_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+节\\s*\\|?\\s*[^\n]+";
     private static final String TITLE1_PATTERN = "^[零一二三四五六七八九十百千万]+\\s*、\\s*(.*)";
     private static final String TITLE2_PATTERN = "^\\(([零一二三四五六七八九十百千万]+)\\)(.*)";
     Pattern pageNumberCompile = Pattern.compile("^\\s*(\\.\\s+)*[0-9]+(\\s+\\.\\s*)?\\s*$");
@@ -43,6 +44,8 @@ public class TxtSplitter {
         try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
             String line;
             while ((line = reader.readLine()) != null) {
+                line = trim( line);
+
                 if (StringUtils.isNotEmpty(line) &&  pageNumberCompile.matcher(line).find()) {
                     continue;
                 }
@@ -71,11 +74,13 @@ public class TxtSplitter {
         File outputFile = new File(outputDirectory, fileName);
 
         try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
-            String s = inputFile.getParent().replace("E:\\project\\vscode\\", "").replaceAll("\\\\", " ,");
+            String s = inputFile.getParent().replace(inputDirectoryPath, "").replaceAll("\\\\", ",");
             if(StringUtils.isNotEmpty(title)){
-                writer.write("本切片内容来自:"+s+","+title+"\n");
+                //writer.write("本切片内容来自:"+s+","+title+"\n");
+                writer.write("本切片内容来自:"+s+","+title+"。");
             }else{
-                writer.write("本切片内容来自:"+s+"\n");
+                //writer.write("本切片内容来自:"+s+"\n");
+                writer.write("本切片内容来自:"+s+"。");
             }
             writer.write(merge.toString());
 
@@ -116,16 +121,18 @@ public class TxtSplitter {
     }
 
     private String trim(String text) {
+        // Strip every whitespace run (regex \s+) anywhere in the text, not only at the ends.
+        // No null check is done here, so a null argument would raise a NullPointerException.
         return text.replaceAll("\\s+", "");
     }
 
     public static void main(String[] args) {
+        // Runs one pass per heading pattern over the input directory, in this order:
+        // JIE_PATTERN, TITLE1_PATTERN, then TITLE2_PATTERN. Only the last pass is invoked
+        // with split=true. The chapter-level (ZHANG_PATTERN) pass is currently disabled.
+        //split(false,ZHANG_PATTERN);
         split(false,JIE_PATTERN);
         split(false,TITLE1_PATTERN);
         split(true,TITLE2_PATTERN);
     }
+    public static final  String inputDirectoryPath = "E:\\打标资料\\"; // 修改为目录路径
     private static void split(boolean split,String pattern) {
-        String inputDirectoryPath = "E:\\project\\vscode\\《急诊与灾难医学(第4版)》\\"; // 修改为目录路径
         //String inputDirectoryPath = "E:\\project\\vscode\\《急诊与灾难医学(第4版)》\\"; // 修改为目录路径
         File inputDirectory = new File(inputDirectoryPath);
         traverse(inputDirectory, split,pattern);
@@ -200,64 +207,112 @@ public class TxtSplitter {
         try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
             String line;
             while ((line = reader.readLine()) != null) {
+                line = trim(line);
                 Matcher matcher = compile.matcher(line);
                 if (matcher.find()) {
-                    //找到了新的一段,且上一段内容超过了最大长度的1/4则保存,并清空
+                    // 找到了新的一段,且上一段内容超过了最大长度的1/4则保存,并清空
                     if (currentParagraph.length() > 0) {
                         paragraphs.add(currentParagraph.toString());
                         currentParagraph.setLength(0); // Clear the StringBuilder
                     }
                 }
-                currentParagraph.append(line).append("\n");
+                currentParagraph.append(line);
+                currentParagraph.append("\n");
             }
             if (currentParagraph.length() > 0) {
                 paragraphs.add(currentParagraph.toString());
             }
         }
+
+        // 对paragraphs中的每个段落进行处理,确保长度不超过max_leng
+        List<String> processedParagraphs = new ArrayList<>();
+        for (String paragraph : paragraphs) {
+            if (paragraph.length() > max_leng) {
+                // 如果段落长度超过max_leng,则分割并添加到processedParagraphs
+                processedParagraphs.addAll(splitParagraph(paragraph, max_leng));
+            } else {
+                // 如果段落长度未超过max_leng,则直接添加
+                processedParagraphs.add(paragraph);
+            }
+        }
+
         StringBuilder merge = new StringBuilder();
         int i = 0;
         String lastTitle = "";
         String title = "";
-        for (int j=0;j< paragraphs.size();j++) {
-            String temp = paragraphs.get(j);
+        for (int j = 0; j < processedParagraphs.size(); j++) {
+            String temp = processedParagraphs.get(j);
             Matcher matcher = compile.matcher(temp);
 
             boolean finded = matcher.find();
-            if(finded){
+            if (finded) {
                 lastTitle = title;
                 title = matcher.group();
             }
+
+            // 如果merge+temp会超过max_leng,则先保存merge
             if (merge.length() + temp.length() > max_leng) {
-                //因为merge没有超过最大长度的1/3,说明是temp超过最大长度的2/3,所以切割temp
-                //System.out.println(temp);
-                List<String> sentences = SentenceSplitter.splitSentences(temp);
-                for (String sentence : sentences) {
-                    if ((merge.length() + sentence.length()) > max_leng) {
-                        saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,title);
-                        merge.append(sentence);
-                    } else {
-                        merge.append(sentence);
-                    }
-                }
-                merge.append("\n");
-            } else {
-                boolean onlyTitle = finded && trim(temp).equals(title);
-                //如果temp是标题,且merge+temp超过最大长度的1/3,则merge保存,再追加temp,以保证标题不会出现在切片的末尾
-                if (onlyTitle && (merge.length() + temp.length()) > max_leng / 3) {
-                    saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
-                }
-                if(temp.length()>merge.length()){
-                    lastTitle = title;
+
+                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge, StringUtils.isNotEmpty(title) ? title : lastTitle);
+                merge.setLength(0); // 清空merge
+            }
+
+            // 添加当前段落到merge
+            merge.append(temp);
+
+            // 如果merge接近max_leng的一半,考虑提前保存以保持均衡
+            if (merge.length() > max_leng / 2 && !temp.equals(title)) {
+
+                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge, StringUtils.isNotEmpty(title) ? title : lastTitle);
+                merge.setLength(0); // 清空merge
+            }
+        }
+
+        // 最后一次保存剩余内容
+        if (merge.length() > 0) {
+
+            saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge, StringUtils.isNotEmpty(title) ? title : lastTitle);
+        }
+    }
+
+    // 优化后的辅助方法:将长段落按指定长度分割,基于句子进行处理
+    private List<String> splitParagraph(String paragraph, int maxLength) {
+        List<String> result = new ArrayList<>();
+        List<String> sentences = SentenceSplitter.splitSentences(paragraph); // 将段落分割为句子
+        StringBuilder currentSegment = new StringBuilder();
+
+        for (String sentence : sentences) {
+            if (currentSegment.length() + sentence.length() > maxLength) {
+                // 如果添加当前句子会超出最大长度,则保存当前段落并清空
+                if (currentSegment.length() > 0) {
+                    result.add(currentSegment.toString());
                 }
-                merge.append(temp);
+                currentSegment.setLength(0);
             }
+            currentSegment.append(sentence);
+        }
+
+        // 最后一次保存剩余内容
+        if (currentSegment.length() > 0) {
+            result.add(currentSegment.toString());
+        }
 
-            //超过最大长度的1/3则直接保存或是最后的片段
-            if (merge.length() > max_leng / 3) {
-                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
-            }else if(j==paragraphs.size()-1){
-                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,StringUtils.isNotEmpty(title)?title:lastTitle);
+        // 检查结果列表中的每个段落是否超过maxLength,若超过则强制分割
+        List<String> finalResult = new ArrayList<>();
+        for (String segment : result) {
+            if (segment.length() > maxLength) {
+                // 强制分割超长段落
+                int start = 0;
+                while (start < segment.length()) {
+                    finalResult.add(segment.substring(start, Math.min(start + maxLength, segment.length())));
+                    start += maxLength;
+                }
+            } else {
+                finalResult.add(segment);
             }
         }
+
+        return finalResult;
     }
+
 }

+ 93 - 17
src/main/java/com/qizhen/healsphere/WordSplitter.java

@@ -2,6 +2,8 @@ package com.qizhen.healsphere;
 
 import com.qizhen.healsphere.util.FileCommonUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.xwpf.usermodel.*;
 
 import java.io.*;
@@ -13,9 +15,23 @@ import java.util.regex.Pattern;
 public class WordSplitter {
 
     private static final String ZHANG_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+章\\s+[^\\n]+";
-    public void splitWordDocument(String inputFilePath, String outputDirectory,String pattern) throws IOException {
-        System.out.println(inputFilePath+"==="+outputDirectory+"==="+pattern);
-        XWPFDocument document = new XWPFDocument(new FileInputStream(inputFilePath));
+
+    public void splitWordDocument(String inputFilePath, String outputDirectory, String pattern) throws IOException {
+        System.out.println(inputFilePath + "===" + outputDirectory + "===" + pattern);
+        
+        // 根据文件扩展名选择合适的处理方式
+        if (inputFilePath.toLowerCase().endsWith(".docx")) {
+            XWPFDocument document = new XWPFDocument(new FileInputStream(inputFilePath));
+            processXWPFDocument(document, outputDirectory, pattern);
+        } else if (inputFilePath.toLowerCase().endsWith(".doc")) {
+            HWPFDocument document = new HWPFDocument(new FileInputStream(inputFilePath));
+            processHWPFDocument(document, outputDirectory, pattern);
+        } else {
+            throw new IllegalArgumentException("Unsupported file format: " + inputFilePath);
+        }
+    }
+
+    private void processXWPFDocument(XWPFDocument document, String outputDirectory, String pattern) throws IOException {
         StringBuilder currentChapter = new StringBuilder();
         String currentChapterTitle = null;
         Pattern compile = Pattern.compile(pattern);
@@ -43,23 +59,53 @@ public class WordSplitter {
                         && !"本章数字资源".equals(trimText)) {
                     currentChapter.append(text).append("\n");
                 }
-
-            } /*else if (element instanceof XWPFTable) {
-                // Handle tables if necessary
-                currentChapter.append(element.toString()).append("\n");
-            }*/
+            } 
         }
         save(currentChapterTitle, currentChapter.toString(), outputDirectory);
         document.close();
     }
 
+    /**
+     * Splits a legacy (.doc) document into chapters and saves each chapter as text.
+     *
+     * <p>A paragraph matching {@code pattern} starts a new chapter unless its
+     * whitespace-stripped title equals the current chapter title. Paragraphs that merely
+     * repeat the current title, and the literal "本章数字资源" marker, are left out.
+     *
+     * @param document        the opened HWPF document; it is not closed by this method
+     * @param outputDirectory directory the chapter files are written to
+     * @param pattern         regular expression that recognises a chapter heading
+     * @throws IOException if a chapter cannot be written
+     */
+    private void processHWPFDocument(HWPFDocument document, String outputDirectory, String pattern) throws IOException {
+        Range range = document.getRange();
+        StringBuilder currentChapter = new StringBuilder();
+        String currentChapterTitle = null;
+        Pattern compile = Pattern.compile(pattern);
+
+        int paragraphCount = range.numParagraphs();
+        for (int i = 0; i < paragraphCount; i++) {
+            // HWPF hands back the paragraph text including its terminating mark; drop it so that
+            // it neither ends up in the chapter title nor makes a blank paragraph look non-empty.
+            String text = stripParagraphMarks(range.getParagraph(i).text());
+            Matcher matcher = compile.matcher(text);
+            boolean isAppendTitle = false;
+            if (matcher.find()) {
+                String tempTitle = matcher.group();
+                if (currentChapterTitle == null) {
+                    currentChapterTitle = tempTitle;
+                    isAppendTitle = true;
+                } else if (!trim(currentChapterTitle).equals(trim(tempTitle))) {
+                    // A different title starts a new chapter: flush the one collected so far.
+                    save(currentChapterTitle, currentChapter.toString(), outputDirectory);
+                    currentChapter.setLength(0);
+                    currentChapterTitle = tempTitle;
+                    isAppendTitle = true;
+                }
+            }
+            String trimText = trim(text);
+            if (StringUtils.isNotEmpty(text) && (isAppendTitle || !trim(currentChapterTitle).equals(trimText))
+                    && !"本章数字资源".equals(trimText)) {
+                currentChapter.append(text).append("\n");
+            }
+        }
+        save(currentChapterTitle, currentChapter.toString(), outputDirectory);
+    }
+
+    /** Removes trailing paragraph (CR/LF) and table-cell (U+0007) marks from HWPF paragraph text. */
+    private static String stripParagraphMarks(String text) {
+        if (text == null) {
+            return "";
+        }
+        int end = text.length();
+        while (end > 0) {
+            char last = text.charAt(end - 1);
+            if (last != '\r' && last != '\n' && last != (char) 7) {
+                break;
+            }
+            end--;
+        }
+        return text.substring(0, end);
+    }
+
     private void save(String chapterTitle, String chapterContents, String outputDirectory) throws IOException {
         if (StringUtils.isEmpty(chapterTitle)) {
-            return;
+            File outputDir = new File(outputDirectory);
+            chapterTitle = outputDir.getName();
+            //return;
         }
         String fileName = FileCommonUtils.sanitizeFileName(chapterTitle);
         String fileNameEXt = fileName + ".txt";
-        outputDirectory += fileName + "\\";
+        //outputDirectory += fileName + "\\";
         File outputFile = new File(outputDirectory, fileNameEXt);
         // 检查目录是否存在,如果不存在则创建目录
         File directory = new File(outputDirectory);
@@ -78,6 +124,9 @@ public class WordSplitter {
     }
 
     private String trim(String text) {
+        // Null, empty and whitespace-only input all collapse to the empty string;
+        // otherwise every whitespace run inside the text is removed.
+        return StringUtils.isBlank(text) ? "" : text.replaceAll("\\s+", "");
     }
 
@@ -114,13 +163,40 @@ public class WordSplitter {
 
     public static void main(String[] args) {
         WordSplitter wordSplitter1 = new WordSplitter();
-        String inputFilePath = "E:\\project\\vscode\\急诊与灾难医学(第4版).docx";
-        String outputDirectory = "E:\\project\\vscode\\《急诊与灾难医学(第4版)》\\";
-        try {
-            //removeHeadersAndFooters(inputFilePath, outputFilePath);
-            wordSplitter1.splitWordDocument(inputFilePath, outputDirectory,ZHANG_PATTERN);
-        } catch (IOException e) {
-            e.printStackTrace();
+        String baseDirectory = "E:\\打标资料\\"; // 指定要遍历的目录
+        File directory = new File(baseDirectory);
+
+        if (directory.exists() && directory.isDirectory()) {
+            processDirectory(directory, wordSplitter1);
+        } else {
+            System.out.println("指定的目录不存在或不是一个有效目录: " + baseDirectory);
+        }
+    }
+
+    /**
+     * Recursively walks {@code directory} and splits every {@code .docx} / {@code .doc} file found.
+     *
+     * <p>The output of a document goes to a directory next to it, named after the file
+     * without its extension. A file that fails is reported and skipped so that the
+     * remaining files are still processed.
+     *
+     * @param directory    directory to scan
+     * @param wordSplitter splitter used for each document
+     */
+    private static void processDirectory(File directory, WordSplitter wordSplitter) {
+        File[] files = directory.listFiles();
+        if (files == null) {
+            return;
+        }
+
+        for (File file : files) {
+            if (file.isDirectory()) {
+                // Descend into sub-directories.
+                processDirectory(file, wordSplitter);
+                continue;
+            }
+            String lowerCaseName = file.getName().toLowerCase();
+            boolean isWordFile = lowerCaseName.endsWith(".docx") || lowerCaseName.endsWith(".doc");
+            // Names starting with "~$" are the owner/lock files Word keeps next to an open
+            // document; they are not real documents and cannot be parsed.
+            if (!file.isFile() || !isWordFile || lowerCaseName.startsWith("~$")) {
+                continue;
+            }
+
+            String inputFilePath = file.getAbsolutePath();
+            String fileNameWithoutExtension = file.getName().replaceFirst("[.][^.]+$", "");
+            String outputDirectory = directory.getAbsolutePath() + "\\" + fileNameWithoutExtension + "\\";
+
+            try {
+                wordSplitter.splitWordDocument(inputFilePath, outputDirectory, ZHANG_PATTERN);
+                System.out.println("处理完成: " + inputFilePath);
+            } catch (IOException | RuntimeException e) {
+                // The parser also signals corrupt or unsupported files with unchecked exceptions;
+                // catching them keeps one bad file from aborting the whole traversal.
+                System.err.println("处理文件时出错: " + inputFilePath);
+                e.printStackTrace();
+            }
         }
     }
 }

+ 292 - 0
src/main/java/com/qizhen/healsphere/zhinanSplitter.java

@@ -0,0 +1,292 @@
+package com.qizhen.healsphere;
+
+import com.qizhen.healsphere.util.FileCommonUtils;
+import com.qizhen.healsphere.util.SentenceSplitter;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class zhinanSplitter {
+    private static final int max_leng = 500;
+    private static final String ZHANG_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+章\\s*[^\\n]+";
+    private static final String JIE_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+节\\s*\\|?\\s*[^\n]+";
+    private static final String TITLE1_PATTERN = "^[零一二三四五六七八九十百千万]+\\s*、\\s*(.*)";
+    private static final String TITLE2_PATTERN = "^\\(([零一二三四五六七八九十百千万]+)\\)(.*)";
+    Pattern pageNumberCompile = Pattern.compile("^\\s*(\\.\\s+)*[0-9]+(\\s+\\.\\s*)?\\s*$");
+    /**
+     * Splits a text file into one output file per heading matched by {@code pattern}.
+     *
+     * <p>Each line is normalised with {@code trim}; lines matched by {@code pageNumberCompile}
+     * are skipped. A line matching {@code pattern} whose title differs from the current one
+     * flushes the collected text through {@code save} and starts a new section.
+     *
+     * @param inputFilePath   path of the text file to read
+     * @param outputDirectory directory under which the sections are saved
+     * @param pattern         regular expression that recognises a heading line
+     * @throws IOException if reading the input or writing a section fails
+     */
+    public void splitTxtFile(String inputFilePath, String outputDirectory, String pattern) throws IOException {
+        File inputFile = new File(inputFilePath);
+        if (!inputFile.exists()) {
+            System.out.println("输入文件不存在: " + inputFilePath);
+            return;
+        }
+
+        // Create the output directory when missing; report a failure instead of returning silently.
+        File outputDir = new File(outputDirectory);
+        if (!outputDir.exists() && !outputDir.mkdirs()) {
+            System.err.println("目录创建失败: " + outputDirectory);
+            return;
+        }
+
+        StringBuilder currentChapter = new StringBuilder();
+        String currentChapterTitle = null;
+        Pattern compile = Pattern.compile(pattern);
+        try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                line = trim(line);
+
+                // Drop lines recognised by the page-number pattern.
+                if (StringUtils.isNotEmpty(line) && pageNumberCompile.matcher(line).find()) {
+                    continue;
+                }
+                Matcher matcher = compile.matcher(line);
+                if (matcher.find()) {
+                    String tempTitle = matcher.group();
+                    if (currentChapterTitle == null) {
+                        currentChapterTitle = tempTitle;
+                    } else if (!trim(currentChapterTitle).equals(trim(tempTitle))) {
+                        // A repeated identical title does not start a new section; a different one does.
+                        save(currentChapterTitle, currentChapter.toString(), outputDirectory);
+                        currentChapter.setLength(0);
+                        currentChapterTitle = tempTitle;
+                    }
+                }
+                if (StringUtils.isNotEmpty(line)) {
+                    currentChapter.append(line).append("\n");
+                }
+            }
+            // Flush the last section.
+            save(currentChapterTitle, currentChapter.toString(), outputDirectory);
+        }
+    }
+    public static final String appId = "e1de7dfc-afdb-48e4-b0f8-4fcecea09643";
+
+    /**
+     * Writes one slice to {@code <input base name>_split_<i>.txt} in {@code outputDirectory}.
+     *
+     * <p>The file starts with a provenance sentence built from the input file's parent path
+     * (with {@code inputDirectoryPath} removed and backslashes turned into commas) and,
+     * when present, the title. The buffer is emptied after a successful write.
+     *
+     * @param inputFilePath   path of the file being split; its base name prefixes the slice name
+     * @param outputDirectory directory the slice is written to
+     * @param i               slice index used in the file name
+     * @param inputFile       the file being split; its parent path feeds the provenance sentence
+     * @param merge           slice content; cleared once written
+     * @param title           heading the slice belongs to, may be empty
+     * @return {@code i}, unchanged
+     * @throws IOException if the slice cannot be written
+     */
+    private static int saveTxt(String inputFilePath, String outputDirectory, int i, File inputFile, StringBuilder merge,String title) throws IOException {
+        String baseName = FileCommonUtils.getFileNameWithoutExtension(inputFilePath);
+        File target = new File(outputDirectory, baseName + "_split_" + i + ".txt");
+
+        try (BufferedWriter out = new BufferedWriter(new FileWriter(target))) {
+            String origin = inputFile.getParent().replace(inputDirectoryPath, "").replaceAll("\\\\", ",");
+            StringBuilder header = new StringBuilder("本切片内容来自:").append(origin);
+            if (StringUtils.isNotEmpty(title)) {
+                header.append(",").append(title);
+            }
+            header.append("。");
+            out.write(header.toString());
+            out.write(merge.toString());
+            merge.setLength(0);
+        }
+        return i;
+    }
+
+    /**
+     * Writes a section to {@code <outputDirectory>\<title>\<title>.txt}, where the title is
+     * first passed through {@code FileCommonUtils.sanitizeFileName}.
+     *
+     * <p>Nothing is written when the title is empty. A directory that cannot be created is
+     * reported on standard error instead of being dropped silently.
+     *
+     * @param chapterTitle    section title; also used as directory and file name
+     * @param chapterContents text to write
+     * @param outputDirectory parent directory of the section directory
+     * @throws IOException if the file cannot be written
+     */
+    private void save(String chapterTitle, String chapterContents, String outputDirectory) throws IOException {
+        if (StringUtils.isEmpty(chapterTitle)) {
+            return;
+        }
+        String fileName = FileCommonUtils.sanitizeFileName(chapterTitle);
+
+        // Every section gets its own sub-directory named after the sanitized title.
+        String chapterDirectory = outputDirectory.endsWith("\\") ? outputDirectory : outputDirectory + "\\";
+        chapterDirectory += fileName + "\\";
+
+        File directory = new File(chapterDirectory);
+        if (!directory.exists() && !directory.mkdirs()) {
+            System.err.println("目录创建失败: " + chapterDirectory);
+            return;
+        }
+
+        File outputFile = new File(chapterDirectory, fileName + ".txt");
+        try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
+            writer.write(chapterContents);
+        }
+    }
+
+    /**
+     * Normalises one line: removes all whitespace, and blanks the line out entirely when it
+     * carries more than ten ASCII letters and no CJK ideograph.
+     *
+     * @param text the line to normalise; {@code null} is treated as empty
+     * @return the text without whitespace, or {@code ""} for null input and for lines
+     *         recognised as English-only
+     */
+    private String trim(String text) {
+        if (text == null) {
+            return "";
+        }
+        // Remove every whitespace character.
+        String compact = text.replaceAll("\\s+", "");
+
+        // Probe copy used only for the decision below: keeps CJK ideographs (U+4E00 to U+9FA5)
+        // and ASCII letters, dropping digits, punctuation and everything else.
+        String lettersOnly = compact.replaceAll("[^\\u4e00-\\u9fa5a-zA-Z]", "");
+
+        // More than 10 letters and not a single CJK ideograph: discard the whole line.
+        if (lettersOnly.length() > 10 && lettersOnly.matches("^[a-zA-Z]+$")) {
+            return "";
+        }
+
+        return compact;
+    }
+
+    /**
+     * Entry point: runs one pass per heading pattern over the configured input directory.
+     *
+     * @param args not used
+     */
+    public static void main(String[] args) {
+        // The chapter-level (ZHANG_PATTERN) pass is currently disabled.
+        //split(false,ZHANG_PATTERN);
+        // split=false routes each text file to splitTxtFile (one output file per heading).
+        split(false,JIE_PATTERN);
+        split(false,TITLE1_PATTERN);
+        // split=true routes each text file to splitTxtFileByPattern (size-limited slices).
+        split(true,TITLE2_PATTERN);
+    }
+    public static final String inputDirectoryPath = "E:\\打标资料\\内科学\\"; // change this to the directory to process
+
+    /**
+     * Runs one splitting pass over the tree rooted at {@link #inputDirectoryPath}.
+     *
+     * @param split   {@code true} to cut files into size-limited slices, {@code false} to
+     *                cut them into one file per heading
+     * @param pattern regular expression that recognises a heading
+     */
+    private static void split(boolean split,String pattern) {
+        traverse(new File(inputDirectoryPath), split, pattern);
+    }
+
+    /**
+     * Walks the directory tree and processes the {@code .txt} files of leaf directories.
+     *
+     * <p>A directory that contains at least one sub-directory is only descended into; its own
+     * files are ignored. A directory without sub-directories has each of its {@code .txt}
+     * files split, with the file's parent directory as output directory.
+     *
+     * @param dir     directory to inspect
+     * @param split   {@code true} to use {@code splitTxtFileByPattern}, {@code false} to use
+     *                {@code splitTxtFile}
+     * @param pattern regular expression that recognises a heading
+     */
+    private static void traverse(File dir, boolean split,String pattern) {
+        File[] entries = dir.listFiles();
+        if (entries == null) {
+            return;
+        }
+
+        // Non-leaf directory: recurse into the sub-directories and stop there.
+        if (containsDirectory(entries)) {
+            for (File entry : entries) {
+                if (entry.isDirectory()) {
+                    traverse(entry, split, pattern);
+                }
+            }
+            return;
+        }
+
+        // Leaf directory: split every text file it holds.
+        zhinanSplitter splitter = new zhinanSplitter();
+        for (File entry : entries) {
+            if (!entry.getName().toLowerCase().endsWith(".txt")) {
+                continue;
+            }
+            String path = entry.getAbsolutePath();
+            try {
+                if (split) {
+                    splitter.splitTxtFileByPattern(path, entry.getParent(), pattern);
+                } else {
+                    splitter.splitTxtFile(path, entry.getParent(), pattern);
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+    }
+
+    /** Tells whether at least one of the given entries is a directory. */
+    private static boolean containsDirectory(File[] entries) {
+        for (File entry : entries) {
+            if (entry.isDirectory()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Cuts a text file into slices and writes each slice through {@code saveTxt}.
+     *
+     * <p>Phase 1 reads the file line by line (each line normalised with {@code trim}) and
+     * groups the lines into paragraphs; a line matching {@code pattern} starts a new paragraph.
+     * Lines of one paragraph are concatenated without any separator.
+     *
+     * <p>Phase 2 accumulates paragraphs in a buffer and flushes the buffer to a numbered slice
+     * file. A paragraph that would push the buffer past {@code max_leng} is broken into
+     * sentences with {@code SentenceSplitter.splitSentences}. {@code saveTxt} empties the buffer
+     * after writing it.
+     *
+     * @param inputFilePath   path of the text file to read
+     * @param outputDirectory directory the slice files are written to
+     * @param pattern         regular expression that recognises the start of a paragraph
+     * @throws IOException if reading the input or writing a slice fails
+     */
+    public void splitTxtFileByPattern(String inputFilePath, String outputDirectory, String pattern) throws IOException {
+        File inputFile = new File(inputFilePath);
+        if (!inputFile.exists()) {
+            System.out.println("输入文件不存在: " + inputFilePath);
+            return;
+        }
+
+        // Make sure the output directory exists, creating it when necessary.
+        File outputDir = new File(outputDirectory);
+        if (!outputDir.exists()) {
+            boolean isCreated = outputDir.mkdirs();
+            if (isCreated) {
+                // Directory created; nothing is logged.
+            } else {
+                // Directory could not be created: give up without logging anything.
+                return;
+            }
+        }
+
+        List<String> paragraphs = new ArrayList<>();
+        StringBuilder currentParagraph = new StringBuilder();
+        Pattern compile = Pattern.compile(pattern);
+        try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                line = trim(line);
+                Matcher matcher = compile.matcher(line);
+                if (matcher.find()) {
+                    // A new paragraph starts here: store the previous one if it has any content
+                    // and reset the buffer. (No minimum length is enforced.)
+                    if (currentParagraph.length() > 0) {
+                        paragraphs.add(currentParagraph.toString());
+                        currentParagraph.setLength(0); // Clear the StringBuilder
+                    }
+                }
+                // Lines are joined directly; the newline separator below is disabled.
+                // NOTE(review): without a separator, a pattern ending in a greedy ".*" matches the
+                // whole joined paragraph in phase 2, so the captured title can be the entire
+                // paragraph rather than just its heading line. Confirm this is intended.
+                currentParagraph.append(line);
+                //currentParagraph.append("\n");
+            }
+            if (currentParagraph.length() > 0) {
+                paragraphs.add(currentParagraph.toString());
+            }
+        }
+        StringBuilder merge = new StringBuilder();
+        int i = 0;
+        String lastTitle = "";
+        String title = "";
+        for (int j=0;j< paragraphs.size();j++) {
+            String temp = paragraphs.get(j);
+            Matcher matcher = compile.matcher(temp);
+
+            // Track the current and the previous title as paragraphs go by.
+            boolean finded = matcher.find();
+            if(finded){
+                lastTitle = title;
+                title = matcher.group();
+            }
+            if (merge.length() + temp.length() > max_leng) {
+                // Buffer plus paragraph would exceed max_leng, so the paragraph is cut into
+                // sentences. (Original author's reasoning: the buffer holds at most a third of
+                // max_leng at this point, so the paragraph itself must be the large part.)
+                //System.out.println(temp);
+                List<String> sentences = SentenceSplitter.splitSentences(temp);
+                for (String sentence : sentences) {
+                    if ((merge.length() + sentence.length()) > max_leng) {
+                        // Diagnostic output for a buffer that is already over the limit.
+                        if(merge.length()>max_leng){
+                            System.out.println(merge.length());
+                        }
+                        // Flush the buffer, then start the next slice with this sentence.
+                        // NOTE(review): the buffer is saved even when it is empty, which yields
+                        // a slice holding only the provenance header. Confirm this is acceptable.
+                        saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,title);
+                        merge.append(sentence);
+                    } else {
+                        merge.append(sentence);
+                    }
+                }
+                //merge.append("\n");
+            } else {
+                boolean onlyTitle = finded && trim(temp).equals(title);
+                // When the paragraph is nothing but a title and buffer plus title exceeds a third
+                // of max_leng, flush the buffer first so the title does not end up at the tail
+                // of a slice.
+                if (onlyTitle && (merge.length() + temp.length()) > max_leng / 3) {
+                    if(merge.length()>max_leng){
+                        System.out.println(merge.length());
+                    }
+                    saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
+                }
+                // The paragraph outweighs what is buffered, so the slice is labelled with its title.
+                if(temp.length()>merge.length()){
+                    lastTitle = title;
+                }
+                merge.append(temp);
+            }
+
+            // Flush once the buffer exceeds a third of max_leng, or when this is the last paragraph.
+            if (merge.length() > max_leng / 3) {
+                if(merge.length()>max_leng){
+                    System.out.println(merge.length());
+                }
+                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
+            }else if(j==paragraphs.size()-1){
+                if(merge.length()>max_leng){
+                    System.out.println(merge.length());
+                }
+                saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,StringUtils.isNotEmpty(title)?title:lastTitle);
+            }
+        }
+    }
+}