|
@@ -0,0 +1,292 @@
|
|
|
+package com.qizhen.healsphere;
|
|
|
+
|
|
|
+import com.qizhen.healsphere.util.FileCommonUtils;
|
|
|
+import com.qizhen.healsphere.util.SentenceSplitter;
|
|
|
+import org.apache.commons.lang3.StringUtils;
|
|
|
+
|
|
|
+import java.io.*;
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.List;
|
|
|
+import java.util.regex.Matcher;
|
|
|
+import java.util.regex.Pattern;
|
|
|
+
|
|
|
+public class zhinanSplitter {
|
|
|
+ private static final int max_leng = 500;
|
|
|
+ private static final String ZHANG_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+章\\s*[^\\n]+";
|
|
|
+ private static final String JIE_PATTERN = "^第[零一二三四五六七八九十百千万0123456789]+节\\s*\\|?\\s*[^\n]+";
|
|
|
+ private static final String TITLE1_PATTERN = "^[零一二三四五六七八九十百千万]+\\s*、\\s*(.*)";
|
|
|
+ private static final String TITLE2_PATTERN = "^\\(([零一二三四五六七八九十百千万]+)\\)(.*)";
|
|
|
+ Pattern pageNumberCompile = Pattern.compile("^\\s*(\\.\\s+)*[0-9]+(\\s+\\.\\s*)?\\s*$");
|
|
|
+ public void splitTxtFile(String inputFilePath, String outputDirectory, String pattern) throws IOException {
|
|
|
+ File inputFile = new File(inputFilePath);
|
|
|
+ if (!inputFile.exists()) {
|
|
|
+ System.out.println("输入文件不存在: " + inputFilePath);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 检查目录是否存在,如果不存在则创建目录
|
|
|
+ File outputDir = new File(outputDirectory);
|
|
|
+ if (!outputDir.exists()) {
|
|
|
+ boolean isCreated = outputDir.mkdirs();
|
|
|
+ if (isCreated) {
|
|
|
+ //System.out.println("目录已创建: " + outputDirectory);
|
|
|
+ } else {
|
|
|
+ //System.out.println("目录创建失败: " + outputDirectory);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ StringBuilder currentChapter = new StringBuilder();
|
|
|
+ String currentChapterTitle = null;
|
|
|
+ Pattern compile = Pattern.compile(pattern);
|
|
|
+ try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
|
|
|
+ String line;
|
|
|
+ while ((line = reader.readLine()) != null) {
|
|
|
+ line = trim( line);
|
|
|
+
|
|
|
+ if (StringUtils.isNotEmpty(line) && pageNumberCompile.matcher(line).find()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ Matcher matcher = compile.matcher(line);
|
|
|
+ if (matcher.find()) {
|
|
|
+ String tempTitle = matcher.group();
|
|
|
+ if (currentChapterTitle == null) {
|
|
|
+ currentChapterTitle = tempTitle;
|
|
|
+ // 标题一样的话,不保存
|
|
|
+ } else if (currentChapterTitle != null && !trim(currentChapterTitle).equals(trim(tempTitle))) {
|
|
|
+ save(currentChapterTitle, currentChapter.toString(), outputDirectory);
|
|
|
+ currentChapter.setLength(0); // Clear the StringBuilder
|
|
|
+ currentChapterTitle = tempTitle;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (StringUtils.isNotEmpty(line)) {
|
|
|
+ currentChapter.append(line).append("\n");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ save(currentChapterTitle, currentChapter.toString(), outputDirectory);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ public static final String appId = "e1de7dfc-afdb-48e4-b0f8-4fcecea09643";
|
|
|
+ private static int saveTxt(String inputFilePath, String outputDirectory, int i, File inputFile, StringBuilder merge,String title) throws IOException {
|
|
|
+ String fileName = FileCommonUtils.getFileNameWithoutExtension(inputFilePath) + "_split_" + i + ".txt";
|
|
|
+ File outputFile = new File(outputDirectory, fileName);
|
|
|
+
|
|
|
+ try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
|
|
|
+ String s = inputFile.getParent().replace(inputDirectoryPath, "").replaceAll("\\\\", ",");
|
|
|
+ if(StringUtils.isNotEmpty(title)){
|
|
|
+ //writer.write("本切片内容来自:"+s+","+title+"\n");
|
|
|
+ writer.write("本切片内容来自:"+s+","+title+"。");
|
|
|
+ }else{
|
|
|
+ //writer.write("本切片内容来自:"+s+"\n");
|
|
|
+ writer.write("本切片内容来自:"+s+"。");
|
|
|
+ }
|
|
|
+ writer.write(merge.toString());
|
|
|
+
|
|
|
+ /* String response = QizhenAssistant.call(appId, QizhenAssistant.getConversationId(appId), merge.toString());
|
|
|
+ String answer = JSONObject.parseObject(response).getString("answer");
|
|
|
+ writer.write("\n\n"+answer);*/
|
|
|
+ merge.setLength(0);
|
|
|
+ }
|
|
|
+ return i;
|
|
|
+ }
|
|
|
+
|
|
|
+ private void save(String chapterTitle, String chapterContents, String outputDirectory) throws IOException {
|
|
|
+ if (StringUtils.isEmpty(chapterTitle)) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ String fileName = FileCommonUtils.sanitizeFileName(chapterTitle);
|
|
|
+ String fileNameEXt = fileName + ".txt";
|
|
|
+ if (!outputDirectory.endsWith("\\")) {
|
|
|
+ outputDirectory += "\\";
|
|
|
+ }
|
|
|
+ outputDirectory += fileName + "\\";
|
|
|
+ File outputFile = new File(outputDirectory, fileNameEXt);
|
|
|
+ // 检查目录是否存在,如果不存在则创建目录
|
|
|
+ File directory = new File(outputDirectory);
|
|
|
+ if (!directory.exists()) {
|
|
|
+ boolean isCreated = directory.mkdirs();
|
|
|
+ if (isCreated) {
|
|
|
+ //System.out.println("目录已创建: " + outputDirectory);
|
|
|
+ } else {
|
|
|
+ //System.out.println("目录创建失败: " + outputDirectory);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ try (BufferedWriter writer = new BufferedWriter(new FileWriter(outputFile))) {
|
|
|
+ writer.write(chapterContents);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private String trim(String text) {
|
|
|
+ // 去除所有空白字符
|
|
|
+ text = text.replaceAll("\\s+", "");
|
|
|
+
|
|
|
+ // 修正:只移除英文标点符号,保留中文内容
|
|
|
+ String text2 = text.replaceAll("[^\\u4e00-\\u9fa5a-zA-Z]", "");
|
|
|
+
|
|
|
+ // 如果文本长度超过10个字符且全是英文,返回空字符串
|
|
|
+ if (text2.length() > 10 && text2.matches("^[a-zA-Z]+$")) {
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+
|
|
|
+ return text;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void main(String[] args) {
|
|
|
+ //split(false,ZHANG_PATTERN);
|
|
|
+ split(false,JIE_PATTERN);
|
|
|
+ split(false,TITLE1_PATTERN);
|
|
|
+ split(true,TITLE2_PATTERN);
|
|
|
+ }
|
|
|
+ public static final String inputDirectoryPath = "E:\\打标资料\\内科学\\"; // 修改为目录路径
|
|
|
+ private static void split(boolean split,String pattern) {
|
|
|
+ //String inputDirectoryPath = "E:\\project\\vscode\\《急诊与灾难医学(第4版)》\\"; // 修改为目录路径
|
|
|
+ File inputDirectory = new File(inputDirectoryPath);
|
|
|
+ traverse(inputDirectory, split,pattern);
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void traverse(File dir, boolean split,String pattern) {
|
|
|
+ boolean hasSubDir = false;
|
|
|
+ File[] files = dir.listFiles();
|
|
|
+ if (files == null) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ // 第一阶段:检测是否存在子目录
|
|
|
+ for (File file : files) {
|
|
|
+ if (file.isDirectory()) {
|
|
|
+ hasSubDir = true;
|
|
|
+ break; // 发现子目录立即中断循环
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 第二阶段:根据检测结果处理
|
|
|
+ if (hasSubDir) {
|
|
|
+ // 存在子目录时递归遍历
|
|
|
+ for (File file : files) {
|
|
|
+ if (file.isDirectory()) {
|
|
|
+ traverse(file, split,pattern);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ zhinanSplitter txtSplitter = new zhinanSplitter();
|
|
|
+ // 无子目录时处理txt文件
|
|
|
+ for (File file : files) {
|
|
|
+ if (file.getName().toLowerCase().endsWith(".txt")) {
|
|
|
+ //System.out.println("发现文本文件: " + file.getName());
|
|
|
+ String inputFilePath = file.getAbsolutePath();
|
|
|
+ try {
|
|
|
+ // 调用splitTxtFile方法
|
|
|
+ if (!split) {
|
|
|
+ txtSplitter.splitTxtFile(inputFilePath, file.getParent(), pattern);
|
|
|
+ } else {
|
|
|
+ txtSplitter.splitTxtFileByPattern(inputFilePath, file.getParent(), pattern);
|
|
|
+ }
|
|
|
+ } catch (IOException e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ public void splitTxtFileByPattern(String inputFilePath, String outputDirectory, String pattern) throws IOException {
|
|
|
+ File inputFile = new File(inputFilePath);
|
|
|
+ if (!inputFile.exists()) {
|
|
|
+ System.out.println("输入文件不存在: " + inputFilePath);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 检查目录是否存在,如果不存在则创建目录
|
|
|
+ File outputDir = new File(outputDirectory);
|
|
|
+ if (!outputDir.exists()) {
|
|
|
+ boolean isCreated = outputDir.mkdirs();
|
|
|
+ if (isCreated) {
|
|
|
+ //System.out.println("目录已创建: " + outputDirectory);
|
|
|
+ } else {
|
|
|
+ //System.out.println("目录创建失败: " + outputDirectory);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ List<String> paragraphs = new ArrayList<>();
|
|
|
+ StringBuilder currentParagraph = new StringBuilder();
|
|
|
+ Pattern compile = Pattern.compile(pattern);
|
|
|
+ try (BufferedReader reader = new BufferedReader(new FileReader(inputFile))) {
|
|
|
+ String line;
|
|
|
+ while ((line = reader.readLine()) != null) {
|
|
|
+ line = trim(line);
|
|
|
+ Matcher matcher = compile.matcher(line);
|
|
|
+ if (matcher.find()) {
|
|
|
+ //找到了新的一段,且上一段内容超过了最大长度的1/4则保存,并清空
|
|
|
+ if (currentParagraph.length() > 0) {
|
|
|
+ paragraphs.add(currentParagraph.toString());
|
|
|
+ currentParagraph.setLength(0); // Clear the StringBuilder
|
|
|
+ }
|
|
|
+ }
|
|
|
+ currentParagraph.append(line);
|
|
|
+ //currentParagraph.append("\n");
|
|
|
+ }
|
|
|
+ if (currentParagraph.length() > 0) {
|
|
|
+ paragraphs.add(currentParagraph.toString());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ StringBuilder merge = new StringBuilder();
|
|
|
+ int i = 0;
|
|
|
+ String lastTitle = "";
|
|
|
+ String title = "";
|
|
|
+ for (int j=0;j< paragraphs.size();j++) {
|
|
|
+ String temp = paragraphs.get(j);
|
|
|
+ Matcher matcher = compile.matcher(temp);
|
|
|
+
|
|
|
+ boolean finded = matcher.find();
|
|
|
+ if(finded){
|
|
|
+ lastTitle = title;
|
|
|
+ title = matcher.group();
|
|
|
+ }
|
|
|
+ if (merge.length() + temp.length() > max_leng) {
|
|
|
+ //因为merge没有超过最大长度的1/3,说明是temp超过最大长度的2/3,所以切割temp
|
|
|
+ //System.out.println(temp);
|
|
|
+ List<String> sentences = SentenceSplitter.splitSentences(temp);
|
|
|
+ for (String sentence : sentences) {
|
|
|
+ if ((merge.length() + sentence.length()) > max_leng) {
|
|
|
+ if(merge.length()>max_leng){
|
|
|
+ System.out.println(merge.length());
|
|
|
+ }
|
|
|
+ saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,title);
|
|
|
+ merge.append(sentence);
|
|
|
+ } else {
|
|
|
+ merge.append(sentence);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //merge.append("\n");
|
|
|
+ } else {
|
|
|
+ boolean onlyTitle = finded && trim(temp).equals(title);
|
|
|
+ //如果temp是标题,且merge+temp超过最大长度的1/3,则merge保存,再追加temp,以保证标题不会出现在切片的末尾
|
|
|
+ if (onlyTitle && (merge.length() + temp.length()) > max_leng / 3) {
|
|
|
+ if(merge.length()>max_leng){
|
|
|
+ System.out.println(merge.length());
|
|
|
+ }
|
|
|
+ saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
|
|
|
+ }
|
|
|
+ if(temp.length()>merge.length()){
|
|
|
+ lastTitle = title;
|
|
|
+ }
|
|
|
+ merge.append(temp);
|
|
|
+ }
|
|
|
+
|
|
|
+ //超过最大长度的1/3则直接保存或是最后的片段
|
|
|
+ if (merge.length() > max_leng / 3) {
|
|
|
+ if(merge.length()>max_leng){
|
|
|
+ System.out.println(merge.length());
|
|
|
+ }
|
|
|
+ saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,lastTitle);
|
|
|
+ }else if(j==paragraphs.size()-1){
|
|
|
+ if(merge.length()>max_leng){
|
|
|
+ System.out.println(merge.length());
|
|
|
+ }
|
|
|
+ saveTxt(inputFilePath, outputDirectory, i++, inputFile, merge,StringUtils.isNotEmpty(title)?title:lastTitle);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|