Jelajahi Sumber

北仑医院首次病程录、出院记录(出院小结)解析html优化

huj 4 tahun lalu
induk
melakukan
38ecd3857e

+ 5 - 0
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunFirstCourseRecordHtmlAnalysis.java

@@ -1,6 +1,7 @@
 package com.lantone.qc.trans.beilun.util;
 
 import com.google.common.collect.Lists;
+import com.lantone.qc.pub.util.StringUtil;
 import org.springframework.beans.factory.annotation.Autowired;
 
 import java.util.List;
@@ -23,6 +24,10 @@ public class BeiLunFirstCourseRecordHtmlAnalysis implements BeiLunHtmlAnalysis {
         String recTitle = args[1];
         List<String> htmlText = commonAnalysisUtil.html2List(html);
         Map<String, String> structureMap = commonAnalysisUtil.html2StructureMap(titles, htmlText);
+        String date = commonAnalysisUtil.extractDate(htmlText.get(0));
+        if (StringUtil.isNotBlank(date)) {
+            structureMap.put("时间", date);
+        }
         structureMap.put("rec_title=", "107");
 
         return structureMap;

+ 22 - 2
trans/src/main/java/com/lantone/qc/trans/beilun/util/CommonAnalysisUtil.java

@@ -10,6 +10,8 @@ import org.jsoup.select.Elements;
 
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * @Description :
@@ -19,7 +21,7 @@ import java.util.Map;
 public class CommonAnalysisUtil {
 
     /**
-     * 将html内容以行为单位存进list
+     * 将html内容以行为单位存进list,从<hr>之后开始处理
      *
      * @param html 原始html内容
      * @return
@@ -44,6 +46,9 @@ public class CommonAnalysisUtil {
                 }
             }
         }
+        if (htmlText.get(0).length() > 200) {
+            htmlText.remove(0);
+        }
         return htmlText;
     }
 
@@ -58,7 +63,7 @@ public class CommonAnalysisUtil {
         Map<String, String> structureMap = Maps.newLinkedHashMap();
         StringBuffer sb = new StringBuffer();
         for (String line : htmlText) {
-            String text = line.replaceAll("[   ]", "");
+            String text = line.replaceAll("[   ]", " ");
             if (text.length() == 0) {
                 continue;
             }
@@ -101,4 +106,19 @@ public class CommonAnalysisUtil {
         }
         cutByTitles(line, titles, depth, structureMap);
     }
+
+    /**
+     * Extracts the first date-time substring found in the given text.
+     * <p>
+     * A match has the layout {@code yyyy-M-d H:m} with an optional {@code :s} suffix: a four-digit
+     * year, then one- or two-digit month, day, hour and minute (and optionally second), using
+     * {@code -} between the date parts, a single space before the time, and {@code :} between the
+     * time parts. Only the digit layout is checked; the values are not range-validated, so a month
+     * of {@code 99} would still match. The pattern is compiled on every call.
+     *
+     * @param top text to search; must not be null, because {@code Pattern.matcher} rejects null input
+     * @return the first matching substring exactly as it appears in {@code top}, or {@code null}
+     *         when the text contains no match
+     */
+    public String extractDate(String top) {
+        Pattern pattern = Pattern.compile("[0-9]{4}[-][0-9]{1,2}[-][0-9]{1,2}[ ][0-9]{1,2}[:][0-9]{1,2}([:][0-9]{1,2})?");
+        Matcher matcher = pattern.matcher(top);
+        if (matcher.find()) {
+            return matcher.group(0);
+        }
+        return null;
+    }
 }