Browse Source

北仑医院首次病程录、出院记录(出院小结)解析html优化

huj 4 years ago
parent
commit
7f7bfbdaf2

+ 1 - 28
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunFirstCourseRecordHtmlAnalysis.java

@@ -1,11 +1,6 @@
 package com.lantone.qc.trans.beilun.util;
 package com.lantone.qc.trans.beilun.util;
 
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Lists;
-import com.lantone.qc.pub.util.StringUtil;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Autowired;
 
 
 import java.util.List;
 import java.util.List;
@@ -26,29 +21,7 @@ public class BeiLunFirstCourseRecordHtmlAnalysis implements BeiLunHtmlAnalysis {
                 "(四)初步诊断", "(五)诊疗计划", "医生签名");
                 "(四)初步诊断", "(五)诊疗计划", "医生签名");
         String html = args[0];
         String html = args[0];
         String recTitle = args[1];
         String recTitle = args[1];
-        Document document = Jsoup.parse(html);
-        List<String> htmlText = Lists.newArrayList();
-        Elements elements = document.select("div");
-        boolean findTitleMain = false;
-        String style = null, title = null;
-        for (Element element : elements) {
-            style = element.attr("style");
-            title = element.attr("title");
-            if (StringUtil.isBlank(style) || StringUtil.isBlank(title)) {
-                continue;
-            }
-            if ("".equals(style) && title.contains(".odt")) {
-                break;
-            }
-            if ("".equals(style) && "main".equals(title)) {
-                findTitleMain = true;
-                continue;
-            }
-            if (findTitleMain) {
-                String text = element.text();
-                htmlText.add(text);
-            }
-        }
+        List<String> htmlText = commonAnalysisUtil.html2List(html);
         Map<String, String> structureMap = commonAnalysisUtil.html2StructureMap(titles, htmlText);
         Map<String, String> structureMap = commonAnalysisUtil.html2StructureMap(titles, htmlText);
         structureMap.put("rec_title=", "107");
         structureMap.put("rec_title=", "107");
 
 

+ 1 - 12
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunLeaveHospitalHtmlAnalysis.java

@@ -1,11 +1,6 @@
 package com.lantone.qc.trans.beilun.util;
 package com.lantone.qc.trans.beilun.util;
 
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Autowired;
 
 
 import java.util.List;
 import java.util.List;
@@ -26,13 +21,7 @@ public class BeiLunLeaveHospitalHtmlAnalysis implements BeiLunHtmlAnalysis {
                 "入院情况", "入院后检查化验结果", "诊疗经过", "出院计划", "病理检查结果", "出院情况", "治疗效果", "出院医嘱", "医师签名", "时间");
                 "入院情况", "入院后检查化验结果", "诊疗经过", "出院计划", "病理检查结果", "出院情况", "治疗效果", "出院医嘱", "医师签名", "时间");
         String html = args[0];
         String html = args[0];
         String recTitle = args[1];
         String recTitle = args[1];
-        Document document = Jsoup.parse(html);
-        List<String> htmlText = Lists.newArrayList();
-        Elements elements = document.select("div");
-        for (Element element : elements) {
-            String text = element.text();
-            htmlText.add(text);
-        }
+        List<String> htmlText = commonAnalysisUtil.html2List(html);
         htmlText.remove(0);//去除第一个div内容
         htmlText.remove(0);//去除第一个div内容
         Map<String, String> structureMap = commonAnalysisUtil.html2StructureMap(titles, htmlText);
         Map<String, String> structureMap = commonAnalysisUtil.html2StructureMap(titles, htmlText);
         structureMap.put("rec_title=", "183");
         structureMap.put("rec_title=", "183");

+ 50 - 14
trans/src/main/java/com/lantone/qc/trans/beilun/util/CommonAnalysisUtil.java

@@ -1,6 +1,12 @@
 package com.lantone.qc.trans.beilun.util;
 package com.lantone.qc.trans.beilun.util;
 
 
+import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Maps;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.Elements;
 
 
 import java.util.List;
 import java.util.List;
 import java.util.Map;
 import java.util.Map;
@@ -11,15 +17,45 @@ import java.util.Map;
  * @Date: 2020/9/10 13:48
  * @Date: 2020/9/10 13:48
  */
  */
 public class CommonAnalysisUtil {
 public class CommonAnalysisUtil {
+
+    /**
+     * Parses the raw HTML and collects the text of every {@code div} that appears after the
+     * first {@code hr} separator, one list entry per matched {@code div}.
+     * <p>
+     * Only the children of the first child node of {@code body} are scanned. Everything before
+     * the {@code hr} is ignored, and the {@code hr} itself is not added to the result.
+     *
+     * @param html raw HTML content
+     * @return a new list holding the text of each matched {@code div} in document order;
+     *         empty when {@code body} has no children or no {@code hr} is found
+     */
+    public List<String> html2List(String html) {
+        List<String> htmlText = Lists.newArrayList();
+        Document document = Jsoup.parse(html);
+        Element body = document.select("body").first();
+        if (body == null || body.childNodes().isEmpty()) {
+            // Nothing to scan: return the empty list instead of failing on get(0).
+            return htmlText;
+        }
+        List<Node> subNodes = body.childNodes().get(0).childNodes();
+        boolean findNode = false;
+        for (Node node : subNodes) {
+            if ("hr".equals(node.nodeName())) {
+                findNode = true;
+                continue;
+            }
+            if (!findNode || !(node instanceof Element)) {
+                // Skip content before the separator, and skip text/comment nodes
+                // (e.g. whitespace between tags), which cannot be cast to Element.
+                continue;
+            }
+            for (Element e : ((Element) node).select("div")) {
+                htmlText.add(e.text());
+            }
+        }
+        return htmlText;
+    }
+
     /**
     /**
-     * 将html内容转换成structureMap
+     * 将list中html内容转换成structureMap
      *
      *
-     * @param titles
-     * @param htmlText
+     * @param titles   文书各标题
+     * @param htmlText html内容以行的形式存储list
      * @return
      * @return
      */
      */
     public Map<String, String> html2StructureMap(List<String> titles, List<String> htmlText) {
     public Map<String, String> html2StructureMap(List<String> titles, List<String> htmlText) {
-        Map<String, String> structmap = Maps.newLinkedHashMap();
+        Map<String, String> structureMap = Maps.newLinkedHashMap();
         StringBuffer sb = new StringBuffer();
         StringBuffer sb = new StringBuffer();
         for (String line : htmlText) {
         for (String line : htmlText) {
             String text = line.replaceAll("[   ]", "");
             String text = line.replaceAll("[   ]", "");
@@ -28,19 +64,19 @@ public class CommonAnalysisUtil {
             }
             }
             sb.append(text).append("\n");
             sb.append(text).append("\n");
         }
         }
-        cutByTitles(sb.toString(), titles, 0, structmap);
-        return structmap;
+        cutByTitles(sb.toString(), titles, 0, structureMap);
+        return structureMap;
     }
     }
 
 
     /**
     /**
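      * Cuts out the text that belongs to each document title and stores it in structureMap
      * Cuts out the text that belongs to each document title and stores it in structureMap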
      *
      *
-     * @param line      原始文本
-     * @param titles    文书各标题
-     * @param depth     递归深度,也就是titles取值时的下标值
-     * @param structmap 存储结构化数据
+     * @param line         原始文本
+     * @param titles       文书各标题
+     * @param depth        递归深度,也就是titles取值时的下标值
+     * @param structureMap 存储结构化数据
      */
      */
-    private void cutByTitles(String line, List<String> titles, int depth, Map<String, String> structmap) {
+    private void cutByTitles(String line, List<String> titles, int depth, Map<String, String> structureMap) {
         if (depth > titles.size()) {
         if (depth > titles.size()) {
             return;
             return;
         }
         }
@@ -50,19 +86,19 @@ public class CommonAnalysisUtil {
         newTitle = title + ":";
         newTitle = title + ":";
         if (depth == titles.size()) {
         if (depth == titles.size()) {
             value = line.substring(0, line.indexOf("\n"));
             value = line.substring(0, line.indexOf("\n"));
-            structmap.put(beforeTitle, value.trim());
+            structureMap.put(beforeTitle, value.trim());
             return;
             return;
         }
         }
         if (line.contains(newTitle)) {
         if (line.contains(newTitle)) {
             if (depth > 0) {
             if (depth > 0) {
                 value = line.substring(0, line.indexOf(newTitle));
                 value = line.substring(0, line.indexOf(newTitle));
-                structmap.put(beforeTitle, value.trim());
+                structureMap.put(beforeTitle, value.trim());
             }
             }
             line = line.substring(line.indexOf(newTitle) + newTitle.length());
             line = line.substring(line.indexOf(newTitle) + newTitle.length());
             depth++;
             depth++;
         } else {
         } else {
             titles.remove(depth);
             titles.remove(depth);
         }
         }
-        cutByTitles(line, titles, depth, structmap);
+        cutByTitles(line, titles, depth, structureMap);
     }
     }
 }
 }