Bläddra i källkod

1.html解析修改解析方式

huj 4 år sedan
förälder
incheckning
94147771fc

+ 23 - 14
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunConsultationHtmlAnalysis.java

@@ -7,7 +7,6 @@ import org.springframework.beans.factory.annotation.Autowired;
 
 import java.util.List;
 import java.util.Map;
-import java.util.stream.Collectors;
 
 /**
  * @Description:
@@ -27,16 +26,22 @@ public class BeiLunConsultationHtmlAnalysis implements BeiLunHtmlAnalysis {
         String recTitle = args[1];
         Map<String, String> structureMap = Maps.newLinkedHashMap();
         List<String> htmlText = null;
+        String htmlContent = null;
         switch (recTitle) {
             case "377":
-                htmlText = commonAnalysisUtil.html2List(html, true);
-                commonAnalysisUtil.removeRepeat(htmlText);
-                commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
+                htmlContent = commonAnalysisUtil.html2String(html);
+                if (StringUtil.isNotBlank(htmlContent)) {
+                    htmlContent = htmlContent.replaceAll("[   ]", " ");
+                    commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
+                }
                 structureMap.put("rec_title=", recTitle);
                 break;
             case "7883":
-                htmlText = commonAnalysisUtil.html2List(html, false);
-                commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
+                htmlContent = commonAnalysisUtil.html2String(html);
+                if (StringUtil.isNotBlank(htmlContent)) {
+                    htmlContent = htmlContent.replaceAll("[   ]", " ");
+                    commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
+                }
                 if (StringUtil.isNotBlank(structureMap.get("会诊时间"))) {
                     String[] strArr = structureMap.get("会诊时间").split(" ");
                     structureMap.put("会诊意见", strArr[strArr.length - 1]);
@@ -46,18 +51,22 @@ public class BeiLunConsultationHtmlAnalysis implements BeiLunHtmlAnalysis {
                 break;
             case "8084":
                 titles = Lists.newArrayList("姓名", "性别", "出生日期", "联系电话",
-                        "申请科室","入院/首诊时间", "住院号", "病情概述(含主诉、病史、诊断、诊治过程等)", "拟申请MDT时间、地点",
+                        "申请科室", "入院/首诊时间", "住院号", "病情概述(含主诉、病史、诊断、诊治过程等)", "拟申请MDT时间、地点",
                         "拟请MDT参加科室", "MDT目的", "申请人签名", "申请递交时间", "科主任签字", "专家诊治建议", "专家科室", "签名",
-                        "填写时间","主持科室小结(MDT的最终诊治决议)","科主任(主持人)签名","记录人(主管医师)签字","日期");
-                htmlText = Lists.newArrayList();
-                commonAnalysisUtil.html2ListByTable(html, htmlText);
-                htmlText = htmlText.stream().distinct().collect(Collectors.toList());
-                commonAnalysisUtil.html2StructureMapNoColon(titles, htmlText, structureMap);
+                        "填写时间", "主持科室小结(MDT的最终诊治决议)", "科主任(主持人)签名", "记录人(主管医师)签字", "日期");
+                htmlContent = commonAnalysisUtil.html2String(html);
+                if (StringUtil.isNotBlank(htmlContent)) {
+                    htmlContent = htmlContent.replaceAll("[   ]", " ");
+                    commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
+                }
                 structureMap.put("rec_title=", recTitle);
                 break;
             default:
-                htmlText = commonAnalysisUtil.html2List(html, true);
-                commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
+                htmlContent = commonAnalysisUtil.html2String(html);
+                if (StringUtil.isNotBlank(htmlContent)) {
+                    htmlContent = htmlContent.replaceAll("[   ]", " ");
+                    commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
+                }
                 structureMap.put("rec_title=", "371");
                 break;
         }

+ 6 - 3
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunCriticallyIllNoticeHtmlAnalysis.java

@@ -2,6 +2,7 @@ package com.lantone.qc.trans.beilun.util;
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.lantone.qc.pub.util.StringUtil;
 import org.springframework.beans.factory.annotation.Autowired;
 
 import java.util.List;
@@ -23,9 +24,11 @@ public class BeiLunCriticallyIllNoticeHtmlAnalysis implements BeiLunHtmlAnalysis
         String html = args[0];
         String recTitle = args[1];
         Map<String, String> structureMap = Maps.newLinkedHashMap();
-        List<String> htmlText = commonAnalysisUtil.html2List(html, true);
-        commonAnalysisUtil.removeRepeat(htmlText);
-        commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
+        String htmlContent = commonAnalysisUtil.html2String(html);
+        if (StringUtil.isNotBlank(htmlContent)) {
+            htmlContent = htmlContent.replaceAll("[   ]", " ");
+            commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
+        }
         commonAnalysisUtil.extractDateByTitle(structureMap, "告知时间");
         structureMap.put("rec_title=", "405");
         return structureMap;

+ 7 - 4
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunDeathRecordHtmlAnalysis.java

@@ -2,6 +2,7 @@ package com.lantone.qc.trans.beilun.util;
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.lantone.qc.pub.util.StringUtil;
 import org.springframework.beans.factory.annotation.Autowired;
 
 import java.util.List;
@@ -19,13 +20,15 @@ public class BeiLunDeathRecordHtmlAnalysis implements BeiLunHtmlAnalysis {
     @Override
     public Map<String, String> analysis(String... args) {
         List<String> titles = Lists.newArrayList("入院日期", "死亡时间", "入院情况", "入院诊断",
-                "诊疗经过(重点记录病情演变、抢救经过)", "死亡原因", "死亡诊断", "医师签字");
+                "诊疗经过(重点记录病情演变、抢救经过)", "死亡原因", "死亡诊断", "医师签字", "记录时间");
         String html = args[0];
         String recTitle = args[1];
         Map<String, String> structureMap = Maps.newLinkedHashMap();
-        List<String> htmlText = commonAnalysisUtil.html2List(html, true);
-        commonAnalysisUtil.removeRepeat(htmlText);
-        commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
+        String htmlContent = commonAnalysisUtil.html2String(html);
+        if (StringUtil.isNotBlank(htmlContent)) {
+            htmlContent = htmlContent.replaceAll("[   ]", " ");
+            commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
+        }
         structureMap.put("rec_title=", "5254");
         return structureMap;
     }

+ 10 - 6
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunDifficultCaseDiscussHtmlAnalysis.java

@@ -2,6 +2,7 @@ package com.lantone.qc.trans.beilun.util;
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.lantone.qc.pub.util.StringUtil;
 import org.springframework.beans.factory.annotation.Autowired;
 
 import java.util.List;
@@ -18,15 +19,18 @@ public class BeiLunDifficultCaseDiscussHtmlAnalysis implements BeiLunHtmlAnalysi
 
     @Override
     public Map<String, String> analysis(String... args) {
-        List<String> titles = Lists.newArrayList("姓名", "性别", "年龄", "床号", "住院号",
-                "住址(单位)", "讨论日期", "讨论地点", "主持人", "参加人员", "具体讨论意见", "主持人小结意见", "主持人签名",
-                "记录医生签名");
+        List<String> titles = Lists.newArrayList("姓名", "性别", "出生日期", "科别", "病区", "床号",
+                "住院号", "年龄", "床号", "住院号", "住址(单位)", "讨论日期", "讨论地点", "主持人", "参加人员", "具体讨论意见",
+                "主持人小结意见", "主持人签名", "记录医生签名");
         String html = args[0];
         String recTitle = args[1];
         Map<String, String> structureMap = Maps.newLinkedHashMap();
-        List<String> htmlText = commonAnalysisUtil.html2List(html, true);
-        commonAnalysisUtil.removeRepeat(htmlText);
-        commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
+        String htmlContent = commonAnalysisUtil.html2String(html);
+        if (StringUtil.isNotBlank(htmlContent)) {
+            htmlContent = htmlContent.replaceAll("[   ]", " ");
+            commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
+        }
+        commonAnalysisUtil.extractDateByTitle(structureMap, "告知时间");
         structureMap.put("rec_title=", "141");
         return structureMap;
     }

+ 5 - 5
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunFirstCourseRecordHtmlAnalysis.java

@@ -24,12 +24,12 @@ public class BeiLunFirstCourseRecordHtmlAnalysis implements BeiLunHtmlAnalysis {
         String html = args[0];
         String recTitle = args[1];
         Map<String, String> structureMap = Maps.newLinkedHashMap();
-        List<String> htmlText = commonAnalysisUtil.html2List(html, true);
-        commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
-        String date = commonAnalysisUtil.extractDate(htmlText.get(0));
-        if (StringUtil.isNotBlank(date)) {
-            structureMap.put("时间", date);
+        String htmlContent = commonAnalysisUtil.html2String(html);
+        if (StringUtil.isNotBlank(htmlContent)) {
+            htmlContent = htmlContent.replaceAll("[   ]", " ");
+            commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
         }
+        commonAnalysisUtil.extractDateByTitle(structureMap, "时间");
         structureMap.put("rec_title=", "107");
 
         return structureMap;

+ 6 - 3
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunLeaveHospitalHtmlAnalysis.java

@@ -2,6 +2,7 @@ package com.lantone.qc.trans.beilun.util;
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.lantone.qc.pub.util.StringUtil;
 import org.springframework.beans.factory.annotation.Autowired;
 
 import java.util.List;
@@ -23,9 +24,11 @@ public class BeiLunLeaveHospitalHtmlAnalysis implements BeiLunHtmlAnalysis {
         String html = args[0];
         String recTitle = args[1];
         Map<String, String> structureMap = Maps.newLinkedHashMap();
-        List<String> htmlText = commonAnalysisUtil.html2List(html, true);
-        htmlText.remove(0);//去除第一个div内容
-        commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
+        String htmlContent = commonAnalysisUtil.html2String(html);
+        if (StringUtil.isNotBlank(htmlContent)) {
+            htmlContent = htmlContent.replaceAll("[   ]", " ");
+            commonAnalysisUtil.html2StructureMap(titles, htmlContent, structureMap);
+        }
         structureMap.put("rec_title=", "183");
         return structureMap;
     }

+ 90 - 0
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunThreeLevelWardHtmlAnalysis.java

@@ -0,0 +1,90 @@
+package com.lantone.qc.trans.beilun.util;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.Elements;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @Description: Parses the HTML of a three-level ward-round record. The first extracted line is
+ * treated as the heading (its date goes to "查房日期", the remaining text to "查房标题"); every
+ * following non-empty line is joined with newlines and stored under "病情记录".
+ * @author: HUJING
+ * @time: 2020/9/15 10:28
+ */
+public class BeiLunThreeLevelWardHtmlAnalysis implements BeiLunHtmlAnalysis {
+    /**
+     * Character class replaced by a single plain space before any further processing.
+     * NOTE(review): presumably lists distinct space variants (e.g. non-breaking / full-width);
+     * verify the actual bytes of this literal in the repository.
+     */
+    private static final String SPACE_VARIANTS = "[   ]";
+
+    @Autowired
+    CommonAnalysisUtil commonAnalysisUtil;
+
+    /**
+     * Builds the structured map for one record.
+     *
+     * @param args {@code args[0]} is the raw HTML of the record; further elements are not read here
+     * @return insertion-ordered map that always contains "rec_title=" and, when the HTML yielded
+     *         any text, "病情记录" plus (if a date was found in the heading) "查房日期" and "查房标题"
+     */
+    @Override
+    public Map<String, String> analysis(String... args) {
+        String html = args[0];
+        Map<String, String> structureMap = Maps.newLinkedHashMap();
+        List<String> htmlText = html2List(html, true);
+        // Nothing extractable: skip parsing instead of failing on an empty list.
+        if (!htmlText.isEmpty()) {
+            // The first line is the heading; take it out so it is not repeated in the record text.
+            String dateTitle = htmlText.remove(0).replaceAll(SPACE_VARIANTS, " ");
+            String date = commonAnalysisUtil.extractDate(dateTitle);
+            if (date != null) {
+                structureMap.put("查房日期", date);
+                structureMap.put("查房标题", dateTitle.replace(date, "").trim());
+            }
+            StringBuilder record = new StringBuilder();
+            for (String line : htmlText) {
+                String text = line.replaceAll(SPACE_VARIANTS, " ");
+                if (text.isEmpty()) {
+                    continue;
+                }
+                record.append(text).append("\n");
+            }
+            structureMap.put("病情记录", record.toString());
+        }
+        // NOTE(review): "5254" is also the rec_title used by the death-record parser — confirm
+        // that this is the intended code for ward-round records.
+        structureMap.put("rec_title=", "5254");
+        return structureMap;
+    }
+
+    /**
+     * Collects the text of the record line by line. Only children of the first node under
+     * {@code <body>} are inspected, and only those carrying {@code title="main"} contribute text.
+     *
+     * @param html    raw HTML of the record; {@code null} yields an empty list
+     * @param existHr when {@code true}, nodes are ignored until the first {@code <hr>} has been
+     *                seen; when {@code false}, processing starts with the first node
+     * @return mutable list with the text of every {@code div} selected inside the main blocks,
+     *         excluding texts longer than 150 characters; empty when nothing matched
+     */
+    public static List<String> html2List(String html, boolean existHr) {
+        List<String> htmlText = Lists.newArrayList();
+        if (html == null) {
+            return htmlText;
+        }
+        Document document = Jsoup.parse(html);
+        Element body = document.select("body").first();
+        if (body == null || body.childNodes().isEmpty()) {
+            return htmlText;
+        }
+        List<Node> subNodes = body.childNodes().get(0).childNodes();
+        boolean findNode = false;
+        for (Node node : subNodes) {
+            if ("hr".equals(node.nodeName())) {
+                findNode = true;
+                continue;
+            }
+            if (existHr && !findNode) {
+                continue;
+            }
+            if (!(node instanceof Element) || !"main".equals(node.attr("title"))) {
+                continue;
+            }
+            for (Element div : ((Element) node).select("div")) {
+                String text = div.text();
+                // Long texts are dropped — presumably wrapper divs that repeat the text of their
+                // nested divs; confirm against real documents.
+                if (text.length() <= 150) {
+                    htmlText.add(text);
+                }
+            }
+        }
+        // Every collected entry is at most 150 characters, so no further length check is needed.
+        return htmlText;
+    }
+
+}

+ 29 - 7
trans/src/main/java/com/lantone/qc/trans/beilun/util/CommonAnalysisUtil.java

@@ -21,6 +21,30 @@ import java.util.regex.Pattern;
  */
 public class CommonAnalysisUtil {
 
+    /**
+     * Returns the complete text of a record.
+     *
+     * <p>Only the first child node of {@code <body>} is considered; its text (including the text
+     * of all its descendants) is returned when that node is an element.
+     *
+     * @param html raw HTML of the record, may be {@code null}
+     * @return the text of the first element under {@code <body>}, or {@code null} when
+     *         {@code html} is {@code null}, the body has no child nodes, or the first child is
+     *         not an element
+     */
+    public String html2String(String html) {
+        if (html == null) {
+            return null;
+        }
+        Document document = Jsoup.parse(html);
+        Element body = document.select("body").first();
+        if (body == null) {
+            return null;
+        }
+        List<Node> nodes = body.childNodes();
+        // An empty body has no first node; report "no content" instead of failing on get(0).
+        if (nodes.isEmpty()) {
+            return null;
+        }
+        Node node = nodes.get(0);
+        if (node instanceof Element) {
+            return ((Element) node).text();
+        }
+        return null;
+    }
+
+    /**
+     * Fills {@code structureMap} from the plain text of a record.
+     *
+     * @param titles       section titles to look for; handed to {@code sortTitles} first
+     *                     (presumably reordered in place to follow the text — confirm)
+     * @param htmlText     complete text of the record
+     * @param structureMap receives the values that {@code cutByTitles} extracts
+     */
+    public void html2StructureMap(List<String> titles, String htmlText, Map<String, String> structureMap) {
+        // Cutting always starts at the first title.
+        final int startDepth = 0;
+        this.sortTitles(titles, htmlText);
+        this.cutByTitles(htmlText, titles, startDepth, structureMap);
+    }
+
     /**
      * 将html内容以行为单位存进list,从<hr>之后开始处理
      *
@@ -91,9 +115,7 @@ public class CommonAnalysisUtil {
             }
             sb.append(text).append("\n");
         }
-        String content = sb.toString();
-        sortTitles(titles, content);
-        cutByTitles(sb.toString(), titles, 0, structureMap);
+        html2StructureMap(titles, sb.toString(), structureMap);
     }
 
     /**
@@ -112,7 +134,7 @@ public class CommonAnalysisUtil {
         beforeTitle = titles.get(Math.max(depth - 1, 0));
         title = titles.get(Math.min(depth, titles.size() - 1));
         if (depth == titles.size()) {
-            value = line.substring(0, line.indexOf("\n"));
+            value = line;
             structureMap.put(beforeTitle, value.trim());
             return;
         }
@@ -227,7 +249,7 @@ public class CommonAnalysisUtil {
      * @return
      */
     public String extractDate(String top) {
-        Pattern pattern = Pattern.compile("[0-9]{4}[-][0-9]{1,2}[-][0-9]{1,2}[ ][0-9]{1,2}[:][0-9]{1,2}([:][0-9]{1,2})?");
+        Pattern pattern = Pattern.compile("[0-9]{4}[-][0-9]{1,2}[-][0-9]{1,2}([ ][0-9]{1,2}[:][0-9]{1,2}([:][0-9]{1,2})?)?");
         Matcher matcher = pattern.matcher(top);
         if (matcher.find()) {
             return matcher.group(0);
@@ -286,8 +308,8 @@ public class CommonAnalysisUtil {
             }
         }
 
-        for (int i : index) {
-            htmlList.remove(i);
+        for (int i = 0; i < index.size(); i++) {
+            htmlList.remove(index.get(i) - i);
         }
     }
 }