浏览代码

1.北仑医院首次病程录解析html
2.解析html方法优化

huj 4 年之前
父节点
当前提交
2c2dfcc26c

+ 69 - 0
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunConsultationHtmlAnalysis.java

@@ -0,0 +1,69 @@
+package com.lantone.qc.trans.beilun.util;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.lantone.qc.pub.util.StringUtil;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * @Description: Converts the HTML of a Beilun consultation record into a title-to-text map.
+ * The second argument selects the layout: "377", "7883" and "8084" each have their own
+ * handling, any other value is parsed with the default layout and stored as "371".
+ * @author: rengb
+ * @time: 2020/9/12 13:53
+ */
+public class BeiLunConsultationHtmlAnalysis implements BeiLunHtmlAnalysis {
+    @Autowired
+    CommonAnalysisUtil commonAnalysisUtil;
+
+    /**
+     * Parses one consultation document.
+     *
+     * @param args {@code args[0]} is the raw HTML, {@code args[1]} is the record-title code
+     * @return insertion-ordered map of the extracted sections plus a {@code "rec_title="} entry
+     */
+    @Override
+    public Map<String, String> analysis(String... args) {
+        final String html = args[0];
+        final String recTitle = args[1];
+        final Map<String, String> structureMap = Maps.newLinkedHashMap();
+        // Value written under "rec_title=" once the layout-specific parsing is done.
+        String storedRecTitle = recTitle;
+        switch (recTitle) {
+            case "377": {
+                List<String> lines = commonAnalysisUtil.html2List(html, true);
+                commonAnalysisUtil.removeRepeat(lines);
+                commonAnalysisUtil.html2StructureMap(consultationTitles(), lines, structureMap);
+                break;
+            }
+            case "7883": {
+                List<String> lines = commonAnalysisUtil.html2List(html, false);
+                commonAnalysisUtil.html2StructureMap(consultationTitles(), lines, structureMap);
+                splitConsultationTime(structureMap);
+                break;
+            }
+            case "8084": {
+                // Table-based layout whose titles carry no trailing colon.
+                List<String> cells = Lists.newArrayList();
+                commonAnalysisUtil.html2ListByTable(html, cells);
+                List<String> uniqueCells = cells.stream().distinct().collect(Collectors.toList());
+                commonAnalysisUtil.html2StructureMapNoColon(mdtTitles(), uniqueCells, structureMap);
+                break;
+            }
+            default: {
+                List<String> lines = commonAnalysisUtil.html2List(html, true);
+                commonAnalysisUtil.html2StructureMap(consultationTitles(), lines, structureMap);
+                storedRecTitle = "371";
+                break;
+            }
+        }
+        structureMap.put("rec_title=", storedRecTitle);
+        commonAnalysisUtil.extractDateByTitle(structureMap, "会诊到达时间");
+        commonAnalysisUtil.processType(structureMap, "会诊类型");
+        return structureMap;
+    }
+
+    /**
+     * When "会诊时间" holds non-blank text, splits it on single spaces: the last piece is
+     * stored under "会诊意见" and the first piece replaces "会诊时间".
+     */
+    private void splitConsultationTime(Map<String, String> structureMap) {
+        String consultationTime = structureMap.get("会诊时间");
+        if (!StringUtil.isNotBlank(consultationTime)) {
+            return;
+        }
+        String[] pieces = consultationTime.split(" ");
+        structureMap.put("会诊意见", pieces[pieces.length - 1]);
+        structureMap.put("会诊时间", pieces[0]);
+    }
+
+    /** Fresh, mutable list of the section titles used by every layout except "8084". */
+    private static List<String> consultationTitles() {
+        return Lists.newArrayList("会诊类型", "被邀医院", "被邀科室", "申请时间",
+                "患者病情及诊疗经过、申请会诊的理由及目的", "申请会诊科别", "被邀会诊科别", "申请会诊医师", "会诊意见", "会诊时间", "科主任",
+                "会诊到达时间", "查体", "会诊建议", "会诊诊断", "会诊科室", "会诊医师", "外院会诊医师所在医疗机构名称", "会诊医师所在医疗机构名称");
+    }
+
+    /** Fresh, mutable list of the section titles used by the "8084" layout. */
+    private static List<String> mdtTitles() {
+        return Lists.newArrayList("姓名", "性别", "出生日期", "联系电话",
+                "申请科室","入院/首诊时间", "住院号", "病情概述(含主诉、病史、诊断、诊治过程等)", "拟申请MDT时间、地点",
+                "拟请MDT参加科室", "MDT目的", "申请人签名", "申请递交时间", "科主任签字", "专家诊治建议", "专家科室", "签名",
+                "填写时间","主持科室小结(MDT的最终诊治决议)","科主任(主持人)签名","记录人(主管医师)签字","日期");
+    }
+
+}

+ 5 - 3
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunFirstCourseRecordHtmlAnalysis.java

@@ -1,6 +1,7 @@
 package com.lantone.qc.trans.beilun.util;
 
 import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.lantone.qc.pub.util.StringUtil;
 import org.springframework.beans.factory.annotation.Autowired;
 
@@ -9,7 +10,7 @@ import java.util.Map;
 
 /**
  * @Description:
- * @author: rengb
+ * @author: HUJING
  * @time: 2020/9/9 11:24
  */
 public class BeiLunFirstCourseRecordHtmlAnalysis implements BeiLunHtmlAnalysis {
@@ -22,8 +23,9 @@ public class BeiLunFirstCourseRecordHtmlAnalysis implements BeiLunHtmlAnalysis {
                 "(四)初步诊断", "(五)诊疗计划", "医生签名");
         String html = args[0];
         String recTitle = args[1];
-        List<String> htmlText = commonAnalysisUtil.html2List(html);
-        Map<String, String> structureMap = commonAnalysisUtil.html2StructureMap(titles, htmlText);
+        Map<String, String> structureMap = Maps.newLinkedHashMap();
+        List<String> htmlText = commonAnalysisUtil.html2List(html, true);
+        commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
         String date = commonAnalysisUtil.extractDate(htmlText.get(0));
         if (StringUtil.isNotBlank(date)) {
             structureMap.put("时间", date);

+ 5 - 3
trans/src/main/java/com/lantone/qc/trans/beilun/util/BeiLunLeaveHospitalHtmlAnalysis.java

@@ -1,6 +1,7 @@
 package com.lantone.qc.trans.beilun.util;
 
 import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import org.springframework.beans.factory.annotation.Autowired;
 
 import java.util.List;
@@ -8,7 +9,7 @@ import java.util.Map;
 
 /**
  * @Description:
- * @author: rengb
+ * @author: HUJING
  * @time: 2020/9/9 11:24
  */
 public class BeiLunLeaveHospitalHtmlAnalysis implements BeiLunHtmlAnalysis {
@@ -21,9 +22,10 @@ public class BeiLunLeaveHospitalHtmlAnalysis implements BeiLunHtmlAnalysis {
                 "入院情况", "入院后检查化验结果", "诊疗经过", "出院计划", "病理检查结果", "出院情况", "治疗效果", "出院医嘱", "医师签名", "时间");
         String html = args[0];
         String recTitle = args[1];
-        List<String> htmlText = commonAnalysisUtil.html2List(html);
+        Map<String, String> structureMap = Maps.newLinkedHashMap();
+        List<String> htmlText = commonAnalysisUtil.html2List(html, true);
         htmlText.remove(0);//去除第一个div内容
-        Map<String, String> structureMap = commonAnalysisUtil.html2StructureMap(titles, htmlText);
+        commonAnalysisUtil.html2StructureMap(titles, htmlText, structureMap);
         structureMap.put("rec_title=", "183");
         return structureMap;
     }

+ 181 - 12
trans/src/main/java/com/lantone/qc/trans/beilun/util/CommonAnalysisUtil.java

@@ -1,7 +1,7 @@
 package com.lantone.qc.trans.beilun.util;
 
 import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
+import com.lantone.qc.pub.util.StringUtil;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -10,6 +10,7 @@ import org.jsoup.select.Elements;
 
 import java.util.List;
 import java.util.Map;
+import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -26,7 +27,7 @@ public class CommonAnalysisUtil {
      * @param html 原始html内容
      * @return
      */
-    public List<String> html2List(String html) {
+    public List<String> html2List(String html, boolean existHr) {
         List<String> htmlText = Lists.newArrayList();
         Document document = Jsoup.parse(html);
         Element body = document.select("body").first();
@@ -38,11 +39,15 @@ public class CommonAnalysisUtil {
                 findNode = true;
                 continue;
             }
-            if (findNode) {
+            if (findNode || !existHr) {
                 Element element = (Element) node;
                 Elements elements = element.select("div");
                 for (Element e : elements) {
-                    htmlText.add(e.text());
+                    String text = e.text();
+                    if (text.length() > 150) {
+                        continue;
+                    }
+                    htmlText.add(text);
                 }
             }
         }
@@ -52,15 +57,32 @@ public class CommonAnalysisUtil {
         return htmlText;
     }
 
+    /**
+     * Appends the text of every table cell found in the HTML to the supplied list.
+     *
+     * <p>Each {@code td} of each {@code tr} of each {@code table} is visited in the order
+     * jsoup returns them, and its text is added as one list entry.
+     *
+     * @param html     raw HTML content
+     * @param htmlText list that receives the cell texts; entries already present are kept
+     */
+    public void html2ListByTable(String html, List<String> htmlText) {
+        Document document = Jsoup.parse(html);
+        for (Element row : document.select("table").select("tr")) {
+            for (Element cell : row.select("td")) {
+                htmlText.add(cell.text());
+            }
+        }
+    }
+
     /**
      * 将list中html内容转换成structureMap
      *
-     * @param titles   文书各标题
-     * @param htmlText html内容以行的形式存储list
+     * @param titles       文书各标题
+     * @param htmlText     html内容以行的形式存储list
+     * @param structureMap
      * @return
      */
-    public Map<String, String> html2StructureMap(List<String> titles, List<String> htmlText) {
-        Map<String, String> structureMap = Maps.newLinkedHashMap();
+    public void html2StructureMap(List<String> titles, List<String> htmlText, Map<String, String> structureMap) {
         StringBuffer sb = new StringBuffer();
         for (String line : htmlText) {
             String text = line.replaceAll("[   ]", " ");
@@ -69,8 +91,9 @@ public class CommonAnalysisUtil {
             }
             sb.append(text).append("\n");
         }
+        String content = sb.toString();
+        sortTitles(titles, content);
         cutByTitles(sb.toString(), titles, 0, structureMap);
-        return structureMap;
     }
 
     /**
@@ -88,13 +111,17 @@ public class CommonAnalysisUtil {
         String beforeTitle = null, title = null, newTitle = null, value = null;
         beforeTitle = titles.get(Math.max(depth - 1, 0));
         title = titles.get(Math.min(depth, titles.size() - 1));
-        newTitle = title + ":";
         if (depth == titles.size()) {
             value = line.substring(0, line.indexOf("\n"));
             structureMap.put(beforeTitle, value.trim());
             return;
         }
-        if (line.contains(newTitle)) {
+        if (line.contains(title + ":") || line.contains(title + ":")) {
+            if (line.contains(title + ":")) {
+                newTitle = title + ":";
+            } else {
+                newTitle = title + ":";
+            }
             if (depth > 0) {
                 value = line.substring(0, line.indexOf(newTitle));
                 structureMap.put(beforeTitle, value.trim());
@@ -107,6 +134,92 @@ public class CommonAnalysisUtil {
         cutByTitles(line, titles, depth, structureMap);
     }
 
+    /**
+     * Orders the titles by the position at which they first appear in the text.
+     *
+     * <p>A title counts as present when it is immediately followed by an ASCII colon
+     * ({@code ':'}) or a full-width colon ({@code '\uFF1A'}); when both forms occur the
+     * earlier one is used. Titles are looked up in list order, and each match is blanked
+     * out before the next lookup so that a later title cannot match inside it.
+     *
+     * @param titles  candidate titles; this list is not modified
+     * @param content text to search
+     * @return a new list holding only the titles that were found, in ascending order of
+     *         position; callers must use this return value to see the ordering
+     */
+    public List<String> sortTitles(List<String> titles, String content) {
+        Map<Integer, String> titleIndex = new TreeMap<>();
+        // Matched spans are overwritten in place rather than deleted: deleting would shift
+        // every later index, so keys would no longer be real positions and two titles
+        // could end up on the same key and overwrite each other.
+        StringBuilder remaining = new StringBuilder(content);
+        for (String title : titles) {
+            int asciiIndex = remaining.indexOf(title + ":");
+            int wideIndex = remaining.indexOf(title + "\uFF1A");
+            int index;
+            if (asciiIndex == -1) {
+                index = wideIndex;
+            } else if (wideIndex == -1) {
+                index = asciiIndex;
+            } else {
+                index = Math.min(asciiIndex, wideIndex);
+            }
+            if (index != -1) {
+                titleIndex.put(index, title);
+                // Blank out the title and its colon (title.length() + 1 characters).
+                for (int k = index; k <= index + title.length(); k++) {
+                    remaining.setCharAt(k, '\0');
+                }
+            }
+        }
+        return new java.util.ArrayList<>(titleIndex.values());
+    }
+
+    /**
+     * Splits the document text into title-to-value entries for layouts whose titles are
+     * not followed by a colon.
+     *
+     * <p>Non-empty lines are joined with {@code "\n"}, the titles present in that text are
+     * ordered by position, and the text is then cut at each title in turn.
+     *
+     * @param titles       candidate titles; this list is not modified
+     * @param htmlText     document content, one entry per line
+     * @param structureMap receives one entry per title that was found; left untouched when
+     *                     no title occurs in the text
+     */
+    public void html2StructureMapNoColon(List<String> titles, List<String> htmlText, Map<String, String> structureMap) {
+        StringBuilder sb = new StringBuilder();
+        for (String line : htmlText) {
+            // NOTE(review): as visible here the character class holds only plain spaces, which
+            // makes this replacement a no-op; presumably it originally listed non-breaking or
+            // full-width space characters - confirm against the repository copy.
+            String text = line.replaceAll("[   ]", " ");
+            if (text.length() == 0) {
+                continue;
+            }
+            sb.append(text).append("\n");
+        }
+        String content = sb.toString();
+        List<String> sortTitles = sortTitlesNoColon(titles, content);
+        cutByTitlesNoColon(content, sortTitles, 0, structureMap);
+    }
+
+    /**
+     * Cuts {@code line} at each title from {@code depth} onwards and stores the text between
+     * two consecutive titles under the earlier one.
+     *
+     * <p>A title that does not occur in the remaining text is removed from {@code titles}.
+     * The last matched title receives the text up to the first line break (or all remaining
+     * text when there is none). Nothing is stored when no title is matched.
+     *
+     * @param line         text still to be cut
+     * @param titles       ordered titles; mutated when a title is not found
+     * @param depth        index of the next title to look for
+     * @param structureMap receives the extracted, trimmed values
+     */
+    private void cutByTitlesNoColon(String line, List<String> titles, int depth, Map<String, String> structureMap) {
+        while (depth < titles.size()) {
+            String title = titles.get(depth);
+            int at = line.indexOf(title);
+            if (at == -1) {
+                titles.remove(depth);
+                continue;
+            }
+            if (depth > 0) {
+                structureMap.put(titles.get(depth - 1), line.substring(0, at).trim());
+            }
+            line = line.substring(at + title.length());
+            depth++;
+        }
+        // No title matched at all (for example an empty document), or depth is out of range:
+        // there is no previous title to attach a value to.
+        if (depth == 0 || depth > titles.size()) {
+            return;
+        }
+        // NOTE(review): when the last title sits alone on its line the text before the first
+        // line break is empty, so an empty value is stored - confirm that this is intended.
+        int lineEnd = line.indexOf("\n");
+        String tail = lineEnd == -1 ? line : line.substring(0, lineEnd);
+        structureMap.put(titles.get(depth - 1), tail.trim());
+    }
+
+    /**
+     * Orders the titles by the position at which they first appear in the text (no colon
+     * is required after a title).
+     *
+     * <p>Titles are looked up in list order; after a title is found every occurrence of it
+     * is blanked out, so a later title that contains it can no longer match.
+     *
+     * @param titles  candidate titles; this list is not modified
+     * @param content text to search
+     * @return a new list holding only the titles that were found, in ascending order of
+     *         position
+     */
+    public List<String> sortTitlesNoColon(List<String> titles, String content) {
+        Map<Integer, String> titleIndex = new TreeMap<>();
+        // Occurrences are overwritten in place rather than deleted so that the indices found
+        // for later titles are still positions in the original text.
+        StringBuilder remaining = new StringBuilder(content);
+        for (String title : titles) {
+            int index = remaining.indexOf(title);
+            if (index == -1) {
+                continue;
+            }
+            titleIndex.put(index, title);
+            int length = title.length();
+            for (int at = index; length > 0 && at != -1; at = remaining.indexOf(title, at + length)) {
+                for (int k = at; k < at + length; k++) {
+                    remaining.setCharAt(k, '\0');
+                }
+            }
+        }
+        return new java.util.ArrayList<>(titleIndex.values());
+    }
+
     /**
      * 抽取文本中的第一个时间
      *
@@ -114,11 +227,67 @@ public class CommonAnalysisUtil {
      * @return
      */
     public String extractDate(String top) {
-        Pattern pattern = Pattern.compile("[0-9]{4}[-][0-9]{1,2}[-][0-9]{1,2}[][0-9]{1,2}[:][0-9]{1,2}([:][0-9]{1,2})?");
+        Pattern pattern = Pattern.compile("[0-9]{4}[-][0-9]{1,2}[-][0-9]{1,2}[ ][0-9]{1,2}[:][0-9]{1,2}([:][0-9]{1,2})?");
         Matcher matcher = pattern.matcher(top);
         if (matcher.find()) {
             return matcher.group(0);
         }
         return null;
     }
+
+    /**
+     * Replaces the value stored under a title with the first date-time found in it.
+     *
+     * <p>Nothing happens when the map has no such key or when no date-time is found in
+     * the current value.
+     *
+     * @param structmap map to update in place
+     * @param title     key whose value is searched
+     */
+    public void extractDateByTitle(Map<String, String> structmap, String title) {
+        if (!structmap.containsKey(title)) {
+            return;
+        }
+        String extracted = extractDate(structmap.get(title));
+        if (StringUtil.isNotBlank(extracted)) {
+            structmap.put(title, extracted);
+        }
+    }
+
+    /**
+     * Reduces a check-box style value to the single selected option.
+     *
+     * <p>The value stored under {@code title} is split on single spaces; the first piece
+     * containing the character {@code \uF0FE} (presumably the ticked-box glyph - confirm)
+     * replaces the stored value, with that character removed. The map is left unchanged
+     * when the key is absent or no piece contains the character.
+     *
+     * @param structureMap map to update in place
+     * @param title        key whose value is inspected
+     */
+    public void processType(Map<String, String> structureMap, String title) {
+        if (!structureMap.containsKey(title)) {
+            return;
+        }
+        final String selectedMark = "\uF0FE";
+        String[] options = structureMap.get(title).split(" ");
+        for (int k = 0; k < options.length; k++) {
+            String option = options[k];
+            if (option.contains(selectedMark)) {
+                structureMap.put(title, option.replace(selectedMark, ""));
+                return;
+            }
+        }
+    }
+
+    /**
+     * Removes every element whose text contains both of the two elements that follow it.
+     *
+     * <p>All elements are judged against the list as it was on entry; the removals are
+     * applied afterwards. Lists with fewer than three elements are left unchanged.
+     *
+     * @param htmlList list to clean up in place
+     */
+    public void removeRepeat(List<String> htmlList) {
+        int size = htmlList.size();
+        if (size < 3) {
+            return;
+        }
+        boolean[] redundant = new boolean[size];
+        for (int i = 0; i < size - 2; i++) {
+            String current = htmlList.get(i);
+            if (current.contains(htmlList.get(i + 1)) && current.contains(htmlList.get(i + 2))) {
+                redundant[i] = true;
+            }
+        }
+
+        // Remove from the back: deleting an element shifts everything after it, so removing
+        // in ascending order would hit the wrong elements (or run past the end of the list).
+        for (int i = size - 3; i >= 0; i--) {
+            if (redundant[i]) {
+                htmlList.remove(i);
+            }
+        }
+    }
 }