|
@@ -1,7 +1,7 @@
|
|
|
package com.lantone.qc.trans.beilun.util;
|
|
|
|
|
|
import com.google.common.collect.Lists;
|
|
|
-import com.google.common.collect.Maps;
|
|
|
+import com.lantone.qc.pub.util.StringUtil;
|
|
|
import org.jsoup.Jsoup;
|
|
|
import org.jsoup.nodes.Document;
|
|
|
import org.jsoup.nodes.Element;
|
|
@@ -10,6 +10,7 @@ import org.jsoup.select.Elements;
|
|
|
|
|
|
import java.util.List;
|
|
|
import java.util.Map;
|
|
|
+import java.util.TreeMap;
|
|
|
import java.util.regex.Matcher;
|
|
|
import java.util.regex.Pattern;
|
|
|
|
|
@@ -26,7 +27,7 @@ public class CommonAnalysisUtil {
|
|
|
* @param html 原始html内容
|
|
|
* @return
|
|
|
*/
|
|
|
- public List<String> html2List(String html) {
|
|
|
+ public List<String> html2List(String html, boolean existHr) {
|
|
|
List<String> htmlText = Lists.newArrayList();
|
|
|
Document document = Jsoup.parse(html);
|
|
|
Element body = document.select("body").first();
|
|
@@ -38,11 +39,15 @@ public class CommonAnalysisUtil {
|
|
|
findNode = true;
|
|
|
continue;
|
|
|
}
|
|
|
- if (findNode) {
|
|
|
+ if (findNode || !existHr) {
|
|
|
Element element = (Element) node;
|
|
|
Elements elements = element.select("div");
|
|
|
for (Element e : elements) {
|
|
|
- htmlText.add(e.text());
|
|
|
+ String text = e.text();
|
|
|
+ if (text.length() > 150) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ htmlText.add(text);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -52,15 +57,32 @@ public class CommonAnalysisUtil {
|
|
|
return htmlText;
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * 将html内容以table的格式存进list
|
|
|
+ *
|
|
|
+ * @param html 原始html内容
|
|
|
+ * @return
|
|
|
+ */
|
|
|
+ public void html2ListByTable(String html, List<String> htmlText) {
|
|
|
+ Elements trs = Jsoup.parse(html).select("table").select("tr");
|
|
|
+ for (int i = 0; i < trs.size(); i++) {
|
|
|
+ Elements tds = trs.get(i).select("td");
|
|
|
+ for (int j = 0; j < tds.size(); j++) {
|
|
|
+ String text = tds.get(j).text();
|
|
|
+ htmlText.add(text);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 将list中html内容转换成structureMap
|
|
|
*
|
|
|
- * @param titles 文书各标题
|
|
|
- * @param htmlText html内容以行的形式存储list
|
|
|
+ * @param titles 文书各标题
|
|
|
+ * @param htmlText html内容以行的形式存储list
|
|
|
+ * @param structureMap
|
|
|
* @return
|
|
|
*/
|
|
|
- public Map<String, String> html2StructureMap(List<String> titles, List<String> htmlText) {
|
|
|
- Map<String, String> structureMap = Maps.newLinkedHashMap();
|
|
|
+ public void html2StructureMap(List<String> titles, List<String> htmlText, Map<String, String> structureMap) {
|
|
|
StringBuffer sb = new StringBuffer();
|
|
|
for (String line : htmlText) {
|
|
|
String text = line.replaceAll("[ ]", " ");
|
|
@@ -69,8 +91,9 @@ public class CommonAnalysisUtil {
|
|
|
}
|
|
|
sb.append(text).append("\n");
|
|
|
}
|
|
|
+ String content = sb.toString();
|
|
|
+ sortTitles(titles, content);
|
|
|
cutByTitles(sb.toString(), titles, 0, structureMap);
|
|
|
- return structureMap;
|
|
|
}
|
|
|
|
|
|
/**
|
|
@@ -88,13 +111,17 @@ public class CommonAnalysisUtil {
|
|
|
String beforeTitle = null, title = null, newTitle = null, value = null;
|
|
|
beforeTitle = titles.get(Math.max(depth - 1, 0));
|
|
|
title = titles.get(Math.min(depth, titles.size() - 1));
|
|
|
- newTitle = title + ":";
|
|
|
if (depth == titles.size()) {
|
|
|
value = line.substring(0, line.indexOf("\n"));
|
|
|
structureMap.put(beforeTitle, value.trim());
|
|
|
return;
|
|
|
}
|
|
|
- if (line.contains(newTitle)) {
|
|
|
+ if (line.contains(title + ":") || line.contains(title + ":")) {
|
|
|
+ if (line.contains(title + ":")) {
|
|
|
+ newTitle = title + ":";
|
|
|
+ } else {
|
|
|
+ newTitle = title + ":";
|
|
|
+ }
|
|
|
if (depth > 0) {
|
|
|
value = line.substring(0, line.indexOf(newTitle));
|
|
|
structureMap.put(beforeTitle, value.trim());
|
|
@@ -107,6 +134,92 @@ public class CommonAnalysisUtil {
|
|
|
cutByTitles(line, titles, depth, structureMap);
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * 将title根据在文本中的位置排序
|
|
|
+ *
|
|
|
+ * @param titles
|
|
|
+ * @param content
|
|
|
+ * @return
|
|
|
+ */
|
|
|
+ public List<String> sortTitles(List<String> titles, String content) {
|
|
|
+ Map<Integer, String> titleIndex = new TreeMap<>();
|
|
|
+ int index, index_1, index_2;
|
|
|
+ for (String title : titles) {
|
|
|
+ index_1 = content.indexOf(title + ":");
|
|
|
+ index_2 = content.indexOf(title + ":");
|
|
|
+ index = Math.max(index_1, index_2);
|
|
|
+ if (index != -1) {
|
|
|
+ titleIndex.put(index, title);
|
|
|
+ content = content.substring(0, index) + content.substring(index + title.length() + 1);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ titles = Lists.newArrayList(titleIndex.values());
|
|
|
+ return titles;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 标题没有冒号版本
|
|
|
+ */
|
|
|
+ public void html2StructureMapNoColon(List<String> titles, List<String> htmlText, Map<String, String> structureMap) {
|
|
|
+ StringBuffer sb = new StringBuffer();
|
|
|
+ for (String line : htmlText) {
|
|
|
+ String text = line.replaceAll("[ ]", " ");
|
|
|
+ if (text.length() == 0) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ sb.append(text).append("\n");
|
|
|
+ }
|
|
|
+ String content = sb.toString();
|
|
|
+ List<String> sortTitles = sortTitlesNoColon(titles, content);
|
|
|
+ cutByTitlesNoColon(sb.toString(), sortTitles, 0, structureMap);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 标题没有冒号版本
|
|
|
+ */
|
|
|
+ private void cutByTitlesNoColon(String line, List<String> titles, int depth, Map<String, String> structureMap) {
|
|
|
+ if (depth > titles.size()) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ String beforeTitle = null, title = null, newTitle = null, value = null;
|
|
|
+ beforeTitle = titles.get(Math.max(depth - 1, 0));
|
|
|
+ title = titles.get(Math.min(depth, titles.size() - 1));
|
|
|
+ if (depth == titles.size()) {
|
|
|
+ value = line.substring(0, line.indexOf("\n"));
|
|
|
+ structureMap.put(beforeTitle, value.trim());
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ if (line.contains(title)) {
|
|
|
+ newTitle = title;
|
|
|
+ if (depth > 0) {
|
|
|
+ value = line.substring(0, line.indexOf(newTitle));
|
|
|
+ structureMap.put(beforeTitle, value.trim());
|
|
|
+ }
|
|
|
+ line = line.substring(line.indexOf(newTitle) + newTitle.length());
|
|
|
+ depth++;
|
|
|
+ } else {
|
|
|
+ titles.remove(depth);
|
|
|
+ }
|
|
|
+ cutByTitlesNoColon(line, titles, depth, structureMap);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 标题没有冒号版本
|
|
|
+ */
|
|
|
+ public List<String> sortTitlesNoColon(List<String> titles, String content) {
|
|
|
+ Map<Integer, String> titleIndex = new TreeMap<>();
|
|
|
+ int index;
|
|
|
+ for (String title : titles) {
|
|
|
+ index = content.indexOf(title);
|
|
|
+ if (index != -1) {
|
|
|
+ titleIndex.put(index, title);
|
|
|
+ content = content.replace(title, "");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ titles = Lists.newArrayList(titleIndex.values());
|
|
|
+ return titles;
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 抽取文本中的第一个时间
|
|
|
*
|
|
@@ -114,11 +227,67 @@ public class CommonAnalysisUtil {
|
|
|
* @return
|
|
|
*/
|
|
|
public String extractDate(String top) {
|
|
|
- Pattern pattern = Pattern.compile("[0-9]{4}[-][0-9]{1,2}[-][0-9]{1,2}[ ][0-9]{1,2}[:][0-9]{1,2}([:][0-9]{1,2})?");
|
|
|
+ Pattern pattern = Pattern.compile("[0-9]{4}[-][0-9]{1,2}[-][0-9]{1,2}[ ][0-9]{1,2}[:][0-9]{1,2}([:][0-9]{1,2})?");
|
|
|
Matcher matcher = pattern.matcher(top);
|
|
|
if (matcher.find()) {
|
|
|
return matcher.group(0);
|
|
|
}
|
|
|
return null;
|
|
|
}
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 根据title重新存放时间
|
|
|
+ *
|
|
|
+ * @param structmap
|
|
|
+ * @param title
|
|
|
+ */
|
|
|
+ public void extractDateByTitle(Map<String, String> structmap, String title) {
|
|
|
+ if (structmap.containsKey(title)) {
|
|
|
+ String date = extractDate(structmap.get(title));
|
|
|
+ if (StringUtil.isNotBlank(date)) {
|
|
|
+ structmap.put(title, date);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 若内容中是包含选择框(会诊类型: 急会诊 普通会诊 请院外会诊),特殊处理
|
|
|
+ *
|
|
|
+ * @param structureMap
|
|
|
+ */
|
|
|
+ public void processType(Map<String, String> structureMap, String title) {
|
|
|
+ if (structureMap.containsKey(title)) {
|
|
|
+ String type = structureMap.get(title);
|
|
|
+ String[] types = type.split(" ");
|
|
|
+ for (String t : types) {
|
|
|
+ if (t.contains("\uF0FE")) {
|
|
|
+ structureMap.put(title, t.replace("\uF0FE", ""));
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 若list中其中一个元素包含之后第二个、第三个元素的文本,则把这个元素删除
|
|
|
+ *
|
|
|
+ * @param htmlList
|
|
|
+ */
|
|
|
+ public void removeRepeat(List<String> htmlList) {
|
|
|
+ List<Integer> index = Lists.newArrayList();
|
|
|
+ if (htmlList.size() < 3) return;
|
|
|
+ String str1 = null, str2 = null, str3 = null;
|
|
|
+ for (int i = 0; i < htmlList.size() - 2; i++) {
|
|
|
+ str1 = htmlList.get(i);
|
|
|
+ str2 = htmlList.get(i + 1);
|
|
|
+ str3 = htmlList.get(i + 2);
|
|
|
+ if (str1.contains(str2) && str1.contains(str3)) {
|
|
|
+ index.add(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int i : index) {
|
|
|
+ htmlList.remove(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|