|
@@ -1,6 +1,12 @@
|
|
package com.lantone.qc.trans.beilun.util;
|
|
package com.lantone.qc.trans.beilun.util;
|
|
|
|
|
|
|
|
+import com.google.common.collect.Lists;
|
|
import com.google.common.collect.Maps;
|
|
import com.google.common.collect.Maps;
|
|
|
|
+import org.jsoup.Jsoup;
|
|
|
|
+import org.jsoup.nodes.Document;
|
|
|
|
+import org.jsoup.nodes.Element;
|
|
|
|
+import org.jsoup.nodes.Node;
|
|
|
|
+import org.jsoup.select.Elements;
|
|
|
|
|
|
import java.util.List;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Map;
|
|
@@ -11,15 +17,45 @@ import java.util.Map;
|
|
* @Date: 2020/9/10 13:48
|
|
* @Date: 2020/9/10 13:48
|
|
*/
|
|
*/
|
|
public class CommonAnalysisUtil {
|
|
public class CommonAnalysisUtil {
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 将html内容以行为单位存进list
|
|
|
|
+ *
|
|
|
|
+ * @param html 原始html内容
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ public List<String> html2List(String html) {
|
|
|
|
+ List<String> htmlText = Lists.newArrayList();
|
|
|
|
+ Document document = Jsoup.parse(html);
|
|
|
|
+ Element body = document.select("body").first();
|
|
|
|
+ List<Node> nodes = body.childNodes();
|
|
|
|
+ List<Node> subNodes = nodes.get(0).childNodes();
|
|
|
|
+ boolean findNode = false;
|
|
|
|
+ for (Node node : subNodes) {
|
|
|
|
+ if ("hr".equals(node.nodeName())) {
|
|
|
|
+ findNode = true;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+ if (findNode) {
|
|
|
|
+ Element element = (Element) node;
|
|
|
|
+ Elements elements = element.select("div");
|
|
|
|
+ for (Element e : elements) {
|
|
|
|
+ htmlText.add(e.text());
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return htmlText;
|
|
|
|
+ }
|
|
|
|
+
|
|
/**
|
|
/**
|
|
- * 将html内容转换成structureMap
|
|
|
|
|
|
+ * 将list中html内容转换成structureMap
|
|
*
|
|
*
|
|
- * @param titles
|
|
|
|
- * @param htmlText
|
|
|
|
|
|
+ * @param titles 文书各标题
|
|
|
|
+ * @param htmlText html内容以行的形式存储list
|
|
* @return
|
|
* @return
|
|
*/
|
|
*/
|
|
public Map<String, String> html2StructureMap(List<String> titles, List<String> htmlText) {
|
|
public Map<String, String> html2StructureMap(List<String> titles, List<String> htmlText) {
|
|
- Map<String, String> structmap = Maps.newLinkedHashMap();
|
|
|
|
|
|
+ Map<String, String> structureMap = Maps.newLinkedHashMap();
|
|
StringBuffer sb = new StringBuffer();
|
|
StringBuffer sb = new StringBuffer();
|
|
for (String line : htmlText) {
|
|
for (String line : htmlText) {
|
|
String text = line.replaceAll("[ ]", "");
|
|
String text = line.replaceAll("[ ]", "");
|
|
@@ -28,19 +64,19 @@ public class CommonAnalysisUtil {
|
|
}
|
|
}
|
|
sb.append(text).append("\n");
|
|
sb.append(text).append("\n");
|
|
}
|
|
}
|
|
- cutByTitles(sb.toString(), titles, 0, structmap);
|
|
|
|
- return structmap;
|
|
|
|
|
|
+ cutByTitles(sb.toString(), titles, 0, structureMap);
|
|
|
|
+ return structureMap;
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
* 根据文书各标题截取相应文本,存入structmap中
|
|
* 根据文书各标题截取相应文本,存入structmap中
|
|
*
|
|
*
|
|
- * @param line 原始文本
|
|
|
|
- * @param titles 文书各标题
|
|
|
|
- * @param depth 递归深度,也就是titles取值时的下标值
|
|
|
|
- * @param structmap 存储结构化数据
|
|
|
|
|
|
+ * @param line 原始文本
|
|
|
|
+ * @param titles 文书各标题
|
|
|
|
+ * @param depth 递归深度,也就是titles取值时的下标值
|
|
|
|
+ * @param structureMap 存储结构化数据
|
|
*/
|
|
*/
|
|
- private void cutByTitles(String line, List<String> titles, int depth, Map<String, String> structmap) {
|
|
|
|
|
|
+ private void cutByTitles(String line, List<String> titles, int depth, Map<String, String> structureMap) {
|
|
if (depth > titles.size()) {
|
|
if (depth > titles.size()) {
|
|
return;
|
|
return;
|
|
}
|
|
}
|
|
@@ -50,19 +86,19 @@ public class CommonAnalysisUtil {
|
|
newTitle = title + ":";
|
|
newTitle = title + ":";
|
|
if (depth == titles.size()) {
|
|
if (depth == titles.size()) {
|
|
value = line.substring(0, line.indexOf("\n"));
|
|
value = line.substring(0, line.indexOf("\n"));
|
|
- structmap.put(beforeTitle, value.trim());
|
|
|
|
|
|
+ structureMap.put(beforeTitle, value.trim());
|
|
return;
|
|
return;
|
|
}
|
|
}
|
|
if (line.contains(newTitle)) {
|
|
if (line.contains(newTitle)) {
|
|
if (depth > 0) {
|
|
if (depth > 0) {
|
|
value = line.substring(0, line.indexOf(newTitle));
|
|
value = line.substring(0, line.indexOf(newTitle));
|
|
- structmap.put(beforeTitle, value.trim());
|
|
|
|
|
|
+ structureMap.put(beforeTitle, value.trim());
|
|
}
|
|
}
|
|
line = line.substring(line.indexOf(newTitle) + newTitle.length());
|
|
line = line.substring(line.indexOf(newTitle) + newTitle.length());
|
|
depth++;
|
|
depth++;
|
|
} else {
|
|
} else {
|
|
titles.remove(depth);
|
|
titles.remove(depth);
|
|
}
|
|
}
|
|
- cutByTitles(line, titles, depth, structmap);
|
|
|
|
|
|
+ cutByTitles(line, titles, depth, structureMap);
|
|
}
|
|
}
|
|
}
|
|
}
|