|
@@ -127,7 +127,7 @@ public class CommonAnalysisUtil {
|
|
* @param structureMap 存储结构化数据
|
|
* @param structureMap 存储结构化数据
|
|
*/
|
|
*/
|
|
public static void cutByTitles(String line, List<String> titles, int depth, Map<String, String> structureMap) {
|
|
public static void cutByTitles(String line, List<String> titles, int depth, Map<String, String> structureMap) {
|
|
- if (depth > titles.size()) {
|
|
|
|
|
|
+ if (depth > titles.size() || titles.size() == 0) {
|
|
return;
|
|
return;
|
|
}
|
|
}
|
|
String beforeTitle = null, title = null, newTitle = null, value = null;
|
|
String beforeTitle = null, title = null, newTitle = null, value = null;
|
|
@@ -146,7 +146,7 @@ public class CommonAnalysisUtil {
|
|
}
|
|
}
|
|
if (depth > 0) {
|
|
if (depth > 0) {
|
|
value = line.substring(0, line.indexOf(newTitle));
|
|
value = line.substring(0, line.indexOf(newTitle));
|
|
- structureMap.put(beforeTitle, value.trim());
|
|
|
|
|
|
+ structureMap.put(beforeTitle.replace(" ", ""), value.trim());
|
|
}
|
|
}
|
|
line = line.substring(line.indexOf(newTitle) + newTitle.length());
|
|
line = line.substring(line.indexOf(newTitle) + newTitle.length());
|
|
depth++;
|
|
depth++;
|
|
@@ -182,32 +182,23 @@ public class CommonAnalysisUtil {
|
|
/**
|
|
/**
|
|
* 标题没有冒号版本
|
|
* 标题没有冒号版本
|
|
*/
|
|
*/
|
|
- public void html2StructureMapNoColon(List<String> titles, List<String> htmlText, Map<String, String> structureMap) {
|
|
|
|
- StringBuffer sb = new StringBuffer();
|
|
|
|
- for (String line : htmlText) {
|
|
|
|
- String text = line.replaceAll("[ ]", " ");
|
|
|
|
- if (text.length() == 0) {
|
|
|
|
- continue;
|
|
|
|
- }
|
|
|
|
- sb.append(text).append("\n");
|
|
|
|
- }
|
|
|
|
- String content = sb.toString();
|
|
|
|
- List<String> sortTitles = sortTitlesNoColon(titles, content);
|
|
|
|
- cutByTitlesNoColon(sb.toString(), sortTitles, 0, structureMap);
|
|
|
|
|
|
+ public static void html2StructureMapNoColon(List<String> titles, String htmlText, Map<String, String> structureMap) {
|
|
|
|
+ List<String> sortTitlesNoColon = sortTitlesNoColon(titles, htmlText);
|
|
|
|
+ cutByTitlesNoColon(htmlText, sortTitlesNoColon, 0, structureMap);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
* 标题没有冒号版本
|
|
* 标题没有冒号版本
|
|
*/
|
|
*/
|
|
- private void cutByTitlesNoColon(String line, List<String> titles, int depth, Map<String, String> structureMap) {
|
|
|
|
- if (depth > titles.size()) {
|
|
|
|
|
|
+ private static void cutByTitlesNoColon(String line, List<String> titles, int depth, Map<String, String> structureMap) {
|
|
|
|
+ if (depth > titles.size() || titles.size() == 0) {
|
|
return;
|
|
return;
|
|
}
|
|
}
|
|
String beforeTitle = null, title = null, newTitle = null, value = null;
|
|
String beforeTitle = null, title = null, newTitle = null, value = null;
|
|
beforeTitle = titles.get(Math.max(depth - 1, 0));
|
|
beforeTitle = titles.get(Math.max(depth - 1, 0));
|
|
title = titles.get(Math.min(depth, titles.size() - 1));
|
|
title = titles.get(Math.min(depth, titles.size() - 1));
|
|
if (depth == titles.size()) {
|
|
if (depth == titles.size()) {
|
|
- value = line.substring(0, line.indexOf("\n"));
|
|
|
|
|
|
+ value = line;
|
|
structureMap.put(beforeTitle, value.trim());
|
|
structureMap.put(beforeTitle, value.trim());
|
|
return;
|
|
return;
|
|
}
|
|
}
|
|
@@ -215,7 +206,7 @@ public class CommonAnalysisUtil {
|
|
newTitle = title;
|
|
newTitle = title;
|
|
if (depth > 0) {
|
|
if (depth > 0) {
|
|
value = line.substring(0, line.indexOf(newTitle));
|
|
value = line.substring(0, line.indexOf(newTitle));
|
|
- structureMap.put(beforeTitle, value.trim());
|
|
|
|
|
|
+ structureMap.put(beforeTitle.replace(" ", ""), value.trim());
|
|
}
|
|
}
|
|
line = line.substring(line.indexOf(newTitle) + newTitle.length());
|
|
line = line.substring(line.indexOf(newTitle) + newTitle.length());
|
|
depth++;
|
|
depth++;
|
|
@@ -228,7 +219,7 @@ public class CommonAnalysisUtil {
|
|
/**
|
|
/**
|
|
* 标题没有冒号版本
|
|
* 标题没有冒号版本
|
|
*/
|
|
*/
|
|
- public List<String> sortTitlesNoColon(List<String> titles, String content) {
|
|
|
|
|
|
+ public static List<String> sortTitlesNoColon(List<String> titles, String content) {
|
|
Map<Integer, String> titleIndex = new TreeMap<>();
|
|
Map<Integer, String> titleIndex = new TreeMap<>();
|
|
int index;
|
|
int index;
|
|
for (String title : titles) {
|
|
for (String title : titles) {
|