|
@@ -10,6 +10,8 @@ import org.jsoup.select.Elements;
|
|
|
|
|
|
import java.util.List;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Map;
|
|
|
|
+import java.util.regex.Matcher;
|
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
|
|
/**
|
|
/**
|
|
* @Description :
|
|
* @Description :
|
|
@@ -19,7 +21,7 @@ import java.util.Map;
|
|
public class CommonAnalysisUtil {
|
|
public class CommonAnalysisUtil {
|
|
|
|
|
|
/**
|
|
/**
|
|
- * 将html内容以行为单位存进list
|
|
|
|
|
|
+ * 将html内容以行为单位存进list,从<hr>之后开始处理
|
|
*
|
|
*
|
|
* @param html 原始html内容
|
|
* @param html 原始html内容
|
|
* @return
|
|
* @return
|
|
@@ -44,6 +46,9 @@ public class CommonAnalysisUtil {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ if (htmlText.get(0).length() > 200) {
|
|
|
|
+ htmlText.remove(0);
|
|
|
|
+ }
|
|
return htmlText;
|
|
return htmlText;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -58,7 +63,7 @@ public class CommonAnalysisUtil {
|
|
Map<String, String> structureMap = Maps.newLinkedHashMap();
|
|
Map<String, String> structureMap = Maps.newLinkedHashMap();
|
|
StringBuffer sb = new StringBuffer();
|
|
StringBuffer sb = new StringBuffer();
|
|
for (String line : htmlText) {
|
|
for (String line : htmlText) {
|
|
- String text = line.replaceAll("[ ]", "");
|
|
|
|
|
|
+ String text = line.replaceAll("[ ]", " ");
|
|
if (text.length() == 0) {
|
|
if (text.length() == 0) {
|
|
continue;
|
|
continue;
|
|
}
|
|
}
|
|
@@ -101,4 +106,19 @@ public class CommonAnalysisUtil {
|
|
}
|
|
}
|
|
cutByTitles(line, titles, depth, structureMap);
|
|
cutByTitles(line, titles, depth, structureMap);
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 抽取文本中的第一个时间
|
|
|
|
+ *
|
|
|
|
+ * @param top
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ public String extractDate(String top) {
|
|
|
|
+ Pattern pattern = Pattern.compile("[0-9]{4}[-][0-9]{1,2}[-][0-9]{1,2}[ ][0-9]{1,2}[:][0-9]{1,2}([:][0-9]{1,2})?");
|
|
|
|
+ Matcher matcher = pattern.matcher(top);
|
|
|
|
+ if (matcher.find()) {
|
|
|
|
+ return matcher.group(0);
|
|
|
|
+ }
|
|
|
|
+ return null;
|
|
|
|
+ }
|
|
}
|
|
}
|