|
@@ -5,11 +5,13 @@ import org.diagbot.nlp.participle.cfg.DefaultConfig;
|
|
|
import org.diagbot.nlp.participle.word.Lexeme;
|
|
|
import org.diagbot.nlp.participle.word.LexemePath;
|
|
|
import org.diagbot.nlp.participle.word.Segment;
|
|
|
+import org.diagbot.nlp.util.Constants;
|
|
|
import org.diagbot.nlp.util.NlpUtil;
|
|
|
import org.diagbot.nlp.util.NlpCache;
|
|
|
|
|
|
import java.io.IOException;
|
|
|
import java.io.StringReader;
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
|
|
/**
|
|
|
* Created by fyeman on 2018/1/31.
|
|
@@ -18,35 +20,24 @@ public class ParticipleUtil {
|
|
|
|
|
|
public static LexemePath<Lexeme> participle(String content, boolean isCombineUnit) throws IOException {
|
|
|
LexemePath<Lexeme> lexemes = participle(content);
|
|
|
-
|
|
|
- LexemePath<Lexeme> results = new LexemePath<>();
|
|
|
if (isCombineUnit) {
|
|
|
- Lexeme l = null;
|
|
|
- Lexeme last_l = null;
|
|
|
- for (int i = 0, len = lexemes.size(); i < len; i++) {
|
|
|
- l = lexemes.get(i);
|
|
|
- if (l.getProperty() != null && ("2".equals(l.getProperty()) || "9".equals(l.getProperty()))) {
|
|
|
- if (i > 0) {
|
|
|
- last_l = lexemes.get(i - 1);
|
|
|
- if (NlpUtil.isNumberString(last_l)) {
|
|
|
- last_l.setText(last_l.getText() + l.getText());
|
|
|
- last_l.setProperty(l.getProperty());
|
|
|
- last_l.setLength(last_l.getLength() + l.getLength());
|
|
|
-
|
|
|
- results.remove(results.size() - 1);
|
|
|
- results.add(last_l);
|
|
|
- } else {
|
|
|
- results.add(l);
|
|
|
- }
|
|
|
- } else {
|
|
|
- results.add(l);
|
|
|
+ ParticipleUtil util = new ParticipleUtil();
|
|
|
+ lexemes = util.combineValidate(lexemes);
|
|
|
+ lexemes = util.joinTime(lexemes);
|
|
|
+
|
|
|
+ String year_pattern = "([1-2][0-9]{3}|[0-9]{2})";
|
|
|
+ String mouth_day_pattern = "([0-9]{2}|[0-9])";
|
|
|
+ String join_pattern = "([-/.]?)";
|
|
|
+ String pattern_string = year_pattern + join_pattern + mouth_day_pattern + join_pattern + mouth_day_pattern;
|
|
|
+ for (Lexeme l : lexemes) {
|
|
|
+ if (l.getProperty().equals(Constants.word_property_number)) {
|
|
|
+ if (Pattern.matches(pattern_string,l.getText())) {
|
|
|
+ l.setProperty(Constants.word_property_time);
|
|
|
}
|
|
|
- } else {
|
|
|
- results.add(l);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- return results;
|
|
|
+ return lexemes;
|
|
|
}
|
|
|
|
|
|
public static LexemePath<Lexeme> participle(String content) throws IOException {
|
|
@@ -77,19 +68,117 @@ public class ParticipleUtil {
|
|
|
LexemePath<Lexeme> lexemePath = participle(content, true);
|
|
|
String separator = " * ";
|
|
|
StringBuffer sb = new StringBuffer();
|
|
|
- for (Lexeme lexeme : lexemePath) {
|
|
|
+ Lexeme lexeme = null;
|
|
|
+ boolean new_line = true;
|
|
|
+ for (int i = 0; i < lexemePath.size(); i++) {
|
|
|
+ lexeme = lexemePath.get(i);
|
|
|
if (lexeme.getOffset() > -1) {
|
|
|
- if (lexeme.getProperty() != null && !"99".equals(lexeme.getProperty())) {
|
|
|
- sb.append(separator);
|
|
|
- sb.append("<font color='blue'>");
|
|
|
+ if ("\r".equals(lexeme.getText()) || "\n".equals(lexeme.getText())) {
|
|
|
sb.append(lexeme.getText());
|
|
|
- sb.append("</font>");
|
|
|
+ new_line = true;
|
|
|
} else {
|
|
|
- sb.append(separator);
|
|
|
- sb.append(lexeme.getText());
|
|
|
+ if (!new_line) {
|
|
|
+ sb.append(separator);
|
|
|
+ }
|
|
|
+ if (lexeme.getProperty() != null && !"99".equals(lexeme.getProperty())) {
|
|
|
+ sb.append("<font color='blue'>");
|
|
|
+ sb.append(lexeme.getText());
|
|
|
+ sb.append("</font>");
|
|
|
+ } else {
|
|
|
+ sb.append(lexeme.getText());
|
|
|
+ }
|
|
|
+ new_line = false;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- return sb.toString();
|
|
|
+ String s = sb.toString();
|
|
|
+ s = s.replaceAll("\r\n", "</br>");
|
|
|
+ return s;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void main(String args[]) {
|
|
|
+ String year_pattern = "([1-2][0-9]{3}|[0-9]{2})";
|
|
|
+ String mouth_day_pattern = "([0-9]{2}|[0-9])";
|
|
|
+ String join_pattern = "([-/.]?)";
|
|
|
+
|
|
|
+ String pattern_string = year_pattern + join_pattern + mouth_day_pattern + join_pattern + mouth_day_pattern;
|
|
|
+
|
|
|
+ System.out.println(Pattern.matches(pattern_string,"12.434"));
|
|
|
+ try {
|
|
|
+ ParticipleUtil util = new ParticipleUtil();
|
|
|
+ String content = "2017-01,9毫克7斤重量015年6月23日出现";
|
|
|
+ LexemePath<Lexeme> lexemes = util.participle(content);
|
|
|
+
|
|
|
+ lexemes = util.combineValidate(lexemes);
|
|
|
+
|
|
|
+ lexemes = util.joinTime(lexemes);
|
|
|
+ for (Lexeme l : lexemes) {
|
|
|
+ System.out.println(l.getText() + " | ");
|
|
|
+ if (l.getProperty().equals(Constants.word_property_number)) {
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ e.printStackTrace();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private LexemePath<Lexeme> combineValidate(LexemePath<Lexeme> lexemes) {
|
|
|
+ Lexeme l = null;
|
|
|
+ LexemePath<Lexeme> results = new LexemePath<>();
|
|
|
+ for (int i = 0, len = lexemes.size(); i < len; i++) {
|
|
|
+ l = lexemes.get(i);
|
|
|
+ if (l.getProperty() != null
|
|
|
+ && (l.getProperty().equals(Constants.word_property_time) || l.getProperty().equals(Constants.word_property_unit))) {
|
|
|
+ findLast(lexemes, i, l, results);
|
|
|
+ } else {
|
|
|
+ results.add(l);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return results;
|
|
|
+ }
|
|
|
+
|
|
|
+ private void findLast(LexemePath<Lexeme> lexemes, int index, Lexeme lexeme, LexemePath<Lexeme> results) {
|
|
|
+ Lexeme last_l = null;
|
|
|
+ if (index > 0) {
|
|
|
+ index--;
|
|
|
+ last_l = lexemes.get(index);
|
|
|
+ if (NlpUtil.isNumberString(last_l)) {
|
|
|
+ lexeme.setOffset(last_l.getOffset());
|
|
|
+ lexeme.setLength(last_l.getLength() + lexeme.getLength());
|
|
|
+ lexeme.setText(last_l.getText() + lexeme.getText());
|
|
|
+ results.remove(results.size() - 1);
|
|
|
+ }
|
|
|
+ results.add(lexeme);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private LexemePath<Lexeme> joinTime(LexemePath<Lexeme> lexemes) {
|
|
|
+ Lexeme l = null;
|
|
|
+ Lexeme next_l = null;
|
|
|
+ LexemePath<Lexeme> results = new LexemePath<>();
|
|
|
+ int cursor = 1;
|
|
|
+ for (int i = 0, len = lexemes.size(); i < len; i++) {
|
|
|
+ l = lexemes.get(i);
|
|
|
+ if (l.getProperty() != null
|
|
|
+ && (l.getProperty().equals(Constants.word_property_time))) {
|
|
|
+ while (i + cursor < len) {
|
|
|
+ next_l = lexemes.get(i + cursor);
|
|
|
+ if (next_l.getProperty() != null
|
|
|
+ && (next_l.getProperty().equals(Constants.word_property_time))) {
|
|
|
+ l.setText(l.getText() + next_l.getText());
|
|
|
+ l.setLength(l.getLength() + next_l.getLength());
|
|
|
+ cursor++;
|
|
|
+ } else {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (cursor > 1) {
|
|
|
+ i = i + cursor - 1;
|
|
|
+ cursor = 1;
|
|
|
+ }
|
|
|
+ results.add(l);
|
|
|
+ }
|
|
|
+ return results;
|
|
|
}
|
|
|
}
|