@@ -0,0 +1,404 @@
+package org.diagbot.nlp.test;
+
+import org.diagbot.nlp.participle.ParticipleUtil;
+import org.diagbot.nlp.participle.cfg.Configuration;
+import org.diagbot.nlp.participle.cfg.DefaultConfig;
+import org.diagbot.nlp.participle.word.Lexeme;
+import org.diagbot.nlp.participle.word.LexemePath;
+import org.diagbot.nlp.participle.word.Segment;
+import org.diagbot.nlp.util.Constants;
+import org.diagbot.nlp.util.NegativeEnum;
+import org.diagbot.pub.jdbc.MysqlJdbc;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
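+/**
+ * Ad-hoc extraction test: segments the "xbs" (present-illness) text pulled from MySQL,
+ * pairs the entities recognized inside each sentence, and writes the candidate relation
+ * rows into the re_tagging_* tables for manual annotation.
+ */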
+public class EntityExtractTest {
+    // nlp-web database connection settings, shared by all queries below
+    private static final String DB_USER = "root";
+    private static final String DB_PASSWORD = "diagbot@20180822";
+    private static final String DB_URL = "jdbc:mysql://192.168.2.235:3306/nlp-web?useUnicode=true&characterEncoding=UTF-8";
+
+    public static void main(String[] args) {
+        // load the segmentation dictionary once and reuse it for every sentence
+        Configuration configuration = new DefaultConfig();
+        Segment segment = configuration.loadMainDict("tc.dict");
+
+        EntityExtractTest entityExtractTest = new EntityExtractTest();
+        List<Map<String, String>> data = entityExtractTest.searchData();
+        // property pairs that may form a relation, and lexicon-type id -> name lookup
+        Map<String, List<String>> pairList = entityExtractTest.searchPropertyPair();
+        Map<String, String> lexiconTypeMap = entityExtractTest.searchLexiconType();
+
+        String present = "";
+        String[] partPresents;
+        String sentenceId = "";
+        LexemePath<Lexeme> lexemes = null;
+        List<Map<String, Object>> results_pair = new ArrayList<>();
+        List<Map<String, Object>> results_none_pair = new ArrayList<>();
+        try {
+            for (int i = 0; i < data.size(); i++) {
+                present = data.get(i).get("xbs");
+                if (present == null) {
+                    continue;
+                }
+                present = present.replaceAll("\r\n", "");
+                sentenceId = data.get(i).get("zyxh");
+                // split the record into sentences on semicolons and the Chinese full stop
+                partPresents = present.split(";|;|。");
+                for (int k = 0; k < partPresents.length; k++) {
+                    if (partPresents[k].length() == 0) {
+                        continue;
+                    }
+                    lexemes = ParticipleUtil.participle(partPresents[k], segment);
+                    // merge leading numbers/"×" into the following time or unit lexeme
+                    lexemes = entityExtractTest.combineValidate(lexemes);
+                    for (int l_1 = 0; l_1 < lexemes.size(); l_1++) {
+                        Lexeme lexeme_1 = lexemes.get(l_1);
+                        if ("99".equals(lexeme_1.getProperty())) {
+                            continue;
+                        } else {
+                            // pair this entity with every later entity in the same sentence
+                            for (int l_2 = l_1 + 1; l_2 < lexemes.size(); l_2++) {
+                                Lexeme lexeme_2 = lexemes.get(l_2);
+                                if (!"99".equals(lexeme_2.getProperty())) {
+                                    boolean isPair = entityExtractTest.isPair(pairList, lexeme_1.getProperty(), lexeme_2.getProperty());
+                                    Map<String, Object> result = new HashMap<>();
+                                    result.put("sentence_id", sentenceId);
+                                    result.put("sentence_uuid", sentenceId + "_" + k);
+                                    result.put("sentence", partPresents[k]);
+                                    result.put("entity_1_name", lexeme_1.getText());
+                                    result.put("entity_1_position", lexeme_1.getOffset() + "," + (lexeme_1.getOffset() + lexeme_1.getLength() - 1));
+                                    result.put("entity_1_prop", lexeme_1.getProperty());
+                                    result.put("entity_1_prop_name", entityExtractTest.propId2Name(lexiconTypeMap, lexeme_1.getProperty()));
+                                    result.put("entity_2_name", lexeme_2.getText());
+                                    result.put("entity_2_position", lexeme_2.getOffset() + "," + (lexeme_2.getOffset() + lexeme_2.getLength() - 1));
+                                    result.put("entity_2_prop", lexeme_2.getProperty());
+                                    result.put("entity_2_prop_name", entityExtractTest.propId2Name(lexiconTypeMap, lexeme_2.getProperty()));
+                                    result.put("relation", "0");
+                                    if (isPair) {
+                                        results_pair.add(result);
+                                    } else {
+                                        results_none_pair.add(result);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            // persist candidates whose properties form a known pair, and the rest separately
+            if (results_pair.size() > 0) {
+                MysqlJdbc nlpJdbc = new MysqlJdbc(DB_USER, DB_PASSWORD, DB_URL);
+                nlpJdbc.insert(results_pair, "re_tagging_result_part_new", new String[]{"sentence_id", "sentence_uuid", "sentence", "entity_1_position", "entity_2_position",
+                        "entity_1_name", "entity_2_name", "entity_1_prop", "entity_2_prop", "entity_1_prop_name", "entity_2_prop_name", "relation"});
+            }
+
+            if (results_none_pair.size() > 0) {
+                MysqlJdbc nlpJdbc = new MysqlJdbc(DB_USER, DB_PASSWORD, DB_URL);
+                nlpJdbc.insert(results_none_pair, "re_tagging_result_part_new_none", new String[]{"sentence_id", "sentence_uuid", "sentence", "entity_1_position", "entity_2_position",
+                        "entity_1_name", "entity_2_name", "entity_1_prop", "entity_2_prop", "entity_1_prop_name", "entity_2_prop_name", "relation"});
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
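+    /**
+     * Segmentation-only variant of main(): walks the same records and sentence splits but
+     * stops after tokenization, without building or persisting entity pairs.
+     */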
+    public void participleReload() {
+        Configuration configuration = new DefaultConfig();
+        Segment segment = configuration.loadMainDict("tc.dict");
+
+        EntityExtractTest entityExtractTest = new EntityExtractTest();
+        List<Map<String, String>> data = entityExtractTest.searchData();
+
+        Map<String, List<String>> pairList = entityExtractTest.searchPropertyPair();
+
+        Map<String, String> lexiconTypeMap = entityExtractTest.searchLexiconType();
+
+        String present = "";
+        String[] partPresents;
+        String sentenceId = "";
+        LexemePath<Lexeme> lexemes = null;
+        try {
+            for (int i = 0; i < data.size(); i++) {
+                present = data.get(i).get("xbs");
+                if (present == null) {
+                    continue;
+                }
+                present = present.replaceAll("\r\n", "");
+                sentenceId = data.get(i).get("zyxh");
+                partPresents = present.split(";|;|。");
+                for (int k = 0; k < partPresents.length; k++) {
+                    if (partPresents[k].length() == 0) {
+                        continue;
+                    }
+                    // presumed intent of this skeleton: re-run segmentation per sentence
+                    lexemes = ParticipleUtil.participle(partPresents[k], segment);
+                }
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
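+    /**
+     * Pulls a slice of source records (zyxh id, xbs text) from tb_ryjl_extract.
+     */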
+    public List<Map<String, String>> searchData() {
+        MysqlJdbc nlpJdbc = new MysqlJdbc(DB_USER, DB_PASSWORD, DB_URL);
+        List<Map<String, String>> data = nlpJdbc.query("tb_ryjl_extract", new String[]{"zyxh", "xbs"}, " limit 15, 100");
+        return data;
+    }
+
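+    /**
+     * Builds the lexicon-type lookup: property id -> readable type name (re_lexicon_type).
+     */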
+    public Map<String, String> searchLexiconType() {
+        MysqlJdbc nlpJdbc = new MysqlJdbc(DB_USER, DB_PASSWORD, DB_URL);
+        List<Map<String, String>> data = nlpJdbc.query("re_lexicon_type", new String[]{"id", "name"}, " limit 0, 1000");
+
+        Map<String, String> result = new HashMap<>();
+        for (Map<String, String> map : data) {
+            result.put(map.get("id"), map.get("name"));
+        }
+        return result;
+    }
+
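+    /**
+     * Reads previously tagged rows (re_tagging_result_part) ordered by sentence and groups
+     * them by sentence_id / sentence_uuid; the flat row list is what gets returned.
+     */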
+    public List<Map<String, String>> searchTaggingResult() {
+        MysqlJdbc nlpJdbc = new MysqlJdbc(DB_USER, DB_PASSWORD, DB_URL);
+        List<Map<String, String>> data = nlpJdbc.query("re_tagging_result_part", new String[]{"sentence_id", "sentence_uuid", "sentence", "entity_1_name", "entity_2_name", "entity_1_position", "entity_2_position"}, " order by sentence_id, sentence_uuid");
+
+        // group rows as sentence_id -> (sentence_uuid -> rows); currently only built for
+        // bookkeeping, the method returns the flat row list below
+        Map<String, List<Map<String, List<Map<String, String>>>>> taggingResults = new HashMap<>();
+
+        String sentence_id = "";
+        String sentence_uuid = "";
+
+        List<Map<String, List<Map<String, String>>>> sentence_id_list = null;
+        Map<String, List<Map<String, String>>> sentence_id_map = null;
+        List<Map<String, String>> sentence_uuid_list = null;
+        for (Map<String, String> map : data) {
+            sentence_id = map.get("sentence_id");
+            sentence_uuid = map.get("sentence_uuid");
+            if (taggingResults.get(sentence_id) == null) {
+                // first row for this sentence_id: open a new uuid bucket
+                sentence_uuid_list = new ArrayList<>();
+                sentence_uuid_list.add(map);
+                sentence_id_map = new HashMap<>();
+                sentence_id_map.put(sentence_uuid, sentence_uuid_list);
+                sentence_id_list = new ArrayList<>();
+                sentence_id_list.add(sentence_id_map);
+                taggingResults.put(sentence_id, sentence_id_list);
+            } else {
+                // append rows that share a sentence_id to the existing group (assumed intent)
+                sentence_id_map = taggingResults.get(sentence_id).get(0);
+                sentence_uuid_list = sentence_id_map.get(sentence_uuid);
+                if (sentence_uuid_list == null) {
+                    sentence_uuid_list = new ArrayList<>();
+                    sentence_id_map.put(sentence_uuid, sentence_uuid_list);
+                }
+                sentence_uuid_list.add(map);
+            }
+        }
+        return data;
+    }
+
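+    /**
+     * Loads the allowed property pairs (re_lexicon_property_pair with has_relation = 1) as a
+     * symmetric map: each property id maps to the list of property ids it can pair with.
+     */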
+    public Map<String, List<String>> searchPropertyPair() {
+        MysqlJdbc nlpJdbc = new MysqlJdbc(DB_USER, DB_PASSWORD, DB_URL);
+        List<Map<String, String>> data = nlpJdbc.query("re_lexicon_property_pair", new String[]{"prop1_id", "prop2_id"}, " where has_relation = 1");
+
+        Map<String, List<String>> result = new HashMap<>();
+        List<String> list = null;
+        for (Map<String, String> map : data) {
+            String prop1_id = map.get("prop1_id");
+            String prop2_id = map.get("prop2_id");
+
+            // record the pair in both directions so lookups work regardless of order
+            list = result.get(prop1_id);
+            if (list == null) {
+                list = new ArrayList<>();
+                result.put(prop1_id, list);
+            }
+            if (!list.contains(prop2_id)) {
+                list.add(prop2_id);
+            }
+
+            list = result.get(prop2_id);
+            if (list == null) {
+                list = new ArrayList<>();
+                result.put(prop2_id, list);
+            }
+            if (!list.contains(prop1_id)) {
+                list.add(prop1_id);
+            }
+        }
+        return result;
+    }
+
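+    /**
+     * True if any property id of the first lexeme is paired with any property id of the
+     * second (lexeme properties may hold several comma-separated ids).
+     */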
+    private boolean isPair(Map<String, List<String>> result, String prop1_id, String prop2_id) {
+        String[] prop1_ids = prop1_id.split(",");
+        String[] prop2_ids = prop2_id.split(",");
+        List<String> list = null;
+        for (int i = 0; i < prop1_ids.length; i++) {
+            list = result.get(prop1_ids[i]);
+            if (list == null) continue;
+            for (int j = 0; j < prop2_ids.length; j++) {
+                if (list.contains(prop2_ids[j])) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
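+    /**
+     * Maps a (possibly comma-separated) property id string to the matching type names.
+     */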
+    public String propId2Name(Map<String, String> map, String prop_id) {
+        String[] prop_ids = prop_id.split(",");
+
+        StringBuilder prop_name = new StringBuilder();
+        for (int i = 0; i < prop_ids.length; i++) {
+            if (i > 0) {
+                prop_name.append(",");
+            }
+            prop_name.append(map.get(prop_ids[i]));
+        }
+        return prop_name.toString();
+    }
+
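+    /**
+     * Concatenates the surface text of a list of lexemes back into a single string.
+     */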
+    public String lexeme2Text(List<Lexeme> lexemes) {
+        StringBuilder text = new StringBuilder();
+        for (Lexeme l : lexemes) {
+            text.append(l.getText());
+        }
+        return text.toString();
+    }
+
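+    /**
+     * Post-processes a segmentation result: time and unit lexemes absorb the number (and a
+     * preceding "×") in front of them, so e.g. "3" + "天" becomes a single "3天" lexeme.
+     */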
+    private LexemePath<Lexeme> combineValidate(LexemePath<Lexeme> lexemes) {
+        Lexeme l = null;
+        LexemePath<Lexeme> results = new LexemePath<>();
+        for (int i = 0, len = lexemes.size(); i < len; i++) {
+            l = lexemes.get(i);
+            if (l.getProperty() != null
+                    && (l.getProperty().equals(Constants.word_property_time) || l.getProperty().equals(Constants.word_property_unit))) {
+                findLast(lexemes, i, l, results);
+            } else {
+                results.add(l);
+            }
+        }
+        return results;
+    }
+
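+    /**
+     * Pulls the lexemes immediately before a time/unit lexeme (an "×" sign and/or a number)
+     * into it, removing the absorbed lexemes from the result path.
+     */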
+    private void findLast(LexemePath<Lexeme> lexemes, int index, Lexeme lexeme, LexemePath<Lexeme> results) {
+        Lexeme last_l = null;
+        if (index > 0) {
+            index--;
+            last_l = lexemes.get(index);
+            // absorb an "×" sign sitting directly in front of the time/unit lexeme
+            if ("×".equals(last_l.getText()) && index > 0) {
+                lexeme.setOffset(last_l.getOffset());
+                lexeme.setLength(last_l.getLength() + lexeme.getLength());
+                lexeme.setText(last_l.getText() + lexeme.getText());
+                results.remove(results.size() - 1);
+                index--;
+                last_l = lexemes.get(index);
+            }
+            // absorb a preceding number, e.g. "3" + "天" -> "3天"
+            if (isNumberString(last_l)) {
+                lexeme.setOffset(last_l.getOffset());
+                lexeme.setLength(last_l.getLength() + lexeme.getLength());
+                lexeme.setText(last_l.getText() + lexeme.getText());
+                results.remove(results.size() - 1);
+            }
+        }
+        // keep the (possibly merged) lexeme even when it is the first one in the sentence
+        results.add(lexeme);
+    }
+
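+    /**
+     * A lexeme counts as numeric if its property is NUMBER_QUANTIFIER or its text contains a digit.
+     */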
+    public static boolean isNumberString(Lexeme l) {
+        if (l == null) return false;
+        if (isFeature(l.getProperty(), new NegativeEnum[]{NegativeEnum.NUMBER_QUANTIFIER})) {
+            return true;
+        }
+        for (char c : l.getText().toCharArray()) {
+            if (c >= '0' && c <= '9') {
+                return true;
+            }
+        }
+        return false;
+    }
+
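+    /**
+     * True if any of the (comma-separated) property values parses to one of the given features.
+     */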
+    public static boolean isFeature(String property, NegativeEnum[] features) {
+        if (property == null) {
+            return false;
+        }
+        if (features == null || features.length == 0) {
+            return true;
+        }
+        if (property.indexOf(",") > 0) {
+            String[] properties = property.split(",");
+            for (int i = 0; i < properties.length; i++) {
+                for (NegativeEnum nenum : features) {
+                    if (NegativeEnum.parseOfValue(properties[i]) == nenum) {
+                        return true;
+                    }
+                }
+            }
+        } else {
+            for (NegativeEnum nenum : features) {
+                if (NegativeEnum.parseOfValue(property) == nenum) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
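+    /**
+     * Plain holder for one tagged row; its fields mirror the re_tagging_result_part columns.
+     */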
+    class TaggingResult {
+        private String sentence_id;
+        private String sentence_uuid;
+        private String sentence;
+        private String entity_1_name;
+        private String entity_2_name;
+        private String entity_1_position;
+        private String entity_2_position;
+
+        List<TaggingResult> taggingResults = new ArrayList<>();
+
+        public String getSentence_id() {
+            return sentence_id;
+        }
+
+        public void setSentence_id(String sentence_id) {
+            this.sentence_id = sentence_id;
+        }
+
+        public String getSentence_uuid() {
+            return sentence_uuid;
+        }
+
+        public void setSentence_uuid(String sentence_uuid) {
+            this.sentence_uuid = sentence_uuid;
+        }
+
+        public String getSentence() {
+            return sentence;
+        }
+
+        public void setSentence(String sentence) {
+            this.sentence = sentence;
+        }
+
+        public String getEntity_1_name() {
+            return entity_1_name;
+        }
+
+        public void setEntity_1_name(String entity_1_name) {
+            this.entity_1_name = entity_1_name;
+        }
+
+        public String getEntity_2_name() {
+            return entity_2_name;
+        }
+
+        public void setEntity_2_name(String entity_2_name) {
+            this.entity_2_name = entity_2_name;
+        }
+
+        public String getEntity_1_position() {
+            return entity_1_position;
+        }
+
+        public void setEntity_1_position(String entity_1_position) {
+            this.entity_1_position = entity_1_position;
+        }
+
+        public String getEntity_2_position() {
+            return entity_2_position;
+        }
+
+        public void setEntity_2_position(String entity_2_position) {
+            this.entity_2_position = entity_2_position;
+        }
+
+        public List<TaggingResult> getTaggingResults() {
+            return taggingResults;
+        }
+
+        public void setTaggingResults(List<TaggingResult> taggingResults) {
+            this.taggingResults = taggingResults;
+        }
+    }
+
+}