Parcourir la source

1- 添加再分词模块。

bijl il y a 5 ans
Parent
commit
df2aaaffbf

+ 7 - 0
algorithm/src/main/java/org/algorithm/core/neural/dataset/NNDataSet.java

@@ -21,6 +21,7 @@ public abstract class NNDataSet {
     
     protected final Map<String, Integer> LABEL_DICT = new HashMap<>();
     protected final Map<String, Integer> NEGATIVE_DICT = new HashMap<>();
+    protected final Map<String, String>  RE_SPLIT_WORD_DICT = new HashMap<>();
     private final String[] FEATURE_DICT_ARRAY;
     private final String[] LABEL_DICT_ARRAY;
 
@@ -32,6 +33,7 @@ public abstract class NNDataSet {
         this.FEATURE_DICT_ARRAY = new String[this.NUM_FEATURE];
         this.LABEL_DICT_ARRAY = new String[this.NUM_LABEL];
         this.makeDictArr();
+        this.readReSplitWordDict();
     }
     
     /**
@@ -45,6 +47,11 @@ public abstract class NNDataSet {
      * 读取特征和类别字典
      */
     public abstract void readDict(String modelAndVersion);
+
+    /**
+     * Reads the re-split word dictionary; invoked from the NNDataSet constructor, so subclass fields are not yet initialized.
+     */
+    public abstract void readReSplitWordDict();
     
     /**
      * 生成字典列表

+ 73 - 72
algorithm/src/main/java/org/algorithm/core/neural/dataset/NNDataSetImpl.java

@@ -3,6 +3,7 @@ package org.algorithm.core.neural.dataset;
 import org.algorithm.util.TextFileReader;
 import org.diagbot.pub.utils.PropertiesUtil;
 
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -10,7 +11,7 @@ import java.util.Map.Entry;
 
 /**
  * 门诊诊断推送用数据集
- * 
+ *
  * @Author: bijl
  * @Date: 2018年7月26日-上午10:19:43
  * @Description:
@@ -25,6 +26,8 @@ public class NNDataSetImpl extends NNDataSet {
 
     @Override
     public float[] toFeatureVector(Map<String, Map<String, String>> inputs) {
+
+        this.reSplitWord(inputs);  // 再分词
         float[] featureVector = new float[this.NUM_FEATURE];
 
         Iterator<Entry<String, Map<String, String>>> entries = inputs.entrySet().iterator();
@@ -32,13 +35,9 @@ public class NNDataSetImpl extends NNDataSet {
         String featureName = "";
         Integer position = -1;
         Integer negative = 0;
-        // Integer partbodyValue = 0;
         float positive_value = 1.0f;
         float negative_value = -1.0f;
         Map<String, String> featureValues = null;
-        // String partbody = null;
-        // String[] partbodys = null;
-        // String sn = null;
 
         /**
          * 数据方案设计
@@ -51,11 +50,6 @@ public class NNDataSetImpl extends NNDataSet {
             featureValues = entry.getValue();
             position = this.FEATURE_DICT.get(featureName);
             negative = NEGATIVE_DICT.get(featureValues.get("negative"));
-            // 突出主症状的数据方案
-            // sn = featureValues.get("sn");
-            // if("0".equals(sn)) {
-            // negative = negative * 10;
-            // }
 
             if (position != null)
                 if (negative == 1)
@@ -65,91 +59,36 @@ public class NNDataSetImpl extends NNDataSet {
                 else
                     System.out.println("New Nagetive! This may lead to an error.");
 
-
-
-            /**
-             * 部位附属症状数据表示方案 partbodyValue = this.PARTBODY_DICT.get(featureValues.get("partbody"));
-             * if(partbodyValue != null) { value = 1.0f * partbodyValue /
-             * this.PARTBODY_DICT.get("NULL"); // 部位值表示 value = (float)(Math.round(value *
-             * 100000))/100000; // 保留5位有效数字 } value = negative * value; featureVector[position] =
-             * value;
-             * 
-             */
-
         }
 
         return featureVector;
     }
 
-
-    /**
-     * 读取字典
-     */
-//     @Override
-//     public void readDict(String modelAndVersion) {
-//    
-//     PropertiesUtil prop = new PropertiesUtil("/algorithm.properties");
-//     String model_version = prop.getProperty(modelAndVersion);
-//     model_version = model_version.trim();
-//    
-//     String url = "jdbc:mysql://192.168.2.235/diagbot-app?user=root&password=diagbot@20180822";
-//     MysqlConnector connector = new MysqlConnector(url);
-//     String querySql = "SELECT md._name, md._index, md.type_id " + "FROM model_dictionary AS md "
-//     + "WHERE md.belong_model = 'outpatient_model'";
-//    
-//     querySql = querySql.replace("outpatient_model", model_version);
-//     ResultSet rs = connector.query(querySql);
-//     try {
-//     while (rs.next()) {
-//     int type_id = rs.getInt("type_id");
-//     int _index = rs.getInt("_index");
-//     String _name = rs.getString("_name");
-//    
-//     if (type_id == 1)
-//     this.FEATURE_DICT.put(_name, _index);
-//     else if (type_id == 2)
-//     this.LABEL_DICT.put(_name, _index);
-//     else if (type_id == 8)
-//     this.NEGATIVE_DICT.put(_name, _index);
-//    
-//     }
-//    
-//     System.out.println("feature size:"+this.FEATURE_DICT.size());
-//    
-//     } catch (SQLException e) {
-//     e.printStackTrace();
-//     throw new RuntimeException("加载特征和类别字典失败");
-//     } finally {
-//     connector.close();
-//     }
-//    
-//     }
-
     @Override
     public void readDict(String modelAndVersion) {
-        
+
         PropertiesUtil prop = new PropertiesUtil("/algorithm.properties");
         String model_version = prop.getProperty(modelAndVersion);
 
         String filePath = prop.getProperty("basicPath");  // 基本目录
         filePath = filePath.substring(0, filePath.indexOf("model_version_replacement"));
-        
+
         filePath = filePath + "dictionaries.bin";  // 字典文件位置
-        
+
         List<String> lines = TextFileReader.readLines(filePath);
 
         boolean firstLine = true;
-        
+
         String[] temp = null;
         for (String line : lines) {
             if (firstLine) {  // 去除第一行
                 firstLine = false;
                 continue;
             }
-            
+
             temp = line.split("\\|");
-            
-            if(temp[3].equals(model_version)){
+
+            if (temp[3].equals(model_version)) {
                 int type_id = Integer.parseInt(temp[2]);
                 int _index = Integer.parseInt(temp[1]);
                 String _name = temp[0];
@@ -168,4 +107,66 @@ public class NNDataSetImpl extends NNDataSet {
 
     }
 
+    /**
+     * Re-splits coarse terms: every key of {@code inputs} that appears in
+     * {@code RE_SPLIT_WORD_DICT} is removed and replaced by its "_"-separated
+     * sub-terms, each registered as a positive finding ("negative" -> "有").
+     * An existing entry with the same name as a sub-term is overwritten.
+     *
+     * @param inputs feature name -> attribute map; modified in place
+     */
+    public void reSplitWord(Map<String, Map<String, String>> inputs) {
+        // Sub-terms are buffered and merged after the loop: inserting new keys while
+        // the entry iterator is live is a structural modification, and fail-fast maps
+        // such as HashMap then throw ConcurrentModificationException on next().
+        Map<String, Map<String, String>> replacements = new HashMap<>();
+
+        Iterator<Entry<String, Map<String, String>>> entries = inputs.entrySet().iterator();
+        while (entries.hasNext()) {
+            String splitSpec = this.RE_SPLIT_WORD_DICT.get(entries.next().getKey());
+            if (splitSpec == null) {
+                continue;
+            }
+            entries.remove();  // drop the coarse term
+            for (String word : splitSpec.split("_")) {
+                Map<String, String> featureValues = new HashMap<>();
+                featureValues.put("negative", "有"); // mark as a positive term
+                replacements.put(word, featureValues);
+            }
+        }
+
+        inputs.putAll(replacements);
+    }
+
+    /**
+     * Loads {@code RE_SPLIT_WORD_DICT} from "dictionaries.bin" under the base path:
+     * header line skipped, then column 0 -> column 1 of each "|"-separated line.
+     * NOTE(review): readDict parses this same file with column 1 as an integer
+     * index, not a "_"-joined term list - confirm the intended file/column.
+     */
+    @Override
+    public void readReSplitWordDict() {
+        PropertiesUtil prop = new PropertiesUtil("/algorithm.properties");
+        String filePath = prop.getProperty("basicPath");  // base directory
+        filePath = filePath.substring(0, filePath.indexOf("model_version_replacement"));
+
+        filePath = filePath + "dictionaries.bin";  // dictionary file location
+
+        List<String> lines = TextFileReader.readLines(filePath);
+
+        boolean firstLine = true;
+        for (String line : lines) {
+            if (firstLine) {  // skip the header line
+                firstLine = false;
+                continue;
+            }
+            String[] temp = line.split("\\|");
+            if (temp.length >= 2) {  // ignore blank or malformed lines
+                this.RE_SPLIT_WORD_DICT.put(temp[0], temp[1]);
+            }
+        }
+        System.out.println("再分词,词条数:" + this.RE_SPLIT_WORD_DICT.size());
+    }
+
+
 }