|
@@ -4,10 +4,9 @@ import java.io.BufferedReader;
|
|
import java.io.FileNotFoundException;
|
|
import java.io.FileNotFoundException;
|
|
import java.io.FileReader;
|
|
import java.io.FileReader;
|
|
import java.io.IOException;
|
|
import java.io.IOException;
|
|
-import java.util.HashMap;
|
|
|
|
-import java.util.List;
|
|
|
|
-import java.util.Map;
|
|
|
|
-import java.util.Set;
|
|
|
|
|
|
+import java.util.*;
|
|
|
|
+
|
|
|
|
+import org.algorithm.core.cnn.model.LemmaInfo;
|
|
import org.algorithm.util.TextFileReader;
|
|
import org.algorithm.util.TextFileReader;
|
|
import com.alibaba.fastjson.JSON;
|
|
import com.alibaba.fastjson.JSON;
|
|
import com.alibaba.fastjson.JSONObject;
|
|
import com.alibaba.fastjson.JSONObject;
|
|
@@ -15,45 +14,46 @@ import com.alibaba.fastjson.JSONObject;
|
|
/**
|
|
/**
|
|
* @Author: bijl
|
|
* @Author: bijl
|
|
* @Date: 2019年1月21日-下午2:43:44
|
|
* @Date: 2019年1月21日-下午2:43:44
|
|
- * @Description:
|
|
|
|
|
|
+ * @Description:
|
|
*/
|
|
*/
|
|
public class RelationExtractionDataSet {
|
|
public class RelationExtractionDataSet {
|
|
-
|
|
|
|
|
|
+
|
|
private Map<String, Integer> char2id = new HashMap<>();
|
|
private Map<String, Integer> char2id = new HashMap<>();
|
|
- private int maxLength = 200;
|
|
|
|
-
|
|
|
|
-
|
|
|
|
|
|
+ private Map<Integer, Map<String, String>> entities_info = new HashMap<>();
|
|
|
|
+ public int maxLength = 200;
|
|
|
|
+
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* 切分句子
|
|
* 切分句子
|
|
|
|
+ *
|
|
* @param document 原有文档
|
|
* @param document 原有文档
|
|
* @return 句子数组
|
|
* @return 句子数组
|
|
*/
|
|
*/
|
|
public String[] splitSentence(String document) {
|
|
public String[] splitSentence(String document) {
|
|
String[] sentences = null;
|
|
String[] sentences = null;
|
|
- sentences = document.split("。|;|\n|\n\r");
|
|
|
|
|
|
+ sentences = document.split("。|;|\n|\n\r");
|
|
return sentences;
|
|
return sentences;
|
|
}
|
|
}
|
|
-
|
|
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* 加载字典
|
|
* 加载字典
|
|
|
|
+ *
|
|
* @param dir
|
|
* @param dir
|
|
*/
|
|
*/
|
|
public void loadDictionary(String dir) {
|
|
public void loadDictionary(String dir) {
|
|
- List<String> lines= TextFileReader.readLines(dir);
|
|
|
|
BufferedReader br = null;
|
|
BufferedReader br = null;
|
|
try {
|
|
try {
|
|
- br = new BufferedReader(new FileReader(dir));// 读取原始json文件
|
|
|
|
|
|
+ br = new BufferedReader(new FileReader(dir));// 读取原始json文件
|
|
String s = null;
|
|
String s = null;
|
|
while ((s = br.readLine()) != null) {
|
|
while ((s = br.readLine()) != null) {
|
|
JSONObject jsonObject = (JSONObject) JSON.parse(s);
|
|
JSONObject jsonObject = (JSONObject) JSON.parse(s);
|
|
Set<Map.Entry<String, Object>> entries = jsonObject.entrySet();
|
|
Set<Map.Entry<String, Object>> entries = jsonObject.entrySet();
|
|
- for (Map.Entry<String, Object> entry : entries) {
|
|
|
|
|
|
+ for (Map.Entry<String, Object> entry : entries)
|
|
this.char2id.put(entry.getKey(), (Integer) entry.getValue());
|
|
this.char2id.put(entry.getKey(), (Integer) entry.getValue());
|
|
- }
|
|
|
|
}
|
|
}
|
|
} catch (Exception e) {
|
|
} catch (Exception e) {
|
|
e.printStackTrace();
|
|
e.printStackTrace();
|
|
- }finally {
|
|
|
|
|
|
+ } finally {
|
|
try {
|
|
try {
|
|
br.close();
|
|
br.close();
|
|
} catch (IOException e) {
|
|
} catch (IOException e) {
|
|
@@ -61,51 +61,49 @@ public class RelationExtractionDataSet {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
-
|
|
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* 句子转字符ids
|
|
* 句子转字符ids
|
|
|
|
+ *
|
|
* @param sentence 句子
|
|
* @param sentence 句子
|
|
* @return ids
|
|
* @return ids
|
|
*/
|
|
*/
|
|
- public int[] sentence2ids(String sentence) {
|
|
|
|
- int[] ids = new int[this.maxLength];
|
|
|
|
|
|
+ public float[] sentence2ids(String sentence) {
|
|
|
|
+ float[] ids = new float[this.maxLength];
|
|
char ch = '1';
|
|
char ch = '1';
|
|
Integer id = null;
|
|
Integer id = null;
|
|
for (int i = 0; i < sentence.length(); i++) {
|
|
for (int i = 0; i < sentence.length(); i++) {
|
|
ch = sentence.charAt(i);
|
|
ch = sentence.charAt(i);
|
|
- id = this.char2id.get(ch);
|
|
|
|
|
|
+ id = this.char2id.get(String.valueOf(ch));
|
|
if (id == null) {
|
|
if (id == null) {
|
|
id = this.char2id.get("<UKC>");
|
|
id = this.char2id.get("<UKC>");
|
|
}
|
|
}
|
|
ids[i] = id.intValue();
|
|
ids[i] = id.intValue();
|
|
}
|
|
}
|
|
- for(int i=sentence.length(); i<this.maxLength; i++) // padding
|
|
|
|
|
|
+ for (int i = sentence.length(); i < this.maxLength; i++) // padding
|
|
ids[i] = this.char2id.get("<PAD>");
|
|
ids[i] = this.char2id.get("<PAD>");
|
|
-
|
|
|
|
|
|
+
|
|
return ids;
|
|
return ids;
|
|
}
|
|
}
|
|
-
|
|
|
|
|
|
+
|
|
/**
|
|
/**
|
|
- *
|
|
|
|
* @param sentence 句子
|
|
* @param sentence 句子
|
|
* @param position 一个实体的位置
|
|
* @param position 一个实体的位置
|
|
* @return 句子中各个汉子相对于实体的位置
|
|
* @return 句子中各个汉子相对于实体的位置
|
|
*/
|
|
*/
|
|
- public int[] getRelativePositions(String sentence, String position) {
|
|
|
|
- int[] relativePositions = new int[this.maxLength];
|
|
|
|
|
|
+ public float[] getRelativePositions(String sentence, String position) {
|
|
|
|
+ float[] relativePositions = new float[this.maxLength];
|
|
String[] positionPair = position.split(",");
|
|
String[] positionPair = position.split(",");
|
|
- int startPos = Integer.parseInt(positionPair[0]);
|
|
|
|
|
|
+ int startPos = Integer.parseInt(positionPair[0]);
|
|
int endtPos = Integer.parseInt(positionPair[1]);
|
|
int endtPos = Integer.parseInt(positionPair[1]);
|
|
-
|
|
|
|
- char ch = '1';
|
|
|
|
- Integer id = null;
|
|
|
|
|
|
+
|
|
for (int i = 0; i < sentence.length(); i++) {
|
|
for (int i = 0; i < sentence.length(); i++) {
|
|
if (i < startPos)
|
|
if (i < startPos)
|
|
relativePositions[i] = startPos - i;
|
|
relativePositions[i] = startPos - i;
|
|
else if (i >= startPos && i <= endtPos)
|
|
else if (i >= startPos && i <= endtPos)
|
|
relativePositions[i] = 0;
|
|
relativePositions[i] = 0;
|
|
else
|
|
else
|
|
- relativePositions[i] = endtPos - i;
|
|
|
|
|
|
+ relativePositions[i] = i - endtPos;
|
|
}
|
|
}
|
|
|
|
|
|
for (int i = sentence.length(); i < this.maxLength; i++)
|
|
for (int i = sentence.length(); i < this.maxLength; i++)
|
|
@@ -114,6 +112,56 @@ public class RelationExtractionDataSet {
|
|
return relativePositions;
|
|
return relativePositions;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
+ * 获取实体对的组合
|
|
|
|
+ *
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ public List<String> getPositionCombinations(String json_content) {
|
|
|
|
+ List<String> combinations = new ArrayList<>();
|
|
|
|
+
|
|
|
|
+ return combinations;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * @param sentence 输入句子
|
|
|
|
+ * @param json_content 句子content中的实体信息
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ public List<float[][]> get_examples(String sentence, String json_content) {
|
|
|
|
+ List<float[][]> examples = new ArrayList<>();
|
|
|
|
+ List<String> combinations = this.getPositionCombinations(json_content);
|
|
|
|
+ float[] charId = this.sentence2ids(sentence);
|
|
|
|
+ for (String combination : combinations) {
|
|
|
|
+ float[][] example = new float[3][this.maxLength];
|
|
|
|
+ example[0] = charId;
|
|
|
|
+ example[1] = this.getRelativePositions(sentence, combination);
|
|
|
|
+ example[1] = this.getRelativePositions(sentence, combination);
|
|
|
|
+ examples.add(example);
|
|
|
|
+ }
|
|
|
|
+ return examples;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * @param sentence 输入句子
|
|
|
|
+ * @param entity1 实体1信息
|
|
|
|
+ * @param entity2 实体2信息
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ public float[][] getExample(String sentence, LemmaInfo entity1, LemmaInfo entity2) {
|
|
|
|
+ float[][] example = new float[3][this.maxLength];
|
|
|
|
+ int startPos = entity1.getOffset().intValue();
|
|
|
|
+ int endPos = entity1.getOffset().intValue() + entity1.getLength().intValue() - 1;
|
|
|
|
+
|
|
|
|
+ example[0] = this.sentence2ids(sentence);
|
|
|
|
+ example[1] = this.getRelativePositions(sentence, startPos + "," + endPos);
|
|
|
|
+ startPos = entity2.getOffset().intValue();
|
|
|
|
+ endPos = entity2.getOffset().intValue() + entity2.getLength().intValue() - 1;
|
|
|
|
+ example[2] = this.getRelativePositions(sentence, startPos + "," + endPos);
|
|
|
|
+
|
|
|
|
+ return example;
|
|
|
|
+ }
|
|
|
|
|
|
|
|
|
|
}
|
|
}
|