|
@@ -0,0 +1,282 @@
|
|
|
|
+package com.lantone.qc.kernel.structure.ai.model;
|
|
|
|
+
|
|
|
|
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
|
|
|
|
+
|
|
|
|
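/**
 * @Description: Builds an inverted index that maps each single-character string of an
 *               entity block's search word to the set of entity blocks containing it.
 *               Entity blocks are formed from filtered entities and relations.
 * @Author: HUJING
 * @Date: 2020/8/2 12:25
 */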
|
|
|
|
+public class InvertedIndexTableBuilder {
|
|
|
|
+ private static Set<String> selectedLonelyEntityTypes; // 选定的孤独实体类型
|
|
|
|
+ private static Set<String> selectedRelationTypes; // 选定的关系类型
|
|
|
|
+ // 选定的多步关系模式,比如:身体部位->临床表现<-否定
|
|
|
|
+ private static Set<Set<String>> selectedMultiStepRelationTypes;
|
|
|
|
+
|
|
|
|
+ private Map<Integer, Lemma> idEntityMap;
|
|
|
|
+ private List<Relation> relations;
|
|
|
|
+ private List<Lemma> lonelyEntityList;
|
|
|
|
+
|
|
|
|
+ private static Set<String> removeIndexWords; // 被移除的搜索词
|
|
|
|
+
|
|
|
|
+ static { // 把该模块从现病史和专科检查扩展到别的比较中需要修改的地方
|
|
|
|
+ selectedLonelyEntityTypes = Sets.newHashSet("临床表现", "修饰");
|
|
|
|
+ selectedRelationTypes = Sets.newHashSet("身体部位-临床表现",
|
|
|
|
+ "一般情况-一般情况描述", "否定-临床表现");
|
|
|
|
+ selectedMultiStepRelationTypes = Sets.newHashSet(Sets.newHashSet()); // 暂且没用上
|
|
|
|
+
|
|
|
|
+ removeIndexWords = Sets.newHashSet();
|
|
|
|
+
|
|
|
|
+ for (int i = 0; i < 10; i++) {
|
|
|
|
+ removeIndexWords.add("" + i);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ for (int i = 0; i < 10; i++) { //数字
|
|
|
|
+ removeIndexWords.add(String.valueOf(i));
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ for (int i = 'a'; i <= 'z'; i++) { //英文词
|
|
|
|
+ removeIndexWords.add(String.valueOf((char) i));
|
|
|
|
+ removeIndexWords.add(String.valueOf((char) i).toUpperCase());
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ List<String> punctuations = Lists.newArrayList();//添加标点符号
|
|
|
|
+ List<String> positionWords = Lists.newArrayList();//添加方位词
|
|
|
|
+ List<String> emrWords = Lists.newArrayList("无");//添加一些病例中无用的词
|
|
|
|
+
|
|
|
|
+ List<String> words = new ArrayList<>();
|
|
|
|
+ words.addAll(emrWords);
|
|
|
|
+ words.addAll(punctuations);
|
|
|
|
+ words.addAll(positionWords);
|
|
|
|
+
|
|
|
|
+ removeIndexWords.addAll(words);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ public InvertedIndexTableBuilder(List<Lemma> lemmas, List<Relation> relations) {
|
|
|
|
+ idEntityMap = Maps.newHashMap();
|
|
|
|
+ for (Lemma lemma : lemmas) {
|
|
|
|
+ idEntityMap.put(lemma.getId(), lemma);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ this.relations = relations;
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 过滤实体和关系
|
|
|
|
+ */
|
|
|
|
+ private void filterEntitiesAndRelations() {
|
|
|
|
+
|
|
|
|
+ // 实体删除规则:没在关系中,且其实体类型没在已选的孤独实体类型中
|
|
|
|
+ Map<Integer, Lemma> newIdEntityMap = Maps.newHashMap();
|
|
|
|
+ idEntityMap.forEach((id, lemma) -> {
|
|
|
|
+ if (selectedLonelyEntityTypes.contains(lemma.getProperty())) {
|
|
|
|
+ newIdEntityMap.put(id, lemma);
|
|
|
|
+ }
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ List<Relation> filteredRelations = Lists.newArrayList();
|
|
|
|
+ Set<Integer> entityIdInRelationSet = Sets.newHashSet();
|
|
|
|
+ for (Relation relation : relations) {
|
|
|
|
+ Integer fromId = relation.getFrom();
|
|
|
|
+ Integer toId = relation.getTo();
|
|
|
|
+ if (selectedRelationTypes.contains(relation.getRelationName())) {
|
|
|
|
+ filteredRelations.add(relation);
|
|
|
|
+ newIdEntityMap.put(fromId, idEntityMap.get(fromId));
|
|
|
|
+ newIdEntityMap.put(toId, idEntityMap.get(toId));
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ entityIdInRelationSet.add(fromId);
|
|
|
|
+ entityIdInRelationSet.add(toId);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ this.idEntityMap = newIdEntityMap;
|
|
|
|
+ this.relations = filteredRelations;
|
|
|
|
+ this.lonelyEntityList = Lists.newArrayList();
|
|
|
|
+ this.idEntityMap.forEach((id, lemma) -> {
|
|
|
|
+ if (!entityIdInRelationSet.contains(id)) { // 不在关系中的实体为孤独实体
|
|
|
|
+ this.lonelyEntityList.add(lemma);
|
|
|
|
+ }
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 分组有关系的实体id对
|
|
|
|
+ *
|
|
|
|
+ * @param idPairs 实体id对
|
|
|
|
+ * @return 实体id对组
|
|
|
|
+ */
|
|
|
|
+ public static List<List<Integer>> groupIdPairs(List<Integer[]> idPairs) {
|
|
|
|
+
|
|
|
|
+ Map<Integer, Set<Integer>> idRelatedIdsMap = Maps.newHashMap();
|
|
|
|
+ for (Integer[] idPair : idPairs) {
|
|
|
|
+ if (idPair.length == 2) {
|
|
|
|
+ Integer fromId = idPair[0];
|
|
|
|
+ Integer toId = idPair[1];
|
|
|
|
+ Set<Integer> fromIds = idRelatedIdsMap.getOrDefault(fromId, Sets.newHashSet());
|
|
|
|
+ Set<Integer> toIds = idRelatedIdsMap.getOrDefault(toId, Sets.newHashSet());
|
|
|
|
+ fromIds.addAll(toIds);
|
|
|
|
+ fromIds.add(fromId);
|
|
|
|
+ fromIds.add(toId);
|
|
|
|
+ fromIds.forEach(id -> { // 更新每个id所关联的ids
|
|
|
|
+ idRelatedIdsMap.put(id, fromIds);
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ List<List<Integer>> idGroups = Lists.newArrayList();
|
|
|
|
+ Set<Integer> usedIds = Sets.newHashSet();
|
|
|
|
+
|
|
|
|
+ idRelatedIdsMap.forEach((id, ids) -> {
|
|
|
|
+ if (!usedIds.contains(id)) {
|
|
|
|
+ List<Integer> idsList = Lists.newArrayList();
|
|
|
|
+ idsList.addAll(ids);
|
|
|
|
+ idsList.sort(Integer::compareTo);
|
|
|
|
+ idGroups.add(idsList);
|
|
|
|
+
|
|
|
|
+ usedIds.addAll(ids); // 避免重复
|
|
|
|
+ }
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ return idGroups;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 用关系合并实体
|
|
|
|
+ *
|
|
|
|
+ * @return 实体组列表
|
|
|
|
+ */
|
|
|
|
+ private List<List<Lemma>> mergeEntitiesByRelations() {
|
|
|
|
+
|
|
|
|
+ List<List<Lemma>> entityGroups = Lists.newArrayList();
|
|
|
|
+
|
|
|
|
+ // 孤独实体
|
|
|
|
+ for (Lemma lemma : lonelyEntityList) {
|
|
|
|
+ entityGroups.add(Lists.newArrayList(lemma));
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ List<Integer[]> idPairs = Lists.newArrayList();
|
|
|
|
+ relations.forEach(relation -> {
|
|
|
|
+ idPairs.add(new Integer[] { relation.getFrom(), relation.getTo() });
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ List<List<Integer>> idGroups = groupIdPairs(idPairs);
|
|
|
|
+ idGroups.forEach(idGroup -> {
|
|
|
|
+ if (idGroup.size() == 2) {
|
|
|
|
+ List<Lemma> entityGroup = Lists.newArrayList();
|
|
|
|
+ idGroup.forEach(id -> {
|
|
|
|
+ entityGroup.add(idEntityMap.get(id));
|
|
|
|
+ });
|
|
|
|
+ entityGroups.add(entityGroup);
|
|
|
|
+ } else if (idGroup.size() == 3) {
|
|
|
|
+ // 将来会用一些模式去匹配,比如:(腹,压痛,反跳痛) 本质是=> 腹->压痛, 腹->反跳痛, 需要拆开成两组
|
|
|
|
+
|
|
|
|
+ List<Lemma> entityGroup = Lists.newArrayList();
|
|
|
|
+ idGroup.forEach(id -> {
|
|
|
|
+ entityGroup.add(idEntityMap.get(id));
|
|
|
|
+ });
|
|
|
|
+ entityGroups.add(entityGroup);
|
|
|
|
+ } else {
|
|
|
|
+ // 将来会用一些模式去配,这里先留下
|
|
|
|
+ }
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ return entityGroups;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 设置一个实体块对象
|
|
|
|
+ *
|
|
|
|
+ * @param entityGroup 实体组
|
|
|
|
+ * @param id 实体块id
|
|
|
|
+ * @return 实体块对象
|
|
|
|
+ */
|
|
|
|
+ private EntityBlock setEntityBlock(List<Lemma> entityGroup, Integer id) {
|
|
|
|
+
|
|
|
|
+ List<String> entityWords = Lists.newArrayList();
|
|
|
|
+ List<String> entityTypes = Lists.newArrayList();
|
|
|
|
+ List<int[]> positions = Lists.newArrayList();
|
|
|
|
+ entityGroup.forEach(lemma -> {
|
|
|
|
+ entityWords.add(lemma.getText());
|
|
|
|
+ entityTypes.add(lemma.getProperty());
|
|
|
|
+ positions.add(new int[] { lemma.getFrom(), lemma.getTo() });
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ StringBuilder searchWord = new StringBuilder();
|
|
|
|
+ for (String word : entityWords) {
|
|
|
|
+ searchWord.append(word);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ EntityBlock entityBlock = new EntityBlock();
|
|
|
|
+ entityBlock.setEntityWords(entityWords);
|
|
|
|
+ entityBlock.setEntityTypes(entityTypes);
|
|
|
|
+ entityBlock.setPositions(positions);
|
|
|
|
+ entityBlock.setSearchWord(searchWord.toString());
|
|
|
|
+ entityBlock.setId(id);
|
|
|
|
+
|
|
|
|
+ return entityBlock;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 设置所有实体块对象
|
|
|
|
+ *
|
|
|
|
+ * @param entityGroups 实体组列表
|
|
|
|
+ * @return 实体块对象列表
|
|
|
|
+ */
|
|
|
|
+ private List<EntityBlock> setEntityBlocks(List<List<Lemma>> entityGroups) {
|
|
|
|
+
|
|
|
|
+ List<EntityBlock> entityBlocks = Lists.newArrayList();
|
|
|
|
+ for (int i = 0; i < entityGroups.size(); i++) {
|
|
|
|
+ entityBlocks.add(setEntityBlock(entityGroups.get(i), i));
|
|
|
|
+ }
|
|
|
|
+ return entityBlocks;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 生成倒排索引表
|
|
|
|
+ *
|
|
|
|
+ * @param entityBlocks 实体块对象列表
|
|
|
|
+ * @return {字符:实体块对象集合},倒排索引表
|
|
|
|
+ */
|
|
|
|
+ private Map<String, Set<EntityBlock>> generateInvertedIndexTable(List<EntityBlock> entityBlocks) {
|
|
|
|
+
|
|
|
|
+ Map<String, Map<Integer, EntityBlock>> invertedIndexTableTemp = Maps.newHashMap(); // 防止重复
|
|
|
|
+ entityBlocks.forEach(entityBlock -> {
|
|
|
|
+ String searchWord = entityBlock.getSearchWord();
|
|
|
|
+ for (int i = 0; i < searchWord.length(); i++) {
|
|
|
|
+ String char_ = searchWord.substring(i, i + 1);
|
|
|
|
+ Map<Integer, EntityBlock> idEntityBlockMap = invertedIndexTableTemp.getOrDefault(char_, Maps.newHashMap());
|
|
|
|
+ idEntityBlockMap.put(entityBlock.getId(), entityBlock);
|
|
|
|
+ invertedIndexTableTemp.put(char_, idEntityBlockMap);
|
|
|
|
+ }
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ Map<String, Set<EntityBlock>> invertedIndexTable = Maps.newHashMap();
|
|
|
|
+ invertedIndexTableTemp.forEach((char_, idEntityBlockMap) -> {
|
|
|
|
+ invertedIndexTable.put(char_, Sets.newHashSet(idEntityBlockMap.values()));
|
|
|
|
+ });
|
|
|
|
+
|
|
|
|
+ return invertedIndexTable;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 倒排索引生成流水线
|
|
|
|
+ * @return 倒排索引表
|
|
|
|
+ */
|
|
|
|
+ public Map<String, Set<EntityBlock>> generateInvertedIndexTablePipeline(){
|
|
|
|
+ filterEntitiesAndRelations();
|
|
|
|
+ List<List<Lemma>> entityGroups = mergeEntitiesByRelations();
|
|
|
|
+ List<EntityBlock> entityBlocks = setEntityBlocks(entityGroups);
|
|
|
|
+ return generateInvertedIndexTable(entityBlocks);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+}
|