|
@@ -37,11 +37,12 @@ public class ParticipleToken {
|
|
|
private Reader reader;
|
|
|
//词库二叉树
|
|
|
private Segment segment;
|
|
|
-
|
|
|
//词频次
|
|
|
private float threshold;
|
|
|
//词性
|
|
|
private String property;
|
|
|
+ //标准词
|
|
|
+ private String concept;
|
|
|
//缺省词性
|
|
|
private String DEFAULT_PROPERTY = "99";
|
|
|
|
|
@@ -87,7 +88,7 @@ public class ParticipleToken {
|
|
|
if (cursor - offset <= 1) {
|
|
|
this.matchARABIC();
|
|
|
} else {
|
|
|
- this.addLexeme(offset, cursor - offset, this.property);
|
|
|
+ this.addLexeme(offset, cursor - offset, this.property, this.concept);
|
|
|
this.offset = this.cursor;
|
|
|
}
|
|
|
} else if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_CHINESE_QUANTIFIER) {
|
|
@@ -97,7 +98,7 @@ public class ParticipleToken {
|
|
|
if (cursor - offset <= 1) {
|
|
|
this.matchQUANTIFIER();
|
|
|
} else {
|
|
|
- this.addLexeme(offset, cursor - offset, this.property);
|
|
|
+ this.addLexeme(offset, cursor - offset, this.property, this.concept);
|
|
|
this.offset = this.cursor;
|
|
|
}
|
|
|
} else if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_ENGLISH) {
|
|
@@ -108,11 +109,11 @@ public class ParticipleToken {
|
|
|
if (this.property == null || ARABIC_PROPERTY.equals(this.property)) {
|
|
|
this.matchENGLISH();
|
|
|
} else {
|
|
|
- this.addLexeme(offset, cursor - offset, this.property);
|
|
|
+ this.addLexeme(offset, cursor - offset, this.property, this.concept);
|
|
|
this.offset = this.cursor;
|
|
|
}
|
|
|
} else {
|
|
|
- this.addLexeme(offset, cursor - offset, this.property);
|
|
|
+ this.addLexeme(offset, cursor - offset, this.property, this.concept);
|
|
|
this.offset = this.cursor;
|
|
|
}
|
|
|
}
|
|
@@ -129,7 +130,7 @@ public class ParticipleToken {
|
|
|
break;
|
|
|
}
|
|
|
}
|
|
|
- this.addLexeme(this.offset, this.cursor - this.offset, "33");
|
|
|
+ this.addLexeme(this.offset, this.cursor - this.offset, "33", "");
|
|
|
this.offset = this.cursor;
|
|
|
}
|
|
|
|
|
@@ -155,7 +156,7 @@ public class ParticipleToken {
|
|
|
break;
|
|
|
}
|
|
|
}
|
|
|
- this.addLexeme(this.offset, this.cursor - this.offset, this.ARABIC_PROPERTY);
|
|
|
+ this.addLexeme(this.offset, this.cursor - this.offset, this.ARABIC_PROPERTY, "");
|
|
|
this.offset = this.cursor;
|
|
|
}
|
|
|
|
|
@@ -173,7 +174,7 @@ public class ParticipleToken {
|
|
|
break;
|
|
|
}
|
|
|
}
|
|
|
- this.addLexeme(this.offset, this.cursor - this.offset, this.ARABIC_PROPERTY);
|
|
|
+ this.addLexeme(this.offset, this.cursor - this.offset, this.ARABIC_PROPERTY, "");
|
|
|
this.offset = this.cursor;
|
|
|
}
|
|
|
|
|
@@ -223,6 +224,7 @@ public class ParticipleToken {
|
|
|
//保存词频数据,便于最大长度相同时 按词频排序
|
|
|
this.threshold = s.getThreshold();
|
|
|
this.property = s.getProperty();
|
|
|
+ this.concept = s.getConcept();
|
|
|
}
|
|
|
|
|
|
/**
|
|
@@ -242,6 +244,7 @@ public class ParticipleToken {
|
|
|
int max_begin = o_offset; //最大长度词默认起始位置为前一个词开始位置
|
|
|
//词性会被递归覆盖,需要先保存
|
|
|
String property = this.property;
|
|
|
+ String concept = this.concept;
|
|
|
while (o_begin < o_cursor - 1) {
|
|
|
o_begin++;
|
|
|
this.matchCHN(segment, o_begin, 0, false); //执行后this.cursor会变化
|
|
@@ -249,17 +252,20 @@ public class ParticipleToken {
|
|
|
max_length = this.cursor - o_begin;
|
|
|
max_begin = o_begin;
|
|
|
property = this.property;
|
|
|
+ concept = this.concept;
|
|
|
} else if (this.cursor - o_begin == max_length) { //词长度相同 有词性的保留
|
|
|
if (this.DEFAULT_PROPERTY.equals(property)
|
|
|
&& this.property != null && !this.DEFAULT_PROPERTY.equals(this.property)) {
|
|
|
// if (this.threshold > o_threshold) {
|
|
|
max_begin = o_begin;
|
|
|
property = this.property;
|
|
|
+ concept = this.concept;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
//还原词性
|
|
|
this.property = property;
|
|
|
+ this.concept = concept;
|
|
|
if (max_length >= o_length) { //如果发现有新词比最初切出来的词更长,那么先对新词前面字符进行切词,this.cursor应该被还原为最初的值,并且只能最多切到新词起始位置
|
|
|
if (max_begin != o_offset) {
|
|
|
this.cursor = this.offset;
|
|
@@ -267,7 +273,7 @@ public class ParticipleToken {
|
|
|
this.next();
|
|
|
} else {
|
|
|
this.cursor = o_cursor;
|
|
|
- this.addLexeme(offset, this.cursor - offset, this.property);
|
|
|
+ this.addLexeme(offset, this.cursor - offset, this.property, this.concept);
|
|
|
this.offset = this.cursor;
|
|
|
this.stop_position = this.text_size;
|
|
|
}
|
|
@@ -275,16 +281,17 @@ public class ParticipleToken {
|
|
|
}
|
|
|
|
|
|
private void addLexeme(int begin, int length) throws IOException {
|
|
|
- this.addLexeme(begin, length, this.DEFAULT_PROPERTY);
|
|
|
+ this.addLexeme(begin, length, this.DEFAULT_PROPERTY, "");
|
|
|
}
|
|
|
|
|
|
- private void addLexeme(int begin, int length, String property) throws IOException {
|
|
|
+ private void addLexeme(int begin, int length, String property, String concept) throws IOException {
|
|
|
if(length > 0){
|
|
|
Lexeme lexeme = new Lexeme(begin, length);
|
|
|
char[] chars = new char[lexeme.getLength()];
|
|
|
System.arraycopy(buffer, lexeme.getOffset(), chars, 0, lexeme.getLength());
|
|
|
lexeme.setText(String.valueOf(chars));
|
|
|
lexeme.setProperty(property==null?this.DEFAULT_PROPERTY:property);
|
|
|
+ lexeme.setConcept(concept);
|
|
|
lexemePath.add(lexeme);
|
|
|
//判断是否需要加载文本内容
|
|
|
this.fillBuffer();
|