|
@@ -0,0 +1,358 @@
|
|
|
+package com.lantone.qc.nlp.participle;
|
|
|
+
|
|
|
+import com.lantone.qc.nlp.participle.word.Lexeme;
|
|
|
+import com.lantone.qc.nlp.participle.word.LexemePath;
|
|
|
+import com.lantone.qc.nlp.participle.word.Segment;
|
|
|
+import com.lantone.qc.nlp.util.CharacterUtil;
|
|
|
+import com.lantone.qc.nlp.util.Constants;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.io.Reader;
|
|
|
+import java.util.Arrays;
|
|
|
+
|
|
|
/**
 * @Author: fyeman
 * @Date: 2018.06.08 15:24
 * @Description: Word segmentation (tokenizer) using maximum-length (longest-match) matching.
 */
public class ParticleToken0 {
    // Maximum number of chars read into the buffer per fill.
    private int BUFFER_SIZE = 4096;
    // Remaining-char threshold below which the buffer is considered nearly exhausted
    // and a refill is triggered (see isLoadReader()).
    private int BUFFER_CRITICAL_VALUE = 128;
    // Working buffer holding the text currently being segmented.
    private char[] buffer;
    // Start position (index into buffer) of the token currently being built.
    private int offset = 0;
    // Scan cursor (index into buffer).
    private int cursor = 0;
    // Number of valid (processable) chars currently in the buffer.
    private int available = 0;
    // Total length of text read so far. (Original comment "读取文本总长豆" had a
    // typo for 总长度, "total length".)
    private int text_size = 0;
    // Position at which segmentation must stop; normally the end of the text,
    // temporarily lowered by validate() to re-cut in front of a longer word.
    private int stop_position = 0;
    // Characters that never break a token, even when the dictionary has no matching
    // word. Sorted in the constructor so Arrays.binarySearch can be used on it.
    private char[] join_symbols = new char[]{'~', '-', '/', '.', '*', '^', '+'};
    // Source of the text to be segmented.
    private Reader reader;
    // Root of the dictionary trie ("词库二叉树" in the original comment).
    private Segment segment;
    // Frequency of the most recently matched word; kept so equal-length matches
    // could be ranked by frequency (comparison is currently commented out in validate()).
    private float threshold;
    // Part of speech of the most recently matched word.
    private String property;
    // Canonical (standard/concept) form of the most recently matched word.
    private String concept;
    // Default part of speech used when the dictionary supplies none.
    private String DEFAULT_PROPERTY = Constants.word_property_other;
    // Part of speech assigned to arabic-numeral / joined-symbol / english tokens.
    private String ARABIC_PROPERTY = Constants.word_property_number;
    // Accumulated output tokens.
    private LexemePath<Lexeme> lexemePath;

    public ParticipleToken() {
        // join_symbols must be sorted, otherwise Arrays.binarySearch cannot be used.
        Arrays.sort(join_symbols);
    }

    /**
     * Resets internal state, binds the input reader and dictionary trie, then runs
     * segmentation over the whole input.
     *
     * @param reader  text source to segment
     * @param segment dictionary trie root
     * @throws IOException if reading from the reader fails
     */
    public void start(Reader reader, Segment segment) throws IOException {
        this.reader = reader;
        this.segment = segment;
        this.buffer = null;
        this.offset = 0;
        this.cursor = 0;
        this.lexemePath = new LexemePath<Lexeme>();

        this.fillBuffer();
        this.participle();
    }

    /**
     * Main segmentation loop: emits tokens until the buffer is exhausted.
     * The cached part of speech is cleared between tokens.
     */
    public void participle() throws IOException {
        while (hasNext()) {
            this.next();
            this.property = null;
        }
    }

    /**
     * Segments the next token starting at the current cursor, dispatching on the
     * character class of buffer[cursor] (as reported by CharacterUtil).
     */
    private void next() throws IOException {
        if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_CHINESE
                || CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_USELESS
                || CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_OTHER_HALF) {
            this.matchCHN(segment, cursor, 0, false);
            // Re-check by maximum length whether the cut just made is optimal.
            this.validate();
        } else if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_ARABIC) {
            // First decide via the dictionary whether this starts a known word.
            this.matchCHN(segment, cursor, 0, false);
            // If no multi-char dictionary word was found, join consecutive digits/symbols.
            if (cursor - offset <= 1) {
                this.matchARABIC();
            } else {
                this.addLexeme(offset, cursor - offset, this.property, this.concept);
                this.offset = this.cursor;
            }
        } else if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_CHINESE_QUANTIFIER) {
            // First decide via the dictionary whether this starts a known word.
            this.matchCHN(segment, cursor, 0, false);
            // If no multi-char dictionary word was found, join consecutive quantifier chars.
            if (cursor - offset <= 1) {
                this.matchQUANTIFIER();
            } else {
                this.addLexeme(offset, cursor - offset, this.property, this.concept);
                this.offset = this.cursor;
            }
        } else if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_ENGLISH) {
            // First decide via the dictionary whether this starts a known word.
            this.matchCHN(segment, cursor, 0, false);
            // If no multi-char dictionary word was found, join consecutive english chars.
            if (cursor - offset <= 1) {
                if (this.property == null || ARABIC_PROPERTY.equals(this.property)) {
                    this.matchENGLISH();
                } else {
                    this.addLexeme(offset, cursor - offset, this.property, this.concept);
                    this.offset = this.cursor;
                }
            } else {
                this.addLexeme(offset, cursor - offset, this.property, this.concept);
                this.offset = this.cursor;
            }
        }
    }

    /**
     * Joins a run of Chinese quantifier characters (and join_symbols) into one token.
     * The emitted token uses the hard-coded property "33" — presumably a quantifier
     * part-of-speech code; NOTE(review): meaning not visible here, confirm against
     * the property-code table.
     */
    private void matchQUANTIFIER() throws IOException {
        int position;
        while (cursor < available) {
            position = Arrays.binarySearch(join_symbols, buffer[cursor]);
            if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_CHINESE_QUANTIFIER // Chinese quantifier
                    || position > -1) {
                cursor++;
            } else {
                break;
            }
        }
        this.addLexeme(this.offset, this.cursor - this.offset, "33", "");
        this.offset = this.cursor;
    }

    /**
     * Joins a run of arabic digits (and join_symbols such as '-', '/', '.') into one
     * numeric token. While extending the run it keeps probing the dictionary; if a
     * multi-char dictionary word begins inside the run, the run is cut short just
     * before it (cursor is rolled back to cur_offset).
     */
    private void matchARABIC() throws IOException {
        int position;
        int cur_offset = this.offset;
        while (cursor < available) {
            position = Arrays.binarySearch(join_symbols, buffer[cursor]);
            if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_ARABIC // a digit followed by '-', '/', '.' is still treated as part of the number
//                    || CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_ENGLISH
                    || position > -1) {
//                cursor++;
                cur_offset++;
                // Check via the dictionary whether a known word starts here.
                this.matchCHN(segment, cursor, 0, false);
                if (cursor - cur_offset > 1) {
                    this.cursor = cur_offset;
                    break;
                }
            } else if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_CHINESE) { // digit followed by a Chinese unit
                break;
            } else {
                break;
            }
        }
        this.addLexeme(this.offset, this.cursor - this.offset, this.ARABIC_PROPERTY, "");
        this.offset = this.cursor;
    }

    /**
     * Joins a run of english letters, digits, and join_symbols into one token.
     * NOTE(review): the emitted token reuses ARABIC_PROPERTY (number) rather than a
     * dedicated english property — confirm this is intentional.
     */
    private void matchENGLISH() throws IOException {
        int position;
        while (cursor < available) {
            position = Arrays.binarySearch(join_symbols, buffer[cursor]);
            if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_ENGLISH // join english chars
                    || position > -1
                    || CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_ARABIC) {
                cursor++;
            } else if (CharacterUtil.identifyCharType(buffer[cursor]) == CharacterUtil.CHAR_CHINESE) { // english followed by a Chinese unit
                break;
            } else {
                break;
            }
        }
        this.addLexeme(this.offset, this.cursor - this.offset, this.ARABIC_PROPERTY, "");
        this.offset = this.cursor;
    }

    /**
     * Recursively walks the dictionary trie from position {@code begin}, extending the
     * match one char at a time; when the walk can go no further (or stop_position is
     * reached) it delegates to cutLexeme to finalize the cut.
     *
     * @param s      current trie node
     * @param begin  buffer index of the next char to match
     * @param length number of chars matched so far on this path
     * @param status recursion flag: some leading chars may not exist in the dictionary
     *               at all; in that case the single char itself is treated as a word
     *               (offset+1). Set to true before entering the recursive call.
     */
    private void matchCHN(Segment s, int begin, int length, boolean status) {
        // Hard stop for segmentation (used by validate() to cut in front of a longer word).
        if (begin >= stop_position) {
            this.cutLexeme(s, begin, length, status);
            return;
        }
        Character character = Character.valueOf(buffer[begin]);
        Segment character_segment = s.getSegmentMap().get(character);
        if (character_segment != null) {
            status = true;
            matchCHN(character_segment, begin + 1, length + 1, status);
        } else {
            this.cutLexeme(s, begin, length, status);
        }
    }

    /**
     * Finalizes a trie walk: backtracks to the deepest ancestor node that is a complete
     * word (not every trie path node is a word), positions the cursor after the cut,
     * and caches the matched word's frequency / part of speech / concept.
     *
     * @param s      trie node where the walk stopped
     * @param begin  buffer index one past the last matched char
     * @param length number of chars matched
     * @param status false when the starting char was absent from the dictionary
     */
    private void cutLexeme(Segment s, int begin, int length, boolean status) {
        if (s.isLexeme() == false && length > 1) { // not a word: backtrack, since trie path nodes are not necessarily words
            do {
                s = s.parent();
                begin--;
                length--;
            } while (s != null && s.isLexeme() == false && length > 1);
        }
        if (!status) { // char not present in the dictionary at all (and not a unit segment): consume it alone
            this.cursor++;
        } else {
            this.cursor = begin;
        }
        // Cache word-frequency data so equal-length candidates can be ranked by frequency.
        this.threshold = s.getThreshold();
        this.property = s.getProperty();
        this.concept = s.getConcept();
    }

    /**
     * Segmentation rules: 1. maximum word-length match wins;
     *                     2. for equal lengths, the candidate with a (non-default)
     *                        part of speech is preferred.
     * After each cut, every later starting position inside the cut is re-probed to
     * find a possibly longer word overlapping it.
     */
    private void validate() throws IOException {
        int o_offset = this.offset;
        int o_cursor = this.cursor; // tentative cursor position of the word just cut
        float o_threshold = this.threshold;

        int o_begin = o_offset;
        int o_length = o_cursor - o_offset;
        // Length of the current best (the word just cut).
        int max_length = o_length;
        int max_begin = o_offset; // by default the longest word starts where the previous word started
        // property/concept are clobbered by the recursive probes below, so save them first.
        String property = this.property;
        String concept = this.concept;
        while (o_begin < o_cursor - 1) {
            o_begin++;
            this.matchCHN(segment, o_begin, 0, false); // mutates this.cursor
            if (this.cursor - o_begin > max_length) {
                max_length = this.cursor - o_begin;
                max_begin = o_begin;
                property = this.property;
                concept = this.concept;
            } else if (this.cursor - o_begin == max_length) { // equal length: keep the one that has a real part of speech
                if (this.DEFAULT_PROPERTY.equals(property)
                        && this.property != null && !this.DEFAULT_PROPERTY.equals(this.property)) {
//                    if (this.threshold > o_threshold) {
                    max_begin = o_begin;
                    property = this.property;
                    concept = this.concept;
                }
            }
        }
        // Restore the saved part of speech / concept.
        this.property = property;
        this.concept = concept;
        if (max_length >= o_length) { // A longer overlapping word was found: first re-cut the chars in
                                      // front of it — restore this.cursor to the original token start and
                                      // only allow cutting up to the longer word's start position.
            if (max_begin != o_offset) {
                this.cursor = this.offset;
                this.stop_position = max_begin;
                this.next();
            } else {
                this.cursor = o_cursor;
                this.addLexeme(offset, this.cursor - offset, this.property, this.concept);
                this.offset = this.cursor;
                this.stop_position = this.text_size;
            }
        }
    }

    /**
     * Emits a token with the default part of speech and an empty concept.
     */
    private void addLexeme(int begin, int length) throws IOException {
        this.addLexeme(begin, length, this.DEFAULT_PROPERTY, "");
    }

    /**
     * Emits a token covering buffer[begin, begin+length), then refills the buffer
     * if it is nearly exhausted. Zero-length tokens are silently ignored.
     *
     * @param begin    token start index in the buffer
     * @param length   token length in chars
     * @param property part of speech; null falls back to DEFAULT_PROPERTY
     * @param concept  canonical form of the word (may be empty)
     */
    private void addLexeme(int begin, int length, String property, String concept) throws IOException {
        if(length > 0){
            Lexeme lexeme = new Lexeme(begin, length);
            char[] chars = new char[lexeme.getLength()];
            System.arraycopy(buffer, lexeme.getOffset(), chars, 0, lexeme.getLength());
            lexeme.setText(String.valueOf(chars));
            lexeme.setProperty(property==null?this.DEFAULT_PROPERTY:property);
            lexeme.setConcept(concept);
            lexemePath.add(lexeme);
            // Check whether more text needs to be loaded into the buffer.
            this.fillBuffer();
        }
    }

    /**
     * @return true while there are unprocessed chars left in the buffer
     */
    public boolean hasNext() {
        return this.available > 0
                && this.cursor < this.available;
    }

    /**
     * Decides whether a new chunk should be read from the reader: the buffer was
     * filled completely, the cursor has not reached its end, and fewer than
     * BUFFER_CRITICAL_VALUE chars remain unprocessed.
     *
     * @return true when the buffer should be refilled
     */
    private boolean isLoadReader() {
        return this.available == BUFFER_SIZE
                && this.cursor < this.available - 1
                && this.cursor > this.available - BUFFER_CRITICAL_VALUE;
    }

    /**
     * Loads text into the buffer. On first call allocates the buffer and fills it;
     * afterwards, when the buffer is nearly exhausted, shifts the unprocessed tail
     * to the front and reads more chars behind it, resetting cursor/offset to 0.
     */
    private void fillBuffer() throws IOException {
        if (buffer == null) {
            buffer = new char[BUFFER_SIZE];
            available = reader.read(buffer);
            this.text_size = available;
        } else {
            if (isLoadReader()) {
                int unreader_length = this.available - this.cursor;
                // Copy the not-yet-segmented tail to the front of the buffer.
                System.arraycopy(this.buffer, this.cursor, this.buffer, 0, unreader_length);
                // New buffer length = newly read length + unprocessed tail length.
                // NOTE(review): if the reader is already at EOF, read() returns -1,
                // shrinking available by one — confirm EOF handling upstream.
                available = reader.read(this.buffer, unreader_length, BUFFER_SIZE - unreader_length) + unreader_length;
                // Total length of text read so far.
                this.text_size = this.cursor + this.available;
                // Reset positions relative to the shifted buffer.
                this.cursor = 0;
                this.offset = 0;
            }
        }
        // By default segmentation may run to the end of the text.
        this.stop_position = this.text_size;
    }

    /**
     * Closes the underlying reader, swallowing (but printing) any IOException.
     */
    public void end() {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
        }
    }

    public LexemePath<Lexeme> getLexemePath() {
        return lexemePath;
    }

    public void setLexemePath(LexemePath<Lexeme> lexemePath) {
        this.lexemePath = lexemePath;
    }
}
|