Browse Source

修改分词BUG

louhr 6 years ago
parent
commit
4fbc34f81b

+ 6 - 1
nlp/src/main/java/org/diagbot/nlp/participle/ParticipleToken.java

@@ -105,7 +105,12 @@ public class ParticipleToken {
             this.matchCHN(segment, cursor, 0, false);
             //不成词所有英文字符拼接
             if (cursor - offset <= 1) {
-                this.matchENGLISH();
+                if (this.property == null || ARABIC_PROPERTY.equals(this.property)) {
+                    this.matchENGLISH();
+                } else {
+                    this.addLexeme(offset, cursor - offset, this.property);
+                    this.offset = this.cursor;
+                }
             } else {
                 this.addLexeme(offset, cursor - offset, this.property);
                 this.offset = this.cursor;

+ 8 - 0
nlp/src/main/java/org/diagbot/nlp/participle/ParticipleUtil.java

@@ -142,6 +142,14 @@ public class ParticipleUtil {
         if (index > 0) {
             index--;
             last_l = lexemes.get(index);
+            if ("×".equals(last_l.getText()) && index > 0) {
+                lexeme.setOffset(last_l.getOffset());
+                lexeme.setLength(last_l.getLength() + lexeme.getLength());
+                lexeme.setText(last_l.getText() + lexeme.getText());
+                results.remove(results.size() - 1);
+                index--;
+                last_l = lexemes.get(index);
+            }
             if (NlpUtil.isNumberString(last_l)) {
                 lexeme.setOffset(last_l.getOffset());
                 lexeme.setLength(last_l.getLength() + lexeme.getLength());

+ 7 - 0
nlp/src/main/resources/push-tc.dict

@@ -613,6 +613,7 @@ C18qk9+dMlSia0min0kp8g==
 3d/WjynA3ZE=
 s19RNBU9menPe2hcTTjvZMaXZipQ6CyIa6XEl+YvnKc=
 DsQ6OHDgaIfAYqmG4uHerw==
+wfG2JAfrKms=
 /FM0Pm8mfPoxczT06aKb9/LMud23gYLLomtJop9JKfI=
 MO/5GZZnifU=
 M0V+L98q5enEXnmZwz/S9A==
@@ -15384,6 +15385,7 @@ uWYYc3QGf8YgF1RKveuxgEqmgCoqmRx4
 +cXaBAXkA2IT/c0Q/XovDA==
 a6gckjqporLVnZBXpF4gPQ==
 /FLxB86zFvD9qys/bhtPqA==
+/FLxB86zFvCkjvAXo6FDbw==
 2hGyDy6RqyT2+I4lpp4e5g==
 6PWqwSU8wtqwPP9RJ3ntdA==
 6PWqwSU8wto1Njn0B9xAiMyizcisX1Ri
@@ -18898,6 +18900,7 @@ FOxZHHSSTYVpOeSDCuauhw==
 FOxZHHSSTYVZQ5MV9gzneg==
 ZqFa8Cs3hGBpb/c9CGc9Hg==
 D4bNvKIVB4BFN3Uoy4yfrw==
+D4bNvKIVB4BtuTwGwfvCMyUqSkw4UW0z
 siiTSr7NOol/hHBLqnfJDQ==
 siiTSr7NOom7IR1CcIW7jQ==
 NzI29j1jHH+gyLBYFhE+PQ==
@@ -23245,6 +23248,7 @@ M86lo9FehGQeI0aYXePUfERygBGfeUG66Lq91temWrE=
 M86lo9FehGQeI0aYXePUfKLWkS+co0jZwGKphuLh3q8=
 xtHcS4zBOjoDbKulPgUFQw6sSBa7POAd1+u5zxjgAp/a43eSz9L0tg==
 TZWtwueKrfxvlaanwRGZ7w==
+uoqauMcrQITEcLs+8pscyA==
 uoqauMcrQIR0NEfUm4cCe3Lm8EIUKRZm
 uoqauMcrQITU52Qr5eHfse5YezfIb8+ZP1EmYQhTg+90AfxWInngYw==
 uoqauMcrQIRAF20tEH15i/ztC0tMarB2SmARnfpRl/s=
@@ -24040,6 +24044,7 @@ SnpC72g3l+eiw+KKDGHU+joHoy7uDJrR
 SnpC72g3l+dZpcr0plk9KK9UmcRm6vkk
 SnpC72g3l+ezvhyhWJ2uqEQNp9ylzA0r/0m5yTkB2mk=
 SnpC72g3l+dC98nxzkFsD69UmcRm6vkk
+p2HQykvAKzIQM2dnqFlPog==
 Bugt8EYpa38ToqtXFzcysA==
 FSo4yXsyhSZpOeSDCuauhw==
 FSo4yXsyhSYm4zHcg/e8MPZtvjHT9QiU
@@ -33799,6 +33804,7 @@ FLiSnjOnVbodPRvFHCA+uiiTdCJGfNBrSmARnfpRl/s=
 +dGJNGSnaFG14lDAP8dXSI9Gm3kSRuFMSmARnfpRl/s=
 SPbO7yoLjss+jowRk1soEypfVcjw8RT+/0m5yTkB2mk=
 SPbO7yoLjss+jowRk1soE+NZt7QAFzUfOdMWg3lvN5g=
+PcbVghssnoFETfQw8af7A9y74GHJGT6vSmARnfpRl/s=
 n6bTRApU5HSw2Os3nDhS/K9UmcRm6vkk
 n6bTRApU5HSyRkJnuFAqwhBfvgl0SI6vSmARnfpRl/s=
 n6bTRApU5HTdai6MgydPm1jrgEKY4IyD/0m5yTkB2mk=
@@ -54700,6 +54706,7 @@ J/+qX2SUyrlpb/c9CGc9Hg==
 RnkjKy7bTuaHSoHBmV4iZA==
 RnkjKy7bTuYJt9ie6Dtkn3Lm8EIUKRZm
 RnkjKy7bTuaIbnY0/MsG2LkxqNaB+sgQSmARnfpRl/s=
+RnkjKy7bTuZljf2GHyf4mQPZnLTJIPKK
 RnkjKy7bTuYOw4yCqX+MJgPZnLTJIPKK
 RnkjKy7bTuYzEYPct5eSGFUWUz1oJvxh
 RnkjKy7bTuauJDbDkUxjoAPZnLTJIPKK

+ 7 - 0
nlp/src/main/resources/tc.dict

@@ -1095,6 +1095,7 @@ N0KyOCGIJsHuRAmaKb9XfaJrSaKfSSny
 s19RNBU9menPe2hcTTjvZMaXZipQ6CyIHIGL3TVQTHiia0min0kp8g==
 DsQ6OHDgaId6xtMh07OTFQ==
 D/198s7vDcJAqs2j2AfBag==
+bwNN0VOyWuZKYBGd+lGX+w==
 /FM0Pm8mfPoxczT06aKb90isv5GNKnLwZ+Vo+M4S6bc=
 0k+Y7tbSGq8=
 llwaaJUOPvMQM2dnqFlPog==
@@ -94074,6 +94075,7 @@ HDGylU4rN7HuXvlrwBJOskT4i8PzYhUZ
 rQbw0PBkCSkaaaE3kPheoUpgEZ36UZf7
 NK5/9OdQDluWWztDgDbFbWrXI80Mitm6aW/3PQhnPR4=
 /FLxB86zFvAj4C5tb+oA5MWkjbk6+6De
+/FLxB86zFvDisQKU3O65MCyrN35xm8Fe
 /FLxB86zFvCs5t/2i6NyyC1TT/wzQpxQ
 x4/dlUy0c+Ysuj2laB5T58ZvhQC9AxJX
 x4/dlUy0c+YiTyM/LTzfSMZvhQC9AxJX
@@ -116897,6 +116899,7 @@ FOxZHHSSTYUv0dwzXmBiHA==
 FOxZHHSSTYVDhxQDMKFf1UpgEZ36UZf7
 eafO4xMgBUvqU/5hE0/sxA==
 D4bNvKIVB4DWStAhwlD3rQ==
+D4bNvKIVB4BtuTwGwfvCM2xz/XS7NAvJ
 Q/JnyLPLpmothL8O2n0qMQ==
 siiTSr7NOonvlOAZdWEeGUpgEZ36UZf7
 siiTSr7NOokraPZMVn+4H0pgEZ36UZf7
@@ -140370,6 +140373,7 @@ xtHcS4zBOjoDbKulPgUFQw6sSBa7POAd1+u5zxjgAp9uyQaIL0CkWg==
 TZWtwueKrfzUnqb3YNRU3Q==
 TZWtwueKrfwNL7FJC0OmyhAzZ2eoWU+i
 TZWtwueKrfwjye1bL3YkKA==
+uoqauMcrQITjzRAY9OHCcg==
 uoqauMcrQIR0NEfUm4cCe7QivoghFYAD
 uoqauMcrQITU52Qr5eHfse5YezfIb8+ZP1EmYQhTg++xGULOIEx4IRAzZ2eoWU+i
 uoqauMcrQIRAF20tEH15i/b7PrDqB3PbcubwQhQpFmY=
@@ -147339,6 +147343,7 @@ nto+jnbaNApqQjanXsrwMw==
 5q6LzktURwRqQjanXsrwMw==
 eLfd5b5tyb5SLHgFIbWASxAzZ2eoWU+i
 +rpJYnAmTLNqQjanXsrwMw==
+p2HQykvAKzJqQjanXsrwMw==
 HWO4etC3TY1qQjanXsrwMw==
 /cHnTBgp+7lqQjanXsrwMw==
 z95DwzxgJ//C/JZbVqU+lxAzZ2eoWU+i
@@ -223021,6 +223026,7 @@ FLiSnjOnVbodPRvFHCA+ut/ESTlV6THIOgejLu4MmtE=
 +dGJNGSnaFG14lDAP8dXSHAoBb27qLgoOgejLu4MmtE=
 SPbO7yoLjss+jowRk1soEypfVcjw8RT+41RB8ykuU5I=
 SPbO7yoLjss+jowRk1soE+NZt7QAFzUfUJJnFEqU3/BKYBGd+lGX+w==
+PcbVghssnoFETfQw8af7A3CFGrtHZEpBOgejLu4MmtE=
 n6bTRApU5HSw2Os3nDhS/MzHjQobrZhl
 n6bTRApU5HSyRkJnuFAqwoaS9d1JMN8FOgejLu4MmtE=
 n6bTRApU5HTdai6MgydPm1jrgEKY4IyD41RB8ykuU5I=
@@ -327002,6 +327008,7 @@ RnkjKy7bTuYJt9ie6Dtkn7QivoghFYAD
 RnkjKy7bTuaIbnY0/MsG2HBO0HUkJijtcubwQhQpFmY=
 RnkjKy7bTuYnSqWLOTSdbdp9HMGBxd4R
 RnkjKy7bTuYpXy1P6RB+Kw1eSoEoetEG
+RnkjKy7bTuZljf2GHyf4mfNNEaI2FB0k
 RnkjKy7bTuYOw4yCqX+MJvNNEaI2FB0k
 RnkjKy7bTuYzEYPct5eSGA9RBtG3+EUq
 RnkjKy7bTuauJDbDkUxjoPNNEaI2FB0k