LUCENE-8933: Validate JapaneseTokenizer user dictionary entry (#809)

* LUCENE-8933: Validate that the concatenated segment of each JapaneseTokenizer user dictionary entry is the same as its surface form.
This commit is contained in:
Tomoko Uchida 2019-08-14 12:04:52 +09:00 committed by GitHub
parent 9e6047331b
commit 73ba88a50d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 38 additions and 0 deletions

View File

@ -33,6 +33,8 @@ API Changes
* LUCENE-8948: Change "name" argument in ICU factories to "form". Here, "form" is
named after "Unicode Normalization Form". (Tomoko Uchida)
* LUCENE-8933: Validate JapaneseTokenizer user dictionary entry. (Tomoko Uchida)
Improvements
* LUCENE-8757: When provided with an ExecutorService to run queries across

View File

@ -1,5 +1,17 @@
# Apache Lucene Migration Guide
## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
The user dictionary now strictly validates that the (concatenated) segment is the same as the surface form. This change
avoids unexpected runtime exceptions or behaviours.
For example, the following entries are not allowed, and an exception is thrown when the dictionary file is loaded.
# concatenated "日本経済新聞" does not match the surface form "日経新聞"
日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
# concatenated "日経新聞" does not match the surface form "日本経済新聞"
日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞
## Analysis factories now have customizable symbolic names (LUCENE-8778) ##
The SPI names for concrete subclasses of TokenizerFactory, TokenFilterFactory, and CharfilterFactory are no longer

View File

@ -104,6 +104,8 @@ public final class UserDictionary implements Dictionary {
long ord = 0;
for (String[] values : featureEntries) {
String surface = values[0].replaceAll("\\s", "");
String concatenatedSegment = values[1].replaceAll("\\s", "");
String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
String[] readings = values[2].replaceAll(" *", " ").split(" ");
String pos = values[3];
@ -113,6 +115,12 @@ public final class UserDictionary implements Dictionary {
" - the number of segmentations (" + segmentation.length + ")" +
" does not the match number of readings (" + readings.length + ")");
}
if (!surface.equals(concatenatedSegment)) {
throw new RuntimeException("Illegal user dictionary entry " + values[0] +
" - the concatenated segmentation (" + concatenatedSegment + ")" +
" does not match the surface form (" + surface + ")");
}
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
wordIdAndLength[0] = wordId;

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.ja.TestJapaneseTokenizer;
import org.apache.lucene.util.LuceneTestCase;
@ -77,4 +78,19 @@ public class UserDictionaryTest extends LuceneTestCase {
UserDictionary dictionary = TestJapaneseTokenizer.readDict();
assertNotNull(dictionary);
}
/**
 * Opening a user dictionary must fail when the concatenated segmentation
 * ("日本経済新聞") is longer than, and therefore different from, the entry's
 * surface form ("日経新聞").
 */
@Test(expected = RuntimeException.class)
public void testReadInvalid1() throws IOException {
  // the concatenated segment must be the same as the surface form
  String invalidEntry = "日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞";
  // The return value is intentionally discarded: the test only checks that open() throws.
  UserDictionary.open(new StringReader(invalidEntry));
}
/**
 * Opening a user dictionary must fail when the concatenated segmentation
 * ("日経新聞") is shorter than, and therefore different from, the entry's
 * surface form ("日本経済新聞").
 */
@Test(expected = RuntimeException.class)
public void testReadInvalid2() throws IOException {
  // the concatenated segment must be the same as the surface form
  String invalidEntry = "日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞";
  // The return value is intentionally discarded: the test only checks that open() throws.
  UserDictionary.open(new StringReader(invalidEntry));
}
}