mirror of https://github.com/apache/lucene.git
LUCENE-8933: Validate JapaneseTokenizer user dictionary entry (#809)
* LUCENE-8933: Validate JapaneseTokenizer user dictionary entry if the concatenated segment is same as its surface form.
This commit is contained in:
parent
9e6047331b
commit
73ba88a50d
|
@ -33,6 +33,8 @@ API Changes
|
|||
* LUCENE-8948: Change "name" argument in ICU factories to "form". Here, "form" is
|
||||
named after "Unicode Normalization Form". (Tomoko Uchida)
|
||||
|
||||
* LUCENE-8933: Validate JapaneseTokenizer user dictionary entry. (Tomoko Uchida)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-8757: When provided with an ExecutorService to run queries across
|
||||
|
|
|
@ -1,5 +1,17 @@
|
|||
# Apache Lucene Migration Guide
|
||||
|
||||
## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
|
||||
|
||||
The user dictionary now strictly validates that the (concatenated) segmentation is the same as the surface form. This change avoids
|
||||
unexpected runtime exceptions or behaviours.
|
||||
For example, the following entries are not allowed, and an exception is thrown when the dictionary file is loaded.
|
||||
|
||||
# concatenated "日本経済新聞" does not match the surface form "日経新聞"
|
||||
日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
|
||||
|
||||
# concatenated "日経新聞" does not match the surface form "日本経済新聞"
|
||||
日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞
|
||||
|
||||
## Analysis factories now have customizable symbolic names (LUCENE-8778) ##
|
||||
|
||||
The SPI names for concrete subclasses of TokenizerFactory, TokenFilterFactory, and CharFilterFactory are no longer
|
||||
|
|
|
@ -104,6 +104,8 @@ public final class UserDictionary implements Dictionary {
|
|||
long ord = 0;
|
||||
|
||||
for (String[] values : featureEntries) {
|
||||
String surface = values[0].replaceAll("\\s", "");
|
||||
String concatenatedSegment = values[1].replaceAll("\\s", "");
|
||||
String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
|
||||
String[] readings = values[2].replaceAll(" *", " ").split(" ");
|
||||
String pos = values[3];
|
||||
|
@ -113,6 +115,12 @@ public final class UserDictionary implements Dictionary {
|
|||
" - the number of segmentations (" + segmentation.length + ")" +
|
||||
" does not the match number of readings (" + readings.length + ")");
|
||||
}
|
||||
|
||||
if (!surface.equals(concatenatedSegment)) {
|
||||
throw new RuntimeException("Illegal user dictionary entry " + values[0] +
|
||||
" - the concatenated segmentation (" + concatenatedSegment + ")" +
|
||||
" does not match the surface form (" + surface + ")");
|
||||
}
|
||||
|
||||
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
|
||||
wordIdAndLength[0] = wordId;
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict;
|
|||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.ja.TestJapaneseTokenizer;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
@ -77,4 +78,19 @@ public class UserDictionaryTest extends LuceneTestCase {
|
|||
UserDictionary dictionary = TestJapaneseTokenizer.readDict();
|
||||
assertNotNull(dictionary);
|
||||
}
|
||||
|
||||
@Test(expected = RuntimeException.class)
|
||||
public void testReadInvalid1() throws IOException {
|
||||
// the concatenated segment must be the same as the surface form
|
||||
String invalidEntry = "日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞";
|
||||
UserDictionary dictionary = UserDictionary.open(new StringReader(invalidEntry));
|
||||
}
|
||||
|
||||
@Test(expected = RuntimeException.class)
|
||||
public void testReadInvalid2() throws IOException {
|
||||
// the concatenated segment must be the same as the surface form
|
||||
String invalidEntry = "日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞";
|
||||
UserDictionary dictionary = UserDictionary.open(new StringReader(invalidEntry));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue