LUCENE-8933: Validate JapaneseTokenizer user dictionary entry (#809)

* LUCENE-8933: Validate that the concatenated segmentation of each JapaneseTokenizer user dictionary entry is the same as its surface form.
2019-08-14 12:04:52 +09:00 · 2019-08-14 12:04:52 +09:00 · 73ba88a50d
parent 9e6047331b
commit 73ba88a50d
4 changed files with 38 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -33,6 +33,8 @@ API Changes
 * LUCENE-8948: Change "name" argument in ICU factories to "form". Here, "form" is
  named after "Unicode Normalization Form". (Tomoko Uchida)

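+* LUCENE-8933: JapaneseTokenizer user dictionary entries are now validated on load; an entry whose concatenated segmentation differs from its surface form is rejected with an exception. (Tomoko Uchida)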
+
 Improvements

 * LUCENE-8757: When provided with an ExecutorService to run queries across
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@@ -1,5 +1,17 @@
 # Apache Lucene Migration Guide

+## Kuromoji user dictionary now forbids illegal segmentation (LUCENE-8933) ##
+
+The user dictionary now strictly validates that the concatenated segmentation is the same as the surface form. This change
+avoids unexpected runtime exceptions or behaviours.
+For example, the following entries are no longer allowed, and an exception is thrown when the dictionary file is loaded.
+
+# concatenated "日本経済新聞" does not match the surface form "日経新聞"
+日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
+
+# concatenated "日経新聞" does not match the surface form "日本経済新聞"
+日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞
+
 ## Analysis factories now have customizable symbolic names (LUCENE-8778) ##

 The SPI names for concrete subclasses of TokenizerFactory, TokenFilterFactory, and CharfilterFactory are no longer
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -104,6 +104,8 @@ public final class UserDictionary implements Dictionary {
    long ord = 0;
    
    for (String[] values : featureEntries) {
+      String surface = values[0].replaceAll("\\s", "");
+      String concatenatedSegment = values[1].replaceAll("\\s", "");
      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
      String[] readings = values[2].replaceAll("  *", " ").split(" ");
      String pos = values[3];
@@ -113,6 +115,12 @@ public final class UserDictionary implements Dictionary {
                                   " - the number of segmentations (" + segmentation.length + ")" +
                                   " does not the match number of readings (" + readings.length + ")");
      }
+
+      if (!surface.equals(concatenatedSegment)) {
+        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
+                                   " - the concatenated segmentation (" + concatenatedSegment + ")" +
+                                   " does not match the surface form (" + surface + ")");
+      }
      
      int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
      wordIdAndLength[0] = wordId;
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.ja.dict;


 import java.io.IOException;
+import java.io.StringReader;

 import org.apache.lucene.analysis.ja.TestJapaneseTokenizer;
 import org.apache.lucene.util.LuceneTestCase;
@@ -77,4 +78,19 @@ public class UserDictionaryTest extends LuceneTestCase {
    UserDictionary dictionary = TestJapaneseTokenizer.readDict();
    assertNotNull(dictionary);
  }
+
+  @Test(expected = RuntimeException.class)
+  public void testReadInvalid1() throws IOException {
+    // segments concatenate to "日本経済新聞", which is longer than the surface form "日経新聞"; a RuntimeException is expected
+    String invalidEntry = "日経新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞";
+    UserDictionary dictionary = UserDictionary.open(new StringReader(invalidEntry));
+  }
+
+  @Test(expected = RuntimeException.class)
+  public void testReadInvalid2() throws IOException {
+    // segments concatenate to "日経新聞", which is shorter than the surface form "日本経済新聞"; a RuntimeException is expected
+    String invalidEntry = "日本経済新聞,日経 新聞,ニッケイ シンブン,カスタム名詞";
+    UserDictionary dictionary = UserDictionary.open(new StringReader(invalidEntry));
+  }
+
 }