mirror of https://github.com/apache/lucene.git
Fix for empty Kuromoji user dictionary NPE (LUCENE-6468)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678685 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 216fd8d46a
commit 9d49a76d01
lucene/CHANGES.txt
@@ -162,6 +162,9 @@ Bug Fixes
 * LUCENE-6427: Added assertion about the presence of ghost bits in
   (Fixed|Long)BitSet. (Luc Vanlerberghe via Adrien Grand)
 
+* LUCENE-6468: Fixed NPE with empty Kuromoji user dictionary.
+  (Jun Ohtani via Christian Moen)
+
 API Changes
 
 * LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader
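Here "empty" means a user dictionary whose content reduces to no entries, for example a file containing only comments and blank lines. A minimal sketch of the post-fix behavior (the wrapper class name is ours; the input string is the one used by the new test further below):

    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.ja.dict.UserDictionary;

    public class EmptyUserDictSketch {
      public static void main(String[] args) throws Exception {
        // Comments and blank lines parse to zero dictionary entries.
        Reader reader = new StringReader("\n# This is an empty user dictionary\n\n");
        // Before this fix, building a UserDictionary from such input led to an
        // NPE at tokenization time; open() now returns null instead.
        System.out.println(UserDictionary.open(reader) == null);  // prints true
      }
    }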
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
@@ -91,7 +91,7 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware
           .onMalformedInput(CodingErrorAction.REPORT)
           .onUnmappableCharacter(CodingErrorAction.REPORT);
       Reader reader = new InputStreamReader(stream, decoder);
-      userDictionary = new UserDictionary(reader);
+      userDictionary = UserDictionary.open(reader);
     } else {
       userDictionary = null;
     }
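With this change the factory tolerates a user dictionary file that parses to zero entries: UserDictionary.open() returns null, leaving the field in the same state as when no dictionary is configured at all. A usage sketch, not part of this commit, assuming the kuromoji module is on the classpath, that "japanese" is the factory's SPI name, and that userdict.txt is a hypothetical classpath resource:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;

    public class JapaneseAnalyzerSketch {
      public static Analyzer build() throws IOException {
        // "userDictionary" and "mode" are the factory's parameters; if the
        // referenced file holds only comments and blank lines, the tokenizer
        // now runs without user entries instead of hitting an NPE.
        return CustomAnalyzer.builder()
            .withTokenizer("japanese",
                "userDictionary", "userdict.txt",
                "mode", "search")
            .build();
      }
    }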
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -56,18 +56,18 @@ public final class UserDictionary implements Dictionary {
   public static final int LEFT_ID = 5;
 
   public static final int RIGHT_ID = 5;
 
-  public UserDictionary(Reader reader) throws IOException {
+  public static UserDictionary open(Reader reader) throws IOException {
+
     BufferedReader br = new BufferedReader(reader);
     String line = null;
-    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     List<String[]> featureEntries = new ArrayList<>();
 
     // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
       // Remove comments
       line = line.replaceAll("#.*$", "");
 
       // Skip empty lines or comment lines
       if (line.trim().length() == 0) {
         continue;
@@ -75,7 +75,17 @@ public final class UserDictionary implements Dictionary {
       String[] values = CSVUtil.parse(line);
       featureEntries.add(values);
     }
-    
+
+    if (featureEntries.isEmpty()) {
+      return null;
+    } else {
+      return new UserDictionary(featureEntries);
+    }
+  }
+
+  private UserDictionary(List<String[]> featureEntries) throws IOException {
+
+    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     // TODO: should we allow multiple segmentations per input 'phrase'?
     // the old treemap didn't support this either, and i'm not sure if it's needed/useful?
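The shape of the fix: the public constructor becomes a static factory that parses first, constructs a UserDictionary only when at least one entry was read, and returns null otherwise; the now-private constructor holds the build logic, including the wordId counter that moved out of the parse loop. A minimal caller sketch, not from this commit (the wrapper class and method names are ours); JapaneseTokenizer already accepts a null user dictionary:

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
    import org.apache.lucene.analysis.ja.dict.UserDictionary;

    public class UserDictCallerSketch {
      public static Tokenizer create(Reader userDictReader) throws IOException {
        // open() replaces `new UserDictionary(reader)` and may return null
        // when the input contains no dictionary entries.
        UserDictionary dict = UserDictionary.open(userDictReader);
        return new JapaneseTokenizer(dict, false, Mode.SEARCH);
      }
    }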
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Random;
@@ -39,7 +40,8 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
 
-public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
+public class
+TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
   public static UserDictionary readDict() {
     InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
@@ -49,7 +51,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
     try {
       try {
         Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
-        return new UserDictionary(reader);
+        return UserDictionary.open(reader);
       } finally {
         is.close();
       }
@@ -686,4 +688,24 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
         new int[] { 1, 1, 1, 1, 1},
         new int[] { 1, 1, 1, 1, 1});
   }
+
+  public void testEmptyUserDict() throws Exception {
+    Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
+    UserDictionary emptyDict = UserDictionary.open(emptyReader);
+
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "これは本ではない",
+        new String[]{"これ", "は", "本", "で", "は", "ない"},
+        new int[]{0, 2, 3, 4, 5, 6},
+        new int[]{2, 3, 4, 5, 6, 8}
+    );
+    analyzer.close();
+  }
 }
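The new test drives the whole analysis chain with a dictionary that reduces to nothing and checks that plain segmentation still works. A more direct companion assertion, sketched in the test's own style but not part of this commit, would pin down the contract of open() itself:

    public void testOpenReturnsNullOnEmptyInput() throws Exception {
      // Hypothetical extra test: open() returns null when the input yields
      // no entries, which is exactly what the tokenizer path relies on.
      assertNull(UserDictionary.open(new StringReader("\n# This is an empty user dictionary\n\n")));
    }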