mirror of https://github.com/apache/lucene.git
Fix for empty Kuromoji user dictionary NPE (LUCENE-6468)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678685 13f79535-47bb-0310-9956-ffa450edef68
parent 216fd8d46a
commit 9d49a76d01
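In short: building a UserDictionary from an input that contains no entries used to produce an object that later caused a NullPointerException inside JapaneseTokenizer. This commit replaces the public UserDictionary(Reader) constructor with a static factory, UserDictionary.open(Reader), which returns null when the input holds no entries; JapaneseTokenizer already treats a null user dictionary as "no user dictionary". Caller migration is mechanical; a minimal sketch against the post-commit API (the helper class is illustrative, not part of the commit):

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.ja.dict.UserDictionary;

    class LoadUserDict {
      // before: userDictionary = new UserDictionary(reader);
      // after:  the result may legitimately be null for empty input
      static UserDictionary load(Reader reader) throws IOException {
        return UserDictionary.open(reader);
      }
    }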
@@ -162,6 +162,9 @@ Bug Fixes
 * LUCENE-6427: Added assertion about the presence of ghost bits in
   (Fixed|Long)BitSet. (Luc Vanlerberghe via Adrien Grand)
 
+* LUCENE-6468: Fixed NPE with empty Kuromoji user dictionary.
+  (Jun Ohtani via Christian Moen)
+
 API Changes
 
 * LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader
@@ -91,7 +91,7 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
           .onMalformedInput(CodingErrorAction.REPORT)
           .onUnmappableCharacter(CodingErrorAction.REPORT);
       Reader reader = new InputStreamReader(stream, decoder);
-      userDictionary = new UserDictionary(reader);
+      userDictionary = UserDictionary.open(reader);
     } else {
       userDictionary = null;
     }
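With open() in place, both branches of the factory's dictionary loading converge on the same contract: userDictionary is either a populated dictionary or null, and JapaneseTokenizer accepts null. A self-contained usage sketch under that contract (the input text and Mode choice are illustrative; the constructor signature is the Lucene 5.x one):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
    import org.apache.lucene.analysis.ja.dict.UserDictionary;

    public class EmptyUserDictDemo {
      public static void main(String[] args) throws IOException {
        // A comment-only source: open() finds no entries and returns null.
        UserDictionary dict = UserDictionary.open(new StringReader("# empty\n"));
        // null is accepted: the tokenizer falls back to the system dictionary.
        Tokenizer tok = new JapaneseTokenizer(dict, false, Mode.SEARCH);
        tok.setReader(new StringReader("これは本ではない"));
        tok.reset();
        while (tok.incrementToken()) {
          // consume tokens produced by default segmentation
        }
        tok.end();
        tok.close();
      }
    }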
@@ -57,10 +57,10 @@ public final class UserDictionary implements Dictionary {
 
   public static final int RIGHT_ID = 5;
 
-  public UserDictionary(Reader reader) throws IOException {
+  public static UserDictionary open(Reader reader) throws IOException {
+
     BufferedReader br = new BufferedReader(reader);
     String line = null;
-    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     List<String[]> featureEntries = new ArrayList<>();
 
     // text, segmentation, readings, POS
@@ -76,6 +76,16 @@ public final class UserDictionary implements Dictionary {
       featureEntries.add(values);
     }
 
+    if (featureEntries.isEmpty()) {
+      return null;
+    } else {
+      return new UserDictionary(featureEntries);
+    }
+  }
+
+  private UserDictionary(List<String[]> featureEntries) throws IOException {
+
+    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     // TODO: should we allow multiple segmentations per input 'phrase'?
     // the old treemap didn't support this either, and i'm not sure if it's needed/useful?
 
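The two hunks above are the heart of the fix: parsing moves into a static factory that decides whether a dictionary exists at all, while the now-private constructor only ever sees a non-empty entry list. A stripped-down sketch of that shape, with the CSV feature handling and FST construction elided (class and field names here are illustrative, not the real ones):

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.Reader;
    import java.util.ArrayList;
    import java.util.List;

    public final class SimpleUserDict {
      private final List<String[]> entries;

      public static SimpleUserDict open(Reader reader) throws IOException {
        BufferedReader br = new BufferedReader(reader);
        List<String[]> featureEntries = new ArrayList<>();
        String line;
        while ((line = br.readLine()) != null) {
          line = line.replaceAll("#.*$", "").trim(); // drop comments
          if (line.isEmpty()) {
            continue; // skip blank and comment-only lines
          }
          featureEntries.add(line.split(","));
        }
        // The crux of LUCENE-6468: signal "no dictionary" with null instead
        // of constructing an object that downstream code cannot use safely.
        return featureEntries.isEmpty() ? null : new SimpleUserDict(featureEntries);
      }

      private SimpleUserDict(List<String[]> featureEntries) {
        this.entries = featureEntries; // real code builds an FST and id maps here
      }
    }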
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Random;
 
@@ -39,7 +40,8 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
 
-public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
+public class
+TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
   public static UserDictionary readDict() {
     InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
@@ -49,7 +51,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
     try {
       try {
         Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
-        return new UserDictionary(reader);
+        return UserDictionary.open(reader);
       } finally {
         is.close();
       }
@@ -686,4 +688,24 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
                      new int[] { 1, 1, 1, 1, 1},
                      new int[] { 1, 1, 1, 1, 1});
   }
+
+  public void testEmptyUserDict() throws Exception {
+    Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
+    UserDictionary emptyDict = UserDictionary.open(emptyReader);
+
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "これは本ではない",
+        new String[]{"これ", "は", "本", "で", "は", "ない"},
+        new int[]{0, 2, 3, 4, 5, 6},
+        new int[]{2, 3, 4, 5, 6, 8}
+    );
+    analyzer.close();
+  }
 }
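The new test exercises exactly this path: a comment-only input makes open() return null, and the analyzer built around that null dictionary must fall back to plain system-dictionary segmentation, splitting これは本ではない into これ / は / 本 / で / は / ない with the listed start and end offsets. Before this commit, the same setup hit the NullPointerException that LUCENE-6468 reports.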