From 9d49a76d01a3bb639c2ed3cf977a5063b0971c94 Mon Sep 17 00:00:00 2001
From: Christian Moen
Date: Mon, 11 May 2015 08:03:56 +0000
Subject: [PATCH] Fix for empty Kuromoji user dictionary NPE (LUCENE-6468)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678685 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                             |  3 +++
 .../analysis/ja/JapaneseTokenizerFactory.java  |  2 +-
 .../analysis/ja/dict/UserDictionary.java       | 22 +++++++++++-----
 .../analysis/ja/TestJapaneseTokenizer.java     | 26 +++++++++++++++++--
 4 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2c1e93ad901..0f373ba4235 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -162,6 +162,9 @@ Bug Fixes
 * LUCENE-6427: Added assertion about the presence of ghost bits in
   (Fixed|Long)BitSet. (Luc Vanlerberghe via Adrien Grand)
 
+* LUCENE-6468: Fixed NPE with empty Kuromoji user dictionary.
+  (Jun Ohtani via Christian Moen)
+
 API Changes
 
 * LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
index ef164d57a4b..13a2de5fcbd 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
@@ -91,7 +91,7 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
           .onMalformedInput(CodingErrorAction.REPORT)
           .onUnmappableCharacter(CodingErrorAction.REPORT);
       Reader reader = new InputStreamReader(stream, decoder);
-      userDictionary = new UserDictionary(reader);
+      userDictionary = UserDictionary.open(reader);
     } else {
       userDictionary = null;
     }
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
index 41af027761f..3e72745120b 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -56,18 +56,18 @@ public final class UserDictionary implements Dictionary {
   public static final int LEFT_ID = 5;
 
   public static final int RIGHT_ID = 5;
-  
-  public UserDictionary(Reader reader) throws IOException {
+
+  public static UserDictionary open(Reader reader) throws IOException {
+
     BufferedReader br = new BufferedReader(reader);
     String line = null;
-    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     List<String[]> featureEntries = new ArrayList<>();
-    
+
     // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
       // Remove comments
       line = line.replaceAll("#.*$", "");
-      
+
       // Skip empty lines or comment lines
       if (line.trim().length() == 0) {
         continue;
@@ -75,7 +75,17 @@ public final class UserDictionary implements Dictionary {
       String[] values = CSVUtil.parse(line);
       featureEntries.add(values);
     }
-    
+
+    if (featureEntries.isEmpty()) {
+      return null;
+    } else {
+      return new UserDictionary(featureEntries);
+    }
+  }
+
+  private UserDictionary(List<String[]> featureEntries) throws IOException {
+
+    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     // TODO: should we allow multiple segmentations per input 'phrase'?
     // the old treemap didn't support this either, and i'm not sure if it's needed/useful?
 
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
index 384f0a8f87c..ba172a00222 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Random;
 
@@ -39,7 +40,8 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
 
-public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
+public class
+  TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
   public static UserDictionary readDict() {
     InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
@@ -49,7 +51,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
     try {
       try {
         Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
-        return new UserDictionary(reader);
+        return UserDictionary.open(reader);
       } finally {
         is.close();
       }
@@ -686,4 +688,24 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
                       new int[] { 1, 1, 1, 1, 1},
                       new int[] { 1, 1, 1, 1, 1});
   }
+
+  public void testEmptyUserDict() throws Exception {
+    Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
+    UserDictionary emptyDict = UserDictionary.open(emptyReader);
+
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "これは本ではない",
+                     new String[]{"これ", "は", "本", "で", "は", "ない"},
+                     new int[]{0, 2, 3, 4, 5, 6},
+                     new int[]{2, 3, 4, 5, 6, 8}
+    );
+    analyzer.close();
+  }
 }
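
Note (not part of the patch above): the snippet below is a minimal caller-side sketch of the new UserDictionary.open(Reader) factory, assuming the Lucene trunk/5.x Kuromoji API touched by this change; the class name EmptyUserDictExample and the dictionary text are illustrative only. It demonstrates the behavior the patch relies on: open() returns null when the input contains no dictionary entries, and JapaneseTokenizer already accepts a null user dictionary, so an empty or comment-only user dictionary file no longer leads to an NPE.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;

public class EmptyUserDictExample {
  public static void main(String[] args) throws IOException {
    // A comment-only user dictionary: open() now returns null instead of
    // trying to build dictionary data from zero entries.
    UserDictionary dict = UserDictionary.open(new StringReader("# no entries\n"));

    // Passing the possibly-null dictionary straight through is safe, which is
    // what JapaneseTokenizerFactory does after this patch.
    JapaneseTokenizer tokenizer = new JapaneseTokenizer(dict, false, Mode.SEARCH);
    tokenizer.close();
  }
}

Returning null from a static factory, rather than throwing, keeps the existing "no user dictionary" code path in JapaneseTokenizerFactory working without any extra checks at the call site.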