From 9d49a76d01a3bb639c2ed3cf977a5063b0971c94 Mon Sep 17 00:00:00 2001
From: Christian Moen
Date: Mon, 11 May 2015 08:03:56 +0000
Subject: [PATCH] Fix for empty Kuromoji user dictionary NPE (LUCENE-6468)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678685 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                             |  3 +++
 .../analysis/ja/JapaneseTokenizerFactory.java  |  2 +-
 .../analysis/ja/dict/UserDictionary.java       | 22 +++++++++++-----
 .../analysis/ja/TestJapaneseTokenizer.java     | 26 +++++++++++++++++--
 4 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2c1e93ad901..0f373ba4235 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -162,6 +162,9 @@ Bug Fixes
 * LUCENE-6427: Added assertion about the presence of ghost bits in
   (Fixed|Long)BitSet. (Luc Vanlerberghe via Adrien Grand)
 
+* LUCENE-6468: Fixed NPE with empty Kuromoji user dictionary.
+  (Jun Ohtani via Christian Moen)
+
 API Changes
 
 * LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
index ef164d57a4b..13a2de5fcbd 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
@@ -91,7 +91,7 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
           .onMalformedInput(CodingErrorAction.REPORT)
           .onUnmappableCharacter(CodingErrorAction.REPORT);
       Reader reader = new InputStreamReader(stream, decoder);
-      userDictionary = new UserDictionary(reader);
+      userDictionary = UserDictionary.open(reader);
     } else {
       userDictionary = null;
     }
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
index 41af027761f..3e72745120b 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -56,18 +56,18 @@ public final class UserDictionary implements Dictionary {
   public static final int LEFT_ID = 5;
 
   public static final int RIGHT_ID = 5;
-  
-  public UserDictionary(Reader reader) throws IOException {
+
+  public static UserDictionary open(Reader reader) throws IOException {
+
     BufferedReader br = new BufferedReader(reader);
     String line = null;
-    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     List<String[]> featureEntries = new ArrayList<>();
-    
+
     // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
       // Remove comments
       line = line.replaceAll("#.*$", "");
-      
+
       // Skip empty lines or comment lines
       if (line.trim().length() == 0) {
         continue;
@@ -75,7 +75,17 @@ public final class UserDictionary implements Dictionary {
       String[] values = CSVUtil.parse(line);
       featureEntries.add(values);
     }
-    
+
+    if (featureEntries.isEmpty()) {
+      return null;
+    } else {
+      return new UserDictionary(featureEntries);
+    }
+  }
+
+  private UserDictionary(List<String[]> featureEntries) throws IOException {
+
+    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     // TODO: should we allow multiple segmentations per input 'phrase'?
     // the old treemap didn't support this either, and i'm not sure if it's needed/useful?
 
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
index 384f0a8f87c..ba172a00222 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Random;
 
@@ -39,7 +40,8 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
 
-public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
+public class
+  TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
   public static UserDictionary readDict() {
     InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
@@ -49,7 +51,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
     try {
       try {
         Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
-        return new UserDictionary(reader);
+        return UserDictionary.open(reader);
       } finally {
         is.close();
       }
@@ -686,4 +688,24 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
                       new int[] { 1, 1, 1, 1, 1},
                       new int[] { 1, 1, 1, 1, 1});
   }
+
+  public void testEmptyUserDict() throws Exception {
+    Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
+    UserDictionary emptyDict = UserDictionary.open(emptyReader);
+
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "これは本ではない",
+                     new String[]{"これ", "は", "本", "で", "は", "ない"},
+                     new int[]{0, 2, 3, 4, 5, 6},
+                     new int[]{2, 3, 4, 5, 6, 8}
+    );
+    analyzer.close();
+  }
 }
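
Note (not part of the patch above): the snippet below is a minimal caller-side sketch of the new UserDictionary.open(Reader) factory, assuming the Lucene trunk/5.x Kuromoji API touched by this change; the class name EmptyUserDictExample and the dictionary text are illustrative only. It demonstrates the behavior the patch relies on: open() returns null when the input contains no dictionary entries, and JapaneseTokenizer already accepts a null user dictionary, so an empty or comment-only user dictionary file no longer leads to an NPE.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;

public class EmptyUserDictExample {
  public static void main(String[] args) throws IOException {
    // A comment-only user dictionary: open() now returns null instead of
    // trying to build dictionary data from zero entries.
    UserDictionary dict = UserDictionary.open(new StringReader("# no entries\n"));

    // Passing the possibly-null dictionary straight through is safe, which is
    // what JapaneseTokenizerFactory does after this patch.
    JapaneseTokenizer tokenizer = new JapaneseTokenizer(dict, false, Mode.SEARCH);
    tokenizer.close();
  }
}

Returning null from a static factory, rather than throwing, keeps the existing "no user dictionary" code path in JapaneseTokenizerFactory working without any extra checks at the call site.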