Fix for empty Kuromoji user dictionary NPE (LUCENE-6468)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678685 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christian Moen 2015-05-11 08:03:56 +00:00
parent 216fd8d46a
commit 9d49a76d01
4 changed files with 44 additions and 9 deletions

View File

@ -162,6 +162,9 @@ Bug Fixes
* LUCENE-6427: Added assertion about the presence of ghost bits in
(Fixed|Long)BitSet. (Luc Vanlerberghe via Adrien Grand)
* LUCENE-6468: Fixed NPE with empty Kuromoji user dictionary.
(Jun Ohtani via Christian Moen)
API Changes
* LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader

View File

@ -91,7 +91,7 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
Reader reader = new InputStreamReader(stream, decoder);
userDictionary = new UserDictionary(reader);
userDictionary = UserDictionary.open(reader);
} else {
userDictionary = null;
}

View File

@ -56,18 +56,18 @@ public final class UserDictionary implements Dictionary {
public static final int LEFT_ID = 5;
public static final int RIGHT_ID = 5;
public UserDictionary(Reader reader) throws IOException {
public static UserDictionary open(Reader reader) throws IOException {
BufferedReader br = new BufferedReader(reader);
String line = null;
int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
List<String[]> featureEntries = new ArrayList<>();
// text, segmentation, readings, POS
while ((line = br.readLine()) != null) {
// Remove comments
line = line.replaceAll("#.*$", "");
// Skip empty lines or comment lines
if (line.trim().length() == 0) {
continue;
@ -75,7 +75,17 @@ public final class UserDictionary implements Dictionary {
String[] values = CSVUtil.parse(line);
featureEntries.add(values);
}
if (featureEntries.isEmpty()) {
return null;
} else {
return new UserDictionary(featureEntries);
}
}
private UserDictionary(List<String[]> featureEntries) throws IOException {
int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
// TODO: should we allow multiple segmentations per input 'phrase'?
// the old treemap didn't support this either, and i'm not sure if it's needed/useful?

View File

@ -22,6 +22,7 @@ import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Random;
@ -39,7 +40,8 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
public class
TestJapaneseTokenizer extends BaseTokenStreamTestCase {
public static UserDictionary readDict() {
InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
@ -49,7 +51,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
try {
try {
Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
return new UserDictionary(reader);
return UserDictionary.open(reader);
} finally {
is.close();
}
@ -686,4 +688,24 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
new int[] { 1, 1, 1, 1, 1},
new int[] { 1, 1, 1, 1, 1});
}
/**
 * Regression test for LUCENE-6468: tokenizing with a user dictionary that
 * contains no entries must not throw a NullPointerException.
 *
 * <p>The dictionary source consists solely of blank lines and a comment line,
 * so no entries survive parsing. The tokenizer is then expected to segment the
 * input exactly as it would without any user dictionary.
 *
 * @throws Exception if reading the dictionary or analyzing the text fails
 */
public void testEmptyUserDict() throws Exception {
  // Blank lines plus a '#' comment only: nothing remains once comments are stripped.
  Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
  final UserDictionary emptyDict = UserDictionary.open(emptyReader);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  try {
    // Each expected term is the slice of the input delimited by the matching
    // start/end offsets below: [0,2) [2,3) [3,4) [4,5) [5,6) [6,8).
    assertAnalyzesTo(analyzer, "これは本ではない",
        new String[]{"これ", "は", "本", "で", "は", "ない"},
        new int[]{0, 2, 3, 4, 5, 6},
        new int[]{2, 3, 4, 5, 6, 8}
    );
  } finally {
    // Release the analyzer even when the assertion above fails.
    analyzer.close();
  }
}
}