mirror of https://github.com/apache/lucene.git
Fix for empty Kuromoji user dictionary NPE (LUCENE-6468)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678685 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 216fd8d46a
commit 9d49a76d01
lucene/CHANGES.txt
@@ -162,6 +162,9 @@ Bug Fixes
 * LUCENE-6427: Added assertion about the presence of ghost bits in
   (Fixed|Long)BitSet. (Luc Vanlerberghe via Adrien Grand)
 
+* LUCENE-6468: Fixed NPE with empty Kuromoji user dictionary.
+  (Jun Ohtani via Christian Moen)
+
 API Changes
 
 * LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader
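Here "empty" means a user dictionary whose content reduces to no entries, for example a file containing only comments and blank lines. A minimal sketch of the post-fix behavior (the wrapper class name is ours; the input string is the one used by the new test further below):

    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.ja.dict.UserDictionary;

    public class EmptyUserDictSketch {
      public static void main(String[] args) throws Exception {
        // Comments and blank lines parse to zero dictionary entries.
        Reader reader = new StringReader("\n# This is an empty user dictionary\n\n");
        // Before this fix, building a UserDictionary from such input led to an
        // NPE at tokenization time; open() now returns null instead.
        System.out.println(UserDictionary.open(reader) == null);  // prints true
      }
    }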
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java
@@ -91,7 +91,7 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware
           .onMalformedInput(CodingErrorAction.REPORT)
           .onUnmappableCharacter(CodingErrorAction.REPORT);
       Reader reader = new InputStreamReader(stream, decoder);
-      userDictionary = new UserDictionary(reader);
+      userDictionary = UserDictionary.open(reader);
     } else {
       userDictionary = null;
     }
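With this change the factory tolerates a user dictionary file that parses to zero entries: UserDictionary.open() returns null, leaving the field in the same state as when no dictionary is configured at all. A usage sketch, not part of this commit, assuming the kuromoji module is on the classpath, that "japanese" is the factory's SPI name, and that userdict.txt is a hypothetical classpath resource:

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.custom.CustomAnalyzer;

    public class JapaneseAnalyzerSketch {
      public static Analyzer build() throws IOException {
        // "userDictionary" and "mode" are the factory's parameters; if the
        // referenced file holds only comments and blank lines, the tokenizer
        // now runs without user entries instead of hitting an NPE.
        return CustomAnalyzer.builder()
            .withTokenizer("japanese",
                "userDictionary", "userdict.txt",
                "mode", "search")
            .build();
      }
    }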
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -56,18 +56,18 @@ public final class UserDictionary implements Dictionary {
   public static final int LEFT_ID = 5;
 
   public static final int RIGHT_ID = 5;
 
-  public UserDictionary(Reader reader) throws IOException {
+  public static UserDictionary open(Reader reader) throws IOException {
+
     BufferedReader br = new BufferedReader(reader);
     String line = null;
-    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     List<String[]> featureEntries = new ArrayList<>();
 
     // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
       // Remove comments
       line = line.replaceAll("#.*$", "");
 
       // Skip empty lines or comment lines
       if (line.trim().length() == 0) {
         continue;
@@ -75,7 +75,17 @@ public final class UserDictionary implements Dictionary {
       String[] values = CSVUtil.parse(line);
       featureEntries.add(values);
     }
-    
+
+    if (featureEntries.isEmpty()) {
+      return null;
+    } else {
+      return new UserDictionary(featureEntries);
+    }
+  }
+
+  private UserDictionary(List<String[]> featureEntries) throws IOException {
+
+    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     // TODO: should we allow multiple segmentations per input 'phrase'?
     // the old treemap didn't support this either, and i'm not sure if it's needed/useful?
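The shape of the fix: the public constructor becomes a static factory that parses first, constructs a UserDictionary only when at least one entry was read, and returns null otherwise; the now-private constructor holds the build logic, including the wordId counter that moved out of the parse loop. A minimal caller sketch, not from this commit (the wrapper class and method names are ours); JapaneseTokenizer already accepts a null user dictionary:

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
    import org.apache.lucene.analysis.ja.dict.UserDictionary;

    public class UserDictCallerSketch {
      public static Tokenizer create(Reader userDictReader) throws IOException {
        // open() replaces `new UserDictionary(reader)` and may return null
        // when the input contains no dictionary entries.
        UserDictionary dict = UserDictionary.open(userDictReader);
        return new JapaneseTokenizer(dict, false, Mode.SEARCH);
      }
    }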
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Random;
@@ -39,7 +40,8 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
 
-public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
+public class
+TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
   public static UserDictionary readDict() {
     InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
@@ -49,7 +51,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
     try {
       try {
         Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
-        return new UserDictionary(reader);
+        return UserDictionary.open(reader);
       } finally {
         is.close();
       }
@@ -686,4 +688,24 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
         new int[] { 1, 1, 1, 1, 1},
         new int[] { 1, 1, 1, 1, 1});
   }
+
+  public void testEmptyUserDict() throws Exception {
+    Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
+    UserDictionary emptyDict = UserDictionary.open(emptyReader);
+
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "これは本ではない",
+        new String[]{"これ", "は", "本", "で", "は", "ない"},
+        new int[]{0, 2, 3, 4, 5, 6},
+        new int[]{2, 3, 4, 5, 6, 8}
+    );
+    analyzer.close();
+  }
 }
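The new test drives the whole analysis chain with a dictionary that reduces to nothing and checks that plain segmentation still works. A more direct companion assertion, sketched in the test's own style but not part of this commit, would pin down the contract of open() itself:

    public void testOpenReturnsNullOnEmptyInput() throws Exception {
      // Hypothetical extra test: open() returns null when the input yields
      // no entries, which is exactly what the tokenizer path relies on.
      assertNull(UserDictionary.open(new StringReader("\n# This is an empty user dictionary\n\n")));
    }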