mirror of https://github.com/apache/lucene.git
Fix for empty Kuromoji user dictionary NPE (LUCENE-6468)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678685 13f79535-47bb-0310-9956-ffa450edef68
parent 216fd8d46a
commit 9d49a76d01
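In short: building a UserDictionary from an input that contains no entries used to produce an object that later caused a NullPointerException inside JapaneseTokenizer. This commit replaces the public UserDictionary(Reader) constructor with a static factory, UserDictionary.open(Reader), which returns null when the input holds no entries; JapaneseTokenizer already treats a null user dictionary as "no user dictionary". Caller migration is mechanical; a minimal sketch against the post-commit API (the helper class is illustrative, not part of the commit):

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.ja.dict.UserDictionary;

    class LoadUserDict {
      // before: userDictionary = new UserDictionary(reader);
      // after:  the result may legitimately be null for empty input
      static UserDictionary load(Reader reader) throws IOException {
        return UserDictionary.open(reader);
      }
    }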
@@ -162,6 +162,9 @@ Bug Fixes
 * LUCENE-6427: Added assertion about the presence of ghost bits in
   (Fixed|Long)BitSet. (Luc Vanlerberghe via Adrien Grand)
 
+* LUCENE-6468: Fixed NPE with empty Kuromoji user dictionary.
+  (Jun Ohtani via Christian Moen)
+
 API Changes
 
 * LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader
@@ -91,7 +91,7 @@ public class JapaneseTokenizerFactory extends TokenizerFactory implements Resour
           .onMalformedInput(CodingErrorAction.REPORT)
           .onUnmappableCharacter(CodingErrorAction.REPORT);
       Reader reader = new InputStreamReader(stream, decoder);
-      userDictionary = new UserDictionary(reader);
+      userDictionary = UserDictionary.open(reader);
     } else {
       userDictionary = null;
     }
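With open() in place, both branches of the factory's dictionary loading converge on the same contract: userDictionary is either a populated dictionary or null, and JapaneseTokenizer accepts null. A self-contained usage sketch under that contract (the input text and Mode choice are illustrative; the constructor signature is the Lucene 5.x one):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer;
    import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
    import org.apache.lucene.analysis.ja.dict.UserDictionary;

    public class EmptyUserDictDemo {
      public static void main(String[] args) throws IOException {
        // A comment-only source: open() finds no entries and returns null.
        UserDictionary dict = UserDictionary.open(new StringReader("# empty\n"));
        // null is accepted: the tokenizer falls back to the system dictionary.
        Tokenizer tok = new JapaneseTokenizer(dict, false, Mode.SEARCH);
        tok.setReader(new StringReader("これは本ではない"));
        tok.reset();
        while (tok.incrementToken()) {
          // consume tokens produced by default segmentation
        }
        tok.end();
        tok.close();
      }
    }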
@@ -57,10 +57,10 @@ public final class UserDictionary implements Dictionary {
 
   public static final int RIGHT_ID = 5;
 
-  public UserDictionary(Reader reader) throws IOException {
+  public static UserDictionary open(Reader reader) throws IOException {
+
     BufferedReader br = new BufferedReader(reader);
     String line = null;
-    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     List<String[]> featureEntries = new ArrayList<>();
 
     // text, segmentation, readings, POS
@@ -76,6 +76,16 @@ public final class UserDictionary implements Dictionary {
       featureEntries.add(values);
     }
 
+    if (featureEntries.isEmpty()) {
+      return null;
+    } else {
+      return new UserDictionary(featureEntries);
+    }
+  }
+
+  private UserDictionary(List<String[]> featureEntries) throws IOException {
+
+    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     // TODO: should we allow multiple segmentations per input 'phrase'?
     // the old treemap didn't support this either, and i'm not sure if it's needed/useful?
 
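The two hunks above are the heart of the fix: parsing moves into a static factory that decides whether a dictionary exists at all, while the now-private constructor only ever sees a non-empty entry list. A stripped-down sketch of that shape, with the CSV feature handling and FST construction elided (class and field names here are illustrative, not the real ones):

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.Reader;
    import java.util.ArrayList;
    import java.util.List;

    public final class SimpleUserDict {
      private final List<String[]> entries;

      public static SimpleUserDict open(Reader reader) throws IOException {
        BufferedReader br = new BufferedReader(reader);
        List<String[]> featureEntries = new ArrayList<>();
        String line;
        while ((line = br.readLine()) != null) {
          line = line.replaceAll("#.*$", "").trim(); // drop comments
          if (line.isEmpty()) {
            continue; // skip blank and comment-only lines
          }
          featureEntries.add(line.split(","));
        }
        // The crux of LUCENE-6468: signal "no dictionary" with null instead
        // of constructing an object that downstream code cannot use safely.
        return featureEntries.isEmpty() ? null : new SimpleUserDict(featureEntries);
      }

      private SimpleUserDict(List<String[]> featureEntries) {
        this.entries = featureEntries; // real code builds an FST and id maps here
      }
    }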
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
+import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Random;
 
@@ -39,7 +40,8 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
 
-public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
+public class
+TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
   public static UserDictionary readDict() {
     InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
@@ -49,7 +51,7 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
     try {
       try {
         Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
-        return new UserDictionary(reader);
+        return UserDictionary.open(reader);
       } finally {
         is.close();
       }
@@ -686,4 +688,24 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
                      new int[] { 1, 1, 1, 1, 1},
                      new int[] { 1, 1, 1, 1, 1});
   }
+
+  public void testEmptyUserDict() throws Exception {
+    Reader emptyReader = new StringReader("\n# This is an empty user dictionary\n\n");
+    UserDictionary emptyDict = UserDictionary.open(emptyReader);
+
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), emptyDict, false, Mode.SEARCH);
+        return new TokenStreamComponents(tokenizer, tokenizer);
+      }
+    };
+
+    assertAnalyzesTo(analyzer, "これは本ではない",
+        new String[]{"これ", "は", "本", "で", "は", "ない"},
+        new int[]{0, 2, 3, 4, 5, 6},
+        new int[]{2, 3, 4, 5, 6, 8}
+    );
+    analyzer.close();
+  }
 }
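The new test exercises exactly this path: a comment-only input makes open() return null, and the analyzer built around that null dictionary must fall back to plain system-dictionary segmentation, splitting これは本ではない into これ / は / 本 / で / は / ない with the listed start and end offsets. Before this commit, the same setup hit the NullPointerException that LUCENE-6468 reports.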