mirror of https://github.com/apache/lucene.git
Fix duplicate values in org.apache.lucene.analysis.ko.dict.UserDictionary (#13427)
Remove the incorrect assertion (`entryIndex == rightIds.length`) in org.apache.lucene.analysis.ko.dict.UserDictionary and replace it with an array copy that trims `rightIds` to `entryIndex` entries when duplicate values are passed.
This commit is contained in:
parent
944624b644
commit
80304802a5
|
@ -23,6 +23,7 @@ import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.apache.lucene.analysis.morph.Dictionary;
|
import org.apache.lucene.analysis.morph.Dictionary;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FSTCompiler;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
|
@ -131,10 +132,12 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
||||||
lastToken = token;
|
lastToken = token;
|
||||||
ord++;
|
ord++;
|
||||||
}
|
}
|
||||||
|
if (entryIndex < rightIds.length) {
|
||||||
|
rightIds = ArrayUtil.copyOfSubArray(rightIds, 0, entryIndex);
|
||||||
|
}
|
||||||
this.fst =
|
this.fst =
|
||||||
new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
|
new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
|
||||||
int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]);
|
int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]);
|
||||||
assert entryIndex == rightIds.length;
|
|
||||||
this.morphAtts = new UserMorphData(segmentations, rightIds);
|
this.morphAtts = new UserMorphData(segmentations, rightIds);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
@ -559,6 +560,22 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
||||||
new POS.Tag[] {POS.Tag.SL});
|
new POS.Tag[] {POS.Tag.SL});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testDuplicate() throws IOException {
|
||||||
|
String s = "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시";
|
||||||
|
try (Reader rulesReader = new StringReader(s)) {
|
||||||
|
var dict = UserDictionary.open(rulesReader);
|
||||||
|
assertTrue(dict.getRightId(3) != 0);
|
||||||
|
assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4));
|
||||||
|
}
|
||||||
|
|
||||||
|
String dupdup = "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시\n세종시 세종 시";
|
||||||
|
try (Reader rulesReader = new StringReader(dupdup)) {
|
||||||
|
var dict = UserDictionary.open(rulesReader);
|
||||||
|
assertTrue(dict.getRightId(3) != 0);
|
||||||
|
assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void assertReadings(Analyzer analyzer, String input, String... readings)
|
private void assertReadings(Analyzer analyzer, String input, String... readings)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||||
|
|
Loading…
Reference in New Issue