mirror of https://github.com/apache/lucene.git
Fix duplicate values in org.apache.lucene.analysis.ko.dict.UserDictionary (#13427)
Remove the incorrect assertion in org.apache.lucene.analysis.ko.dict.UserDictionary and replace it with an array copy that trims the rightIds array to the number of entries actually added when duplicate values are passed.
This commit is contained in:
parent
944624b644
commit
80304802a5
|
@ -23,6 +23,7 @@ import java.util.ArrayList;
|
|||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.analysis.morph.Dictionary;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.FSTCompiler;
|
||||
|
@ -131,10 +132,12 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
|
|||
lastToken = token;
|
||||
ord++;
|
||||
}
|
||||
if (entryIndex < rightIds.length) {
|
||||
rightIds = ArrayUtil.copyOfSubArray(rightIds, 0, entryIndex);
|
||||
}
|
||||
this.fst =
|
||||
new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
|
||||
int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]);
|
||||
assert entryIndex == rightIds.length;
|
||||
this.morphAtts = new UserMorphData(segmentations, rightIds);
|
||||
}
|
||||
|
||||
|
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
@ -559,6 +560,22 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
|
|||
new POS.Tag[] {POS.Tag.SL});
|
||||
}
|
||||
|
||||
public void testDuplicate() throws IOException {
|
||||
String s = "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시";
|
||||
try (Reader rulesReader = new StringReader(s)) {
|
||||
var dict = UserDictionary.open(rulesReader);
|
||||
assertTrue(dict.getRightId(3) != 0);
|
||||
assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4));
|
||||
}
|
||||
|
||||
String dupdup = "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시\n세종시 세종 시";
|
||||
try (Reader rulesReader = new StringReader(dupdup)) {
|
||||
var dict = UserDictionary.open(rulesReader);
|
||||
assertTrue(dict.getRightId(3) != 0);
|
||||
assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4));
|
||||
}
|
||||
}
|
||||
|
||||
private void assertReadings(Analyzer analyzer, String input, String... readings)
|
||||
throws IOException {
|
||||
try (TokenStream ts = analyzer.tokenStream("ignored", input)) {
|
||||
|
|
Loading…
Reference in New Issue