Fix duplicate values in org.apache.lucene.analysis.ko.dict.UserDictionary (#13427)

Remove the incorrect assertion in org.apache.lucene.analysis.ko.dict.UserDictionary and, when duplicate values are passed, trim the array with an array copy instead.
This commit is contained in:
Chris Hegarty 2024-05-27 17:53:01 +01:00 committed by GitHub
parent 944624b644
commit 80304802a5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 21 additions and 1 deletions

View File

@ -23,6 +23,7 @@ import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import org.apache.lucene.analysis.morph.Dictionary; import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.FSTCompiler;
@ -131,10 +132,12 @@ public final class UserDictionary implements Dictionary<UserMorphData> {
lastToken = token; lastToken = token;
ord++; ord++;
} }
if (entryIndex < rightIds.length) {
rightIds = ArrayUtil.copyOfSubArray(rightIds, 0, entryIndex);
}
this.fst = this.fst =
new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader())); new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()));
int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]); int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]);
assert entryIndex == rightIds.length;
this.morphAtts = new UserMorphData(segmentations, rightIds); this.morphAtts = new UserMorphData(segmentations, rightIds);
} }

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Random; import java.util.Random;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
@ -559,6 +560,22 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
new POS.Tag[] {POS.Tag.SL}); new POS.Tag[] {POS.Tag.SL});
} }
/**
 * Opens a {@link UserDictionary} from rule sets that contain repeated lines and checks that
 * index 3 yields a non-zero right id while index 4 is out of bounds.
 */
public void testDuplicate() throws IOException {
  // First rule set repeats one line; the second additionally repeats the last line.
  String[] ruleSets = {
    "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시",
    "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시\n세종시 세종 시"
  };
  for (String rules : ruleSets) {
    try (Reader rulesReader = new StringReader(rules)) {
      var dict = UserDictionary.open(rulesReader);
      // Last expected valid index.
      int lastRightId = dict.getRightId(3);
      assertTrue(lastRightId != 0);
      // One past the last expected index must not be addressable.
      assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4));
    }
  }
}
private void assertReadings(Analyzer analyzer, String input, String... readings) private void assertReadings(Analyzer analyzer, String input, String... readings)
throws IOException { throws IOException {
try (TokenStream ts = analyzer.tokenStream("ignored", input)) { try (TokenStream ts = analyzer.tokenStream("ignored", input)) {