diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java index ad8c1e3a772..00767e0f8dd 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java @@ -23,6 +23,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; import org.apache.lucene.analysis.morph.Dictionary; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; @@ -131,10 +132,12 @@ public final class UserDictionary implements Dictionary { lastToken = token; ord++; } + if (entryIndex < rightIds.length) { + rightIds = ArrayUtil.copyOfSubArray(rightIds, 0, entryIndex); + } this.fst = new TokenInfoFST(FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader())); int[][] segmentations = _segmentations.toArray(new int[_segmentations.size()][]); - assert entryIndex == rightIds.length; this.morphAtts = new UserMorphData(segmentations, rightIds); } diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java index 9511ff1b1da..6ce60a53e3f 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; +import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.util.Random; import org.apache.lucene.analysis.Analyzer; @@ -559,6 +560,22 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase { new 
POS.Tag[] {POS.Tag.SL}); } + /** Opens a {@code UserDictionary} from rule text that contains repeated lines and asserts that {@code getRightId(3)} is non-zero while {@code getRightId(4)} throws {@code ArrayIndexOutOfBoundsException}; the same two assertions are run against two rule strings. */ public void testDuplicate() throws IOException { + /* first rule set: five lines, one of which is repeated back to back */ String s = "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시"; + try (Reader rulesReader = new StringReader(s)) { + var dict = UserDictionary.open(rulesReader); + assertTrue(dict.getRightId(3) != 0); + assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4)); + } + + /* second rule set: the first rule set with its last line repeated as well; the expectations are identical */ String dupdup = "c++\nC쁠쁠\n세종\n세종\n세종시 세종 시\n세종시 세종 시"; + try (Reader rulesReader = new StringReader(dupdup)) { + var dict = UserDictionary.open(rulesReader); + assertTrue(dict.getRightId(3) != 0); + assertThrows(ArrayIndexOutOfBoundsException.class, () -> dict.getRightId(4)); + } + } + private void assertReadings(Analyzer analyzer, String input, String... readings) throws IOException { try (TokenStream ts = analyzer.tokenStream("ignored", input)) {