diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fe22dabb62e..a7016627cd7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -104,6 +104,8 @@ Other * GITHUB#11960: Hunspell: supported empty dictionaries (Peter Gromov) +* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov) + ======================== Lucene 9.6.0 ======================= API Changes diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index dafb1494ea5..82d58080f9b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -98,7 +98,7 @@ class GeneratingSuggester { sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length); boolean overflow = roots.size() == MAX_ROOTS; - if (overflow && sc <= roots.peek().score) { + if (overflow && isWorseThan(sc, rootChars, roots.peek())) { return; } @@ -119,6 +119,11 @@ class GeneratingSuggester { return roots.stream().sorted().collect(Collectors.toList()); } + private static boolean isWorseThan(int score, CharsRef candidate, Weighted<Root<String>> root) { + return score < root.score + || score == root.score && CharSequence.compare(candidate, root.word.word) > 0; + } + private void processSuggestibleWords( int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) { if (entryCache != null) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java index bdffa5a3dad..d8020a3ee9f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java +++ 
b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java @@ -65,13 +65,15 @@ public abstract class StemmerTestBase extends LuceneTestCase { } } + return loadDictionary(ignoreCase, affixStream, dictStreams); + } + + protected static Dictionary loadDictionary( + boolean ignoreCase, InputStream affixStream, InputStream... dictStreams) + throws IOException, ParseException { try { - return new Dictionary( - new ByteBuffersDirectory(), - "dictionary", - affixStream, - Arrays.asList(dictStreams), - ignoreCase); + ByteBuffersDirectory dir = new ByteBuffersDirectory(); + return new Dictionary(dir, "dictionary", affixStream, Arrays.asList(dictStreams), ignoreCase); } finally { IOUtils.closeWhileHandlingException(affixStream); IOUtils.closeWhileHandlingException(dictStreams); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java index 8081b534fdb..f7170bc80e2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java @@ -21,8 +21,11 @@ import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT; import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT; import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.text.ParseException; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -235,4 +238,36 @@ public class TestHunspell extends LuceneTestCase { private void checkCompression(Hunspell h, String expected, String... 
words) { assertEquals(expected, h.compress(List.of(words)).internalsToString()); } + + @Test + public void testSuggestionOrderStabilityOnDictionaryEditing() throws IOException, ParseException { + String original = "some_word"; + + List<String> words = new ArrayList<>(); + for (char c = 0; c < 65535; c++) { + if (Character.isLetter(c)) { + words.add(original + c); + } + } + + String smallDict = "1\n" + String.join("\n", words.subList(0, words.size() / 4)); + String largerDict = "1\n" + String.join("\n", words); + Dictionary small = + loadDictionary( + false, + new ByteArrayInputStream(new byte[0]), + new ByteArrayInputStream(smallDict.getBytes(StandardCharsets.UTF_8))); + Dictionary larger = + loadDictionary( + false, + new ByteArrayInputStream(new byte[0]), + new ByteArrayInputStream(largerDict.getBytes(StandardCharsets.UTF_8))); + + assertFalse(new Hunspell(small).spell(original)); + + List<String> smallSug = new Hunspell(small).suggest(original); + List<String> largerSug = new Hunspell(larger).suggest(original); + assertEquals(smallSug.toString(), 4, smallSug.size()); + assertEquals(smallSug, largerSug); + } }