Hunspell: reduce suggestion set dependency on the hash table order (#12239)

* Hunspell: reduce suggestion set dependency on the hash table order

When adding words to a dictionary, suggestions for other words shouldn't change unless they're directly related to the added words.
Previously, GeneratingSuggester kept the first 100 best-scoring matches it encountered while iterating over the hash table, and the order of that iteration can change significantly after any unrelated word is added.
That resulted in unexpected suggestion changes on seemingly unrelated dictionary edits.
This commit is contained in:
Peter Gromov 2023-04-23 16:51:17 +02:00 committed by GitHub
parent 2e7426961b
commit 025dfec2dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 51 additions and 7 deletions

View File

@ -104,6 +104,8 @@ Other
* GITHUB#11960: Hunspell: supported empty dictionaries (Peter Gromov)
* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov)
======================== Lucene 9.6.0 =======================
API Changes

View File

@ -98,7 +98,7 @@ class GeneratingSuggester {
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
boolean overflow = roots.size() == MAX_ROOTS;
if (overflow && sc <= roots.peek().score) {
if (overflow && isWorseThan(sc, rootChars, roots.peek())) {
return;
}
@ -119,6 +119,11 @@ class GeneratingSuggester {
return roots.stream().sorted().collect(Collectors.toList());
}
/**
 * Decides whether a candidate should lose against an already collected root.
 *
 * <p>A lower score always loses. On equal scores the words themselves are compared with {@link
 * CharSequence#compare}, and the candidate loses if it sorts after the root's word, so ties are
 * resolved by the word text rather than by the order in which candidates were encountered.
 *
 * @param score the score computed for the candidate
 * @param candidate the characters of the candidate word
 * @param root the collected root to compare against
 * @return {@code true} if the candidate ranks strictly below {@code root}
 */
private static boolean isWorseThan(int score, CharsRef candidate, Weighted<Root<String>> root) {
  if (score != root.score) {
    return score < root.score;
  }
  // Same score: fall back to a lexicographic comparison of the words.
  return CharSequence.compare(candidate, root.word.word) > 0;
}
private void processSuggestibleWords(
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
if (entryCache != null) {

View File

@ -65,13 +65,15 @@ public abstract class StemmerTestBase extends LuceneTestCase {
}
}
return loadDictionary(ignoreCase, affixStream, dictStreams);
}
protected static Dictionary loadDictionary(
boolean ignoreCase, InputStream affixStream, InputStream... dictStreams)
throws IOException, ParseException {
try {
return new Dictionary(
new ByteBuffersDirectory(),
"dictionary",
affixStream,
Arrays.asList(dictStreams),
ignoreCase);
ByteBuffersDirectory dir = new ByteBuffersDirectory();
return new Dictionary(dir, "dictionary", affixStream, Arrays.asList(dictStreams), ignoreCase);
} finally {
IOUtils.closeWhileHandlingException(affixStream);
IOUtils.closeWhileHandlingException(dictStreams);

View File

@ -21,8 +21,11 @@ import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@ -235,4 +238,36 @@ public class TestHunspell extends LuceneTestCase {
/**
 * Compresses the given words with {@code h} and asserts that the string form of the result's
 * internals equals {@code expected}.
 */
private void checkCompression(Hunspell h, String expected, String... words) {
  String actual = h.compress(List.of(words)).internalsToString();
  assertEquals(expected, actual);
}
/**
 * Checks that suggestions for a word stay the same when more entries are added to the
 * dictionary.
 *
 * <p>The entries are {@code "some_word"} followed by every {@code char} below 65535 that {@link
 * Character#isLetter(char)} accepts. One dictionary is built from the first quarter of those
 * entries and another from all of them; both must produce the same suggestion list for {@code
 * "some_word"}, and the smaller one must produce exactly four suggestions.
 */
@Test
public void testSuggestionOrderStabilityOnDictionaryEditing() throws IOException, ParseException {
  String original = "some_word";

  // Collect one dictionary entry per letter character.
  List<String> entries = new ArrayList<>();
  char c = 0;
  while (c < 65535) {
    if (Character.isLetter(c)) {
      entries.add(original + c);
    }
    c++;
  }

  // The leading "1" line is the dictionary header line.
  List<String> firstQuarter = entries.subList(0, entries.size() / 4);
  byte[] smallBytes = ("1\n" + String.join("\n", firstQuarter)).getBytes(StandardCharsets.UTF_8);
  byte[] largerBytes = ("1\n" + String.join("\n", entries)).getBytes(StandardCharsets.UTF_8);

  // Both dictionaries use an empty affix stream.
  Dictionary small =
      loadDictionary(
          false, new ByteArrayInputStream(new byte[0]), new ByteArrayInputStream(smallBytes));
  Dictionary larger =
      loadDictionary(
          false, new ByteArrayInputStream(new byte[0]), new ByteArrayInputStream(largerBytes));

  // The bare word is not an entry itself, so it must be reported as misspelled.
  assertFalse(new Hunspell(small).spell(original));

  List<String> smallSug = new Hunspell(small).suggest(original);
  List<String> largerSug = new Hunspell(larger).suggest(original);
  assertEquals(smallSug.toString(), 4, smallSug.size());
  assertEquals(smallSug, largerSug);
}
}