mirror of https://github.com/apache/lucene.git
Hunspell: reduce suggestion set dependency on the hash table order (#12239)
* Hunspell: reduce suggestion set dependency on the hash table order. When adding words to a dictionary, suggestions for other words shouldn't change unless they're directly related to the added words. Previously, GeneratingSuggester kept the first 100 best matches it encountered in the hash table, whose iteration order can change significantly after adding any unrelated word. That resulted in unexpected suggestion changes on seemingly unrelated dictionary edits.
This commit is contained in:
parent
2e7426961b
commit
025dfec2dd
|
@ -104,6 +104,8 @@ Other
|
|||
|
||||
* GITHUB#11960: Hunspell: supported empty dictionaries (Peter Gromov)
|
||||
|
||||
* GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov)
|
||||
|
||||
======================== Lucene 9.6.0 =======================
|
||||
|
||||
API Changes
|
||||
|
|
|
@ -98,7 +98,7 @@ class GeneratingSuggester {
|
|||
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
|
||||
|
||||
boolean overflow = roots.size() == MAX_ROOTS;
|
||||
if (overflow && sc <= roots.peek().score) {
|
||||
if (overflow && isWorseThan(sc, rootChars, roots.peek())) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -119,6 +119,11 @@ class GeneratingSuggester {
|
|||
return roots.stream().sorted().collect(Collectors.toList());
|
||||
}
|
||||
|
||||
/**
 * Tells whether a candidate with the given score ranks below {@code root}.
 *
 * <p>A candidate is worse when its score is strictly lower, or when the scores are equal and the
 * candidate's characters compare lexicographically greater than the root's word. Per the commit
 * description, the tie-break is meant to keep the retained roots from depending on the order in
 * which dictionary entries are visited.
 *
 * @param score score computed for the candidate
 * @param candidate characters of the candidate root
 * @param root an already collected root to compare against
 * @return {@code true} if the candidate ranks below {@code root}
 */
private static boolean isWorseThan(int score, CharsRef candidate, Weighted<Root<String>> root) {
|
||||
return score < root.score
|
||||
// Equal scores: fall back to comparing the words themselves.
|| score == root.score && CharSequence.compare(candidate, root.word.word) > 0;
|
||||
}
|
||||
|
||||
private void processSuggestibleWords(
|
||||
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
|
||||
if (entryCache != null) {
|
||||
|
|
|
@ -65,13 +65,15 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
return loadDictionary(ignoreCase, affixStream, dictStreams);
|
||||
}
|
||||
|
||||
protected static Dictionary loadDictionary(
|
||||
boolean ignoreCase, InputStream affixStream, InputStream... dictStreams)
|
||||
throws IOException, ParseException {
|
||||
try {
|
||||
return new Dictionary(
|
||||
new ByteBuffersDirectory(),
|
||||
"dictionary",
|
||||
affixStream,
|
||||
Arrays.asList(dictStreams),
|
||||
ignoreCase);
|
||||
ByteBuffersDirectory dir = new ByteBuffersDirectory();
|
||||
return new Dictionary(dir, "dictionary", affixStream, Arrays.asList(dictStreams), ignoreCase);
|
||||
} finally {
|
||||
IOUtils.closeWhileHandlingException(affixStream);
|
||||
IOUtils.closeWhileHandlingException(dictStreams);
|
||||
|
|
|
@ -21,8 +21,11 @@ import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
|
|||
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
|
||||
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -235,4 +238,36 @@ public class TestHunspell extends LuceneTestCase {
|
|||
/**
 * Asserts that compressing {@code words} with {@code h} produces a result whose {@code
 * internalsToString()} equals {@code expected}.
 */
private void checkCompression(Hunspell h, String expected, String... words) {
|
||||
assertEquals(expected, h.compress(List.of(words)).internalsToString());
|
||||
}
|
||||
|
||||
/**
 * Checks that suggestions for a misspelled word stay the same when more entries are added to the
 * dictionary: the suggestions produced from a dictionary holding the first quarter of the
 * generated words must equal those produced from a dictionary holding all of them.
 */
@Test
|
||||
public void testSuggestionOrderStabilityOnDictionaryEditing() throws IOException, ParseException {
|
||||
String original = "some_word";
|
||||
|
||||
List<String> words = new ArrayList<>();
|
||||
// One entry per letter character: the misspelled word with that single letter appended.
// The bound is exclusive because a char can never exceed 65535, so "<=" would loop forever.
for (char c = 0; c < 65535; c++) {
|
||||
if (Character.isLetter(c)) {
|
||||
words.add(original + c);
|
||||
}
|
||||
}
|
||||
|
||||
// Both dictionary texts start with a "1" line (presumably the .dic entry-count header; confirm),
// followed by one word per line. The small one holds only the first quarter of the words.
String smallDict = "1\n" + String.join("\n", words.subList(0, words.size() / 4));
|
||||
String largerDict = "1\n" + String.join("\n", words);
|
||||
// Each dictionary is loaded with an empty affix stream and a single dictionary stream.
Dictionary small =
|
||||
loadDictionary(
|
||||
false,
|
||||
new ByteArrayInputStream(new byte[0]),
|
||||
new ByteArrayInputStream(smallDict.getBytes(StandardCharsets.UTF_8)));
|
||||
Dictionary larger =
|
||||
loadDictionary(
|
||||
false,
|
||||
new ByteArrayInputStream(new byte[0]),
|
||||
new ByteArrayInputStream(largerDict.getBytes(StandardCharsets.UTF_8)));
|
||||
|
||||
// The bare word itself must be rejected by the speller, so suggest() is asked about a misspelling.
assertFalse(new Hunspell(small).spell(original));
|
||||
|
||||
List<String> smallSug = new Hunspell(small).suggest(original);
|
||||
List<String> largerSug = new Hunspell(larger).suggest(original);
|
||||
// Exactly four suggestions are expected, and adding the remaining words must not change them.
assertEquals(smallSug.toString(), 4, smallSug.size());
|
||||
assertEquals(smallSug, largerSug);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue