From c61b458719909ab9a1e395d3985f7f9cd50f3390 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Tue, 23 Feb 2021 12:58:22 +0100 Subject: [PATCH] LUCENE-9804: Hunspell: fix most similar dictionary entry search by reversing the comparator (#2419) --- .../hunspell/GeneratingSuggester.java | 4 +- .../analysis/hunspell/TestSpellChecking.java | 4 + .../apache/lucene/analysis/hunspell/ngram.aff | 0 .../apache/lucene/analysis/hunspell/ngram.dic | 202 ++++++++++++++++++ .../apache/lucene/analysis/hunspell/ngram.sug | 1 + .../lucene/analysis/hunspell/ngram.wrong | 1 + 6 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.sug create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.wrong diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index 500ae155cee..c1bbdf9e555 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -22,6 +22,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_STRIP_ORD; import java.io.IOException; import java.util.ArrayList; +import java.util.Comparator; import java.util.EnumSet; import java.util.LinkedHashSet; import java.util.List; @@ -60,7 +61,8 @@ class GeneratingSuggester { private List<Weighted<Root<String>>> findSimilarDictionaryEntries( String word, WordCase originalCase) { - PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(); + Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder(); + PriorityQueue<Weighted<Root<String>>> roots = new 
PriorityQueue<>(natural.reversed()); processFST( dictionary.words, (key, forms) -> { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java index 77375315911..c1312370a74 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java @@ -201,6 +201,10 @@ public class TestSpellChecking extends LuceneTestCase { doTest("sug2"); } + public void testGeneratedSuggestions() throws Exception { + doTest("ngram"); + } + public void testMaxNGramSugsDefaultIsNotUnlimited() throws Exception { doTest("maxNGramSugsDefault"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.aff new file mode 100644 index 00000000000..e69de29bb2d diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.dic new file mode 100644 index 00000000000..69f15a87724 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.dic @@ -0,0 +1,202 @@ +100 +A +AA +AAA +AB +ABA +ABC +ABM +ABS +AC +ACLU +ACT +ACTH +AD +ADC +ADD +ADM +ADP +AF +AFAIK +AFB +AFC +AFDC +AFN +AFT +AI +AIDS +AK +AL +AM +AMA +AMD +ANSI +ANZUS +AOL +AP +APB +APC +API +APO +APR +AR +ARC +ASAP +ASCII +ASL +ASPCA +ATM +ATP +ATV +AV +AVI +AWACS +AWOL +AWS +AZ +AZT +Aachen +Aaliyah +Aaron +Abbas +Abbasid +Abbott +Abby +Abdul +Abe +Abel +Abelard +Abelson +Aberdeen +Abernathy +Abidjan +Abigail +Abilene +Abner +Aborigine +Abraham +Abram +Abrams +Absalom +Abuja +Abyssinia +Abyssinian +Ac +Acadia +Acapulco +Accenture +Accra +Acevedo +Achaean +Achebe +Achernar +Acheson +Achilles +Aconcagua +Acosta +Acropolis +Acrux 
+Actaeon +Acton +Acts +Acuff +thermostat +squeaker +Theron +heather +taker +Thermos +thinker +theorist +theorize +theatrics +therapeutic +lawbreaker +Northeast +weather +tiebreaker +their +therm +there +therefor +theta +theoretic +thereunder +Theiler +therapist +thematic +therewith +icebreaker +Thespian +sneaker +theater +breaker +speaker +Heather +Whitaker +toolmaker +Dorothea +Thermopylae +thereto +theocracy +thereby +ethereal +theremin +caretaker +thereat +Theravada +threadlike +therein +thereafter +thereupon +streaker +thereof +they're +thereon +jawbreaker +shoemaker +shaker +threader +Thackeray +thermionic +heartbreak +therapy +thesauri +feathery +theatricals +takeover +leather +thespian +thunderhead +Katheryn +thereunto +thereabout +feather +Shaker +therefrom +Thea +leathery +beaker +therefore +thesaurus +homemaker +theory +theorem +theocratic +Therese +Theresa +Theodore +Theodora +Theodoric +teakettle +Thatcher +theatergoer +Katherine +watchmaker +theatrical +haymaker +breather +thither +thwacker +thermal +thermos \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.sug new file mode 100644 index 00000000000..4e7b10c6d71 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.sug @@ -0,0 +1 @@ +Theater, Heather, Thereat \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.wrong new file mode 100644 index 00000000000..2ce830e2e02 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/ngram.wrong @@ -0,0 +1 @@ +Theaker \ No newline at end of file