From 4ba78f2ab25442c026623b62dc27c60347fd99d6 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 29 Jan 2021 08:24:23 +0100 Subject: [PATCH] LUCENE-9706: Hunspell: support NEEDAFFIX flag on affixes (#2262) --- .../lucene/analysis/hunspell/Dictionary.java | 2 +- .../lucene/analysis/hunspell/Stemmer.java | 17 +++++++++++++++-- .../analysis/hunspell/SpellCheckerTest.java | 5 +++++ .../lucene/analysis/hunspell/needaffix5.aff | 13 +++++++++++++ .../lucene/analysis/hunspell/needaffix5.dic | 2 ++ .../lucene/analysis/hunspell/needaffix5.good | 11 +++++++++++ .../lucene/analysis/hunspell/needaffix5.wrong | 3 +++ 7 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.good create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.wrong diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index d9473a9c681..6d7638b3208 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -76,7 +76,7 @@ public class Dictionary { static final char[] NOFLAGS = new char[0]; - static final int FLAG_UNSET = 0; + static final char FLAG_UNSET = (char) 0; private static final int DEFAULT_FLAGS = 65510; private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 6b6fb80503a..572473c2ab7 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -191,7 +191,7 @@ final class Stemmer { length, context, -1, - (char) 0, + Dictionary.FLAG_UNSET, -1, 0, true, @@ -361,6 +361,7 @@ final class Stemmer { pureAffix ? length - i : strippedWord.length, context, prefix, + previous, -1, recursionDepth, true, @@ -413,6 +414,7 @@ final class Stemmer { pureAffix ? i : strippedWord.length, context, suffix, + previous, prefixId, recursionDepth, false, @@ -543,6 +545,7 @@ final class Stemmer { int length, WordContext context, int affix, + int previousAffix, int prefixId, int recursionDepth, boolean prefix, @@ -553,7 +556,8 @@ final class Stemmer { List stems = new ArrayList<>(); - IntsRef forms = dictionary.lookupWord(strippedWord, offset, length); + boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix); + IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length); if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch); @@ -651,6 +655,15 @@ final class Stemmer { return stems; } + private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix) { + if (isFlagAppendedByAffix(affix, dictionary.needaffix)) { + return !isSuffix + || previousAffix < 0 + || isFlagAppendedByAffix(previousAffix, dictionary.needaffix); + } + return false; + } + private boolean isFlagAppendedByAffix(int affixId, char flag) { if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false; int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java index 30ceb582988..dbfbbec08c8 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java @@ -46,6 +46,11 @@ public class SpellCheckerTest extends StemmerTestBase { doTest("i53643"); } + @Test + public void needAffixOnAffixes() throws Exception { + doTest("needaffix5"); + } + public void testBreak() throws Exception { doTest("break"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.aff new file mode 100644 index 00000000000..6399a3e98f7 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.aff @@ -0,0 +1,13 @@ +# on affixes +NEEDAFFIX X + +SFX A Y 2 +SFX A 0 suf/B . +SFX A 0 pseudosuf/XB . + +SFX B Y 1 +SFX B 0 bar . + +PFX C Y 2 +PFX C 0 pre . +PFX C 0 pseudopre/X . diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.dic new file mode 100644 index 00000000000..83131e27a58 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.dic @@ -0,0 +1,2 @@ +1 +foo/AC diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.good new file mode 100644 index 00000000000..d1b86bf8313 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.good @@ -0,0 +1,11 @@ +foo +prefoo +foosuf +prefoosuf +foosufbar +prefoosufbar +pseudoprefoosuf +pseudoprefoosufbar +pseudoprefoopseudosufbar +prefoopseudosuf +prefoopseudosufbar diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.wrong new file mode 100644 index 00000000000..fdd1797fdf8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/needaffix5.wrong @@ -0,0 +1,3 @@ +pseudoprefoo +foopseudosuf +pseudoprefoopseudosuf