diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 3acf59ef951..59536fe4205 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -1407,14 +1407,6 @@ public class Dictionary { .collect(Collectors.toList()); } - boolean isForbiddenWord(char[] word, int length) { - if (forbiddenword != FLAG_UNSET) { - IntsRef forms = lookupWord(word, 0, length); - return forms != null && hasFlag(forms, forbiddenword); - } - return false; - } - boolean hasFlag(IntsRef forms, char flag) { int formStep = formStep(); for (int i = 0; i < forms.length; i += formStep) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java index 35eae7b9165..54a42819fbd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java @@ -51,22 +51,22 @@ class GeneratingSuggester { } List suggest(String word, WordCase originalCase, Set prevSuggestions) { - List> roots = findSimilarDictionaryEntries(word, originalCase); + List>> roots = findSimilarDictionaryEntries(word, originalCase); List> expanded = expandRoots(word, roots); TreeSet> bySimilarity = rankBySimilarity(word, expanded); return getMostRelevantSuggestions(bySimilarity, prevSuggestions); } - private List> findSimilarDictionaryEntries( + private List>> findSimilarDictionaryEntries( String word, WordCase originalCase) { - TreeSet> roots = new TreeSet<>(); + TreeSet>> roots = new TreeSet<>(); processFST( dictionary.words, (key, forms) -> { if (Math.abs(key.length - word.length()) > 4) return; String root = toString(key); - List entries = filterSuitableEntries(root, forms); + List> entries = filterSuitableEntries(root, forms); if (entries.isEmpty()) return; if (originalCase == WordCase.LOWER @@ -106,8 +106,8 @@ class GeneratingSuggester { return new String(chars); } - private List filterSuitableEntries(String word, IntsRef forms) { - List result = new ArrayList<>(); + private List> filterSuitableEntries(String word, IntsRef forms) { + List> result = new ArrayList<>(); for (int i = 0; i < forms.length; i += dictionary.formStep()) { int entryId = forms.ints[forms.offset + i]; if (dictionary.hasFlag(entryId, dictionary.forbiddenword) @@ -116,17 +116,18 @@ class GeneratingSuggester { || dictionary.hasFlag(entryId, dictionary.onlyincompound)) { continue; } - result.add(new DictEntry(word, entryId)); + result.add(new Root<>(word, entryId)); } return result; } - private List> expandRoots(String misspelled, List> roots) { + private List> expandRoots( + String misspelled, List>> roots) { int thresh = calcThreshold(misspelled); TreeSet> expanded = new TreeSet<>(); - for (Weighted weighted : roots) { + for (Weighted> weighted : roots) { for (String guess : expandRoot(weighted.word, misspelled)) { String lower = dictionary.toLowerCase(guess); int sc = @@ -156,7 +157,7 @@ class GeneratingSuggester { return thresh / 3 - 1; } - private List expandRoot(DictEntry root, String misspelled) { + private List expandRoot(Root root, String misspelled) { List crossProducts = new ArrayList<>(); Set result = new LinkedHashSet<>(); @@ -226,7 +227,7 @@ class GeneratingSuggester { return result.stream().limit(MAX_WORDS).collect(Collectors.toList()); } - private boolean hasCompatibleFlags(DictEntry root, int affixId) { + private boolean hasCompatibleFlags(Root root, int affixId) { if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) { return false; } @@ -434,37 +435,4 @@ class GeneratingSuggester { return cmp != 0 ? -cmp : word.compareTo(o.word); } } - - private static class DictEntry implements Comparable { - private final String word; - private final int entryId; - - DictEntry(String word, int entryId) { - this.word = word; - this.entryId = entryId; - } - - @Override - public String toString() { - return word; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof DictEntry)) return false; - DictEntry dictEntry = (DictEntry) o; - return entryId == dictEntry.entryId && word.equals(dictEntry.word); - } - - @Override - public int hashCode() { - return Objects.hash(word, entryId); - } - - @Override - public int compareTo(DictEntry o) { - return word.compareTo(o.word); - } - } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java new file mode 100644 index 00000000000..e65992e6f67 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Root.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.util.Objects; + +class Root implements Comparable> { + final T word; + final int entryId; + + Root(T word, int entryId) { + this.word = word; + this.entryId = entryId; + } + + @Override + public String toString() { + return word.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Root)) return false; + @SuppressWarnings("unchecked") + Root root = (Root) o; + return entryId == root.entryId && word.equals(root.word); + } + + @Override + public int hashCode() { + return Objects.hash(word, entryId); + } + + @Override + public int compareTo(Root o) { + return CharSequence.compare(word, o.word); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java index 11c05406d07..a24f49b2fc1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java @@ -68,11 +68,12 @@ public class SpellChecker { } char[] wordChars = word.toCharArray(); - if (dictionary.isForbiddenWord(wordChars, wordChars.length)) { - return false; + Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null); + if (simpleResult != null) { + return simpleResult; } - if (checkWord(wordChars, wordChars.length, null)) { + if (checkCompounds(wordChars, wordChars.length, null)) { return true; } @@ -105,12 +106,9 @@ public class SpellChecker { } Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) { - if (dictionary.isForbiddenWord(wordChars, length)) { - return false; - } - - if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) { - return true; + Root entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD); + if (entry != null) { + return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword); } return null; @@ -122,6 +120,10 @@ public class SpellChecker { return simpleResult; } + return checkCompounds(wordChars, length, originalCase); + } + + private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) { if (dictionary.compoundRules != null && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) { return true; @@ -134,9 +136,10 @@ public class SpellChecker { return false; } - private CharsRef findStem( + private Root findStem( char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) { - CharsRef[] result = {null}; + @SuppressWarnings({"rawtypes", "unchecked"}) + Root[] result = new Root[1]; stemmer.doStem( wordChars, offset, @@ -145,7 +148,7 @@ public class SpellChecker { context, (stem, formID, stemException) -> { if (acceptsStem(formID)) { - result[0] = stem; + result[0] = new Root<>(stem, formID); } return false; }); @@ -164,13 +167,15 @@ public class SpellChecker { WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; int breakOffset = word.offset + breakPos; if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) { - CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context); + Root stem = findStem(word.chars, word.offset, breakPos, originalCase, context); if (stem == null && dictionary.simplifiedTriple && word.chars[breakOffset - 1] == word.chars[breakOffset]) { stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context); } - if (stem != null && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) { + if (stem != null + && !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword) + && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) { CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null); if (checkCompoundsAfter(originalCase, part)) { return true; @@ -193,7 +198,8 @@ public class SpellChecker { if (expanded != null) { WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; int breakPos = pos + pattern.endLength(); - CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context); + Root stem = + findStem(expanded.chars, expanded.offset, breakPos, originalCase, context); if (stem != null) { CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern); if (checkCompoundsAfter(originalCase, part)) { @@ -210,10 +216,11 @@ public class SpellChecker { int breakPos = prev.length; int remainingLength = word.length - breakPos; int breakOffset = word.offset + breakPos; - CharsRef tailStem = + Root tailStem = findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); if (tailStem != null - && !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem)) + && !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword) + && !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word)) && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase) && prev.mayCompound(tailStem, remainingLength, originalCase)) { return true; @@ -232,7 +239,7 @@ public class SpellChecker { return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase); } - private boolean equalsIgnoreCase(CharsRef cr1, CharsRef cr2) { + private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) { return cr1.toString().equalsIgnoreCase(cr2.toString()); } @@ -243,11 +250,15 @@ public class SpellChecker { final CheckCompoundPattern enablingPattern; CompoundPart( - CompoundPart prev, CharsRef tail, int length, CharsRef stem, CheckCompoundPattern enabler) { + CompoundPart prev, + CharsRef tail, + int length, + Root stem, + CheckCompoundPattern enabler) { this.prev = prev; this.tail = tail; this.length = length; - this.stem = stem; + this.stem = stem.word; index = prev == null ? 1 : prev.index + 1; enablingPattern = enabler; } @@ -257,12 +268,12 @@ public class SpellChecker { return (prev == null ? "" : prev + "+") + tail.subSequence(0, length); } - boolean mayCompound(CharsRef nextStem, int nextPartLength, WordCase originalCase) { + boolean mayCompound(Root nextStem, int nextPartLength, WordCase originalCase) { boolean patternsOk = enablingPattern != null - ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem) + ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word) : dictionary.checkCompoundPatterns.stream() - .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem)); + .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word)); if (!patternsOk) { return false; } @@ -498,7 +509,7 @@ public class SpellChecker { if (!spell(chunk)) { for (String chunkSug : suggest(chunk)) { String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd); - if (!dictionary.isForbiddenWord(replaced.toCharArray(), replaced.length())) { + if (spell(replaced)) { result.add(replaced); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 64b649d8cc3..8afd9fc5d85 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell; import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -94,10 +93,6 @@ final class Stemmer { word = scratchBuffer; } - if (dictionary.isForbiddenWord(word, length)) { - return Collections.emptyList(); - } - List list = new ArrayList<>(); RootProcessor processor = (stem, formID, stemException) -> { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java index 4dfcb61774f..8b75c018863 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java @@ -172,6 +172,18 @@ public class SpellCheckerTest extends StemmerTestBase { doTest("onlyincompound2"); } + public void testForbiddenWord() throws Exception { + doTest("forbiddenword"); + } + + public void testForbiddenWord1() throws Exception { + doTest("opentaal_forbiddenword1"); + } + + public void testForbiddenWord2() throws Exception { + doTest("opentaal_forbiddenword2"); + } + public void testGermanCompounding() throws Exception { doTest("germancompounding"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java index 58477d84f12..dc4b897dae3 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDutchIJ.java @@ -27,6 +27,5 @@ public class TestDutchIJ extends StemmerTestBase { public void testStemming() { assertStemsTo("ijs", "ijs"); assertStemsTo("IJs", "ijs"); - assertStemsTo("Ijs"); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff new file mode 100644 index 00000000000..de7f8ad9a42 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.aff @@ -0,0 +1,11 @@ +# FORBIDDENWORD flag +# The signed word, and its suffixed forms are all forbidden, +# excepts with root homonyms. +# Useful for forbidding bad suffixed forms or compounds. + + +FORBIDDENWORD X +COMPOUNDFLAG Y + +SFX A Y 1 +SFX A 0 s . diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic new file mode 100644 index 00000000000..cb63592276a --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.dic @@ -0,0 +1,11 @@ +10 +foo/S +foo/YX +bar/YS +bars/X +foos/X +kg +Kg/X +KG/X +cm +Cm/X \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good new file mode 100644 index 00000000000..7bd112e9ea6 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.good @@ -0,0 +1,3 @@ +foo +bar + diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong new file mode 100644 index 00000000000..5752c1e4462 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forbiddenword.wrong @@ -0,0 +1,4 @@ +bars +foos +foobar +barfoo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.aff new file mode 100644 index 00000000000..fa073432f5a --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.aff @@ -0,0 +1,9 @@ +TRY r + +FORBIDDENWORD F +COMPOUNDRULE 2 +COMPOUNDRULE WW +COMPOUNDRULE WWW + +SFX S Y 1 +SFX S 0 s . diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.dic new file mode 100644 index 00000000000..44375948ff4 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.dic @@ -0,0 +1,5 @@ +4 +foo/W +word/W +bar/WS +foowordbar/FS diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.good new file mode 100644 index 00000000000..73a96a78451 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.good @@ -0,0 +1,3 @@ +fooword +wordbar +barwordfoo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.sug new file mode 100644 index 00000000000..60111a417ff --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.sug @@ -0,0 +1 @@ +barwordfoo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.wrong new file mode 100644 index 00000000000..59dfddfb246 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword1.wrong @@ -0,0 +1,5 @@ +foowordbar +foowordbars +foowordba +foowordbas +barwodfoo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.aff new file mode 100644 index 00000000000..441354d6b0d --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.aff @@ -0,0 +1,7 @@ +TRY r + +FORBIDDENWORD F +COMPOUNDFLAG W + +SFX S Y 1 +SFX S 0 s . diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.dic new file mode 100644 index 00000000000..895dd623052 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.dic @@ -0,0 +1,5 @@ +3 +foo/WS +word/W +bar/WS +foowordbar/FS \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.good new file mode 100644 index 00000000000..17cf47de3d5 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.good @@ -0,0 +1,4 @@ +fooword +wordbar +barwordfoo +barwordfoos diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.sug b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.sug new file mode 100644 index 00000000000..60111a417ff --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.sug @@ -0,0 +1 @@ +barwordfoo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.wrong new file mode 100644 index 00000000000..59dfddfb246 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/opentaal_forbiddenword2.wrong @@ -0,0 +1,5 @@ +foowordbar +foowordbars +foowordba +foowordbas +barwodfoo