LUCENE-9763: Hunspell: fix FORBIDDENWORD support (#2351)

Don't decompound if it's a simple word with a forbidden root, don't look up the word twice, and don't forbid stemming (be like Hunspell).
This commit is contained in:
Peter Gromov 2021-02-11 15:16:40 +01:00 committed by GitHub
parent 01e34f8723
commit 019872453d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 186 additions and 82 deletions

View File

@ -1407,14 +1407,6 @@ public class Dictionary {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
/**
 * Tells whether the given word is marked as forbidden in this dictionary.
 *
 * @param word buffer holding the word; it is looked up starting at index 0
 * @param length number of chars passed to the dictionary lookup
 * @return {@code true} only when a forbidden-word flag is configured, the lookup finds forms for
 *     the word, and those forms carry that flag
 */
boolean isForbiddenWord(char[] word, int length) {
  // No FORBIDDENWORD flag configured: nothing can be forbidden, so skip the lookup entirely.
  if (forbiddenword == FLAG_UNSET) {
    return false;
  }
  IntsRef entries = lookupWord(word, 0, length);
  return entries != null && hasFlag(entries, forbiddenword);
}
boolean hasFlag(IntsRef forms, char flag) { boolean hasFlag(IntsRef forms, char flag) {
int formStep = formStep(); int formStep = formStep();
for (int i = 0; i < forms.length; i += formStep) { for (int i = 0; i < forms.length; i += formStep) {

View File

@ -51,22 +51,22 @@ class GeneratingSuggester {
} }
List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggestions) { List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggestions) {
List<Weighted<DictEntry>> roots = findSimilarDictionaryEntries(word, originalCase); List<Weighted<Root<String>>> roots = findSimilarDictionaryEntries(word, originalCase);
List<Weighted<String>> expanded = expandRoots(word, roots); List<Weighted<String>> expanded = expandRoots(word, roots);
TreeSet<Weighted<String>> bySimilarity = rankBySimilarity(word, expanded); TreeSet<Weighted<String>> bySimilarity = rankBySimilarity(word, expanded);
return getMostRelevantSuggestions(bySimilarity, prevSuggestions); return getMostRelevantSuggestions(bySimilarity, prevSuggestions);
} }
private List<Weighted<DictEntry>> findSimilarDictionaryEntries( private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
String word, WordCase originalCase) { String word, WordCase originalCase) {
TreeSet<Weighted<DictEntry>> roots = new TreeSet<>(); TreeSet<Weighted<Root<String>>> roots = new TreeSet<>();
processFST( processFST(
dictionary.words, dictionary.words,
(key, forms) -> { (key, forms) -> {
if (Math.abs(key.length - word.length()) > 4) return; if (Math.abs(key.length - word.length()) > 4) return;
String root = toString(key); String root = toString(key);
List<DictEntry> entries = filterSuitableEntries(root, forms); List<Root<String>> entries = filterSuitableEntries(root, forms);
if (entries.isEmpty()) return; if (entries.isEmpty()) return;
if (originalCase == WordCase.LOWER if (originalCase == WordCase.LOWER
@ -106,8 +106,8 @@ class GeneratingSuggester {
return new String(chars); return new String(chars);
} }
private List<DictEntry> filterSuitableEntries(String word, IntsRef forms) { private List<Root<String>> filterSuitableEntries(String word, IntsRef forms) {
List<DictEntry> result = new ArrayList<>(); List<Root<String>> result = new ArrayList<>();
for (int i = 0; i < forms.length; i += dictionary.formStep()) { for (int i = 0; i < forms.length; i += dictionary.formStep()) {
int entryId = forms.ints[forms.offset + i]; int entryId = forms.ints[forms.offset + i];
if (dictionary.hasFlag(entryId, dictionary.forbiddenword) if (dictionary.hasFlag(entryId, dictionary.forbiddenword)
@ -116,17 +116,18 @@ class GeneratingSuggester {
|| dictionary.hasFlag(entryId, dictionary.onlyincompound)) { || dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
continue; continue;
} }
result.add(new DictEntry(word, entryId)); result.add(new Root<>(word, entryId));
} }
return result; return result;
} }
private List<Weighted<String>> expandRoots(String misspelled, List<Weighted<DictEntry>> roots) { private List<Weighted<String>> expandRoots(
String misspelled, List<Weighted<Root<String>>> roots) {
int thresh = calcThreshold(misspelled); int thresh = calcThreshold(misspelled);
TreeSet<Weighted<String>> expanded = new TreeSet<>(); TreeSet<Weighted<String>> expanded = new TreeSet<>();
for (Weighted<DictEntry> weighted : roots) { for (Weighted<Root<String>> weighted : roots) {
for (String guess : expandRoot(weighted.word, misspelled)) { for (String guess : expandRoot(weighted.word, misspelled)) {
String lower = dictionary.toLowerCase(guess); String lower = dictionary.toLowerCase(guess);
int sc = int sc =
@ -156,7 +157,7 @@ class GeneratingSuggester {
return thresh / 3 - 1; return thresh / 3 - 1;
} }
private List<String> expandRoot(DictEntry root, String misspelled) { private List<String> expandRoot(Root<String> root, String misspelled) {
List<String> crossProducts = new ArrayList<>(); List<String> crossProducts = new ArrayList<>();
Set<String> result = new LinkedHashSet<>(); Set<String> result = new LinkedHashSet<>();
@ -226,7 +227,7 @@ class GeneratingSuggester {
return result.stream().limit(MAX_WORDS).collect(Collectors.toList()); return result.stream().limit(MAX_WORDS).collect(Collectors.toList());
} }
private boolean hasCompatibleFlags(DictEntry root, int affixId) { private boolean hasCompatibleFlags(Root<?> root, int affixId) {
if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) { if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
return false; return false;
} }
@ -434,37 +435,4 @@ class GeneratingSuggester {
return cmp != 0 ? -cmp : word.compareTo(o.word); return cmp != 0 ? -cmp : word.compareTo(o.word);
} }
} }
/**
 * A dictionary word paired with the id of one of its entries.
 *
 * <p>Equality and hashing take both the word and the entry id into account, whereas the natural
 * ordering looks at the word alone.
 */
private static class DictEntry implements Comparable<DictEntry> {
  private final String word;
  private final int entryId;

  DictEntry(String word, int entryId) {
    this.word = word;
    this.entryId = entryId;
  }

  /** Orders entries by their word text only; the entry id does not participate. */
  @Override
  public int compareTo(DictEntry other) {
    return word.compareTo(other.word);
  }

  @Override
  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }
    if (obj instanceof DictEntry) {
      DictEntry that = (DictEntry) obj;
      // Compare the cheap int first, then the word.
      return entryId == that.entryId && word.equals(that.word);
    }
    return false;
  }

  @Override
  public int hashCode() {
    return Objects.hash(word, entryId);
  }

  /** The textual form is just the word itself. */
  @Override
  public String toString() {
    return word;
  }
}
} }

View File

@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.Objects;
class Root<T extends CharSequence> implements Comparable<Root<T>> {
final T word;
final int entryId;
Root(T word, int entryId) {
this.word = word;
this.entryId = entryId;
}
@Override
public String toString() {
return word.toString();
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Root)) return false;
@SuppressWarnings("unchecked")
Root<T> root = (Root<T>) o;
return entryId == root.entryId && word.equals(root.word);
}
@Override
public int hashCode() {
return Objects.hash(word, entryId);
}
@Override
public int compareTo(Root<T> o) {
return CharSequence.compare(word, o.word);
}
}

View File

@ -68,11 +68,12 @@ public class SpellChecker {
} }
char[] wordChars = word.toCharArray(); char[] wordChars = word.toCharArray();
if (dictionary.isForbiddenWord(wordChars, wordChars.length)) { Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null);
return false; if (simpleResult != null) {
return simpleResult;
} }
if (checkWord(wordChars, wordChars.length, null)) { if (checkCompounds(wordChars, wordChars.length, null)) {
return true; return true;
} }
@ -105,12 +106,9 @@ public class SpellChecker {
} }
Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) { Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.isForbiddenWord(wordChars, length)) { Root<CharsRef> entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD);
return false; if (entry != null) {
} return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword);
if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) {
return true;
} }
return null; return null;
@ -122,6 +120,10 @@ public class SpellChecker {
return simpleResult; return simpleResult;
} }
return checkCompounds(wordChars, length, originalCase);
}
private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.compoundRules != null if (dictionary.compoundRules != null
&& checkCompoundRules(wordChars, 0, length, new ArrayList<>())) { && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
return true; return true;
@ -134,9 +136,10 @@ public class SpellChecker {
return false; return false;
} }
private CharsRef findStem( private Root<CharsRef> findStem(
char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) { char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
CharsRef[] result = {null}; @SuppressWarnings({"rawtypes", "unchecked"})
Root<CharsRef>[] result = new Root[1];
stemmer.doStem( stemmer.doStem(
wordChars, wordChars,
offset, offset,
@ -145,7 +148,7 @@ public class SpellChecker {
context, context,
(stem, formID, stemException) -> { (stem, formID, stemException) -> {
if (acceptsStem(formID)) { if (acceptsStem(formID)) {
result[0] = stem; result[0] = new Root<>(stem, formID);
} }
return false; return false;
}); });
@ -164,13 +167,15 @@ public class SpellChecker {
WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
int breakOffset = word.offset + breakPos; int breakOffset = word.offset + breakPos;
if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) { if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context); Root<CharsRef> stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
if (stem == null if (stem == null
&& dictionary.simplifiedTriple && dictionary.simplifiedTriple
&& word.chars[breakOffset - 1] == word.chars[breakOffset]) { && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context); stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
} }
if (stem != null && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) { if (stem != null
&& !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword)
&& (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null); CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null);
if (checkCompoundsAfter(originalCase, part)) { if (checkCompoundsAfter(originalCase, part)) {
return true; return true;
@ -193,7 +198,8 @@ public class SpellChecker {
if (expanded != null) { if (expanded != null) {
WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE; WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
int breakPos = pos + pattern.endLength(); int breakPos = pos + pattern.endLength();
CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context); Root<CharsRef> stem =
findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
if (stem != null) { if (stem != null) {
CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern); CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern);
if (checkCompoundsAfter(originalCase, part)) { if (checkCompoundsAfter(originalCase, part)) {
@ -210,10 +216,11 @@ public class SpellChecker {
int breakPos = prev.length; int breakPos = prev.length;
int remainingLength = word.length - breakPos; int remainingLength = word.length - breakPos;
int breakOffset = word.offset + breakPos; int breakOffset = word.offset + breakPos;
CharsRef tailStem = Root<CharsRef> tailStem =
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
if (tailStem != null if (tailStem != null
&& !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem)) && !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword)
&& !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word))
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase) && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
&& prev.mayCompound(tailStem, remainingLength, originalCase)) { && prev.mayCompound(tailStem, remainingLength, originalCase)) {
return true; return true;
@ -232,7 +239,7 @@ public class SpellChecker {
return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase); return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase);
} }
private boolean equalsIgnoreCase(CharsRef cr1, CharsRef cr2) { private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) {
return cr1.toString().equalsIgnoreCase(cr2.toString()); return cr1.toString().equalsIgnoreCase(cr2.toString());
} }
@ -243,11 +250,15 @@ public class SpellChecker {
final CheckCompoundPattern enablingPattern; final CheckCompoundPattern enablingPattern;
CompoundPart( CompoundPart(
CompoundPart prev, CharsRef tail, int length, CharsRef stem, CheckCompoundPattern enabler) { CompoundPart prev,
CharsRef tail,
int length,
Root<CharsRef> stem,
CheckCompoundPattern enabler) {
this.prev = prev; this.prev = prev;
this.tail = tail; this.tail = tail;
this.length = length; this.length = length;
this.stem = stem; this.stem = stem.word;
index = prev == null ? 1 : prev.index + 1; index = prev == null ? 1 : prev.index + 1;
enablingPattern = enabler; enablingPattern = enabler;
} }
@ -257,12 +268,12 @@ public class SpellChecker {
return (prev == null ? "" : prev + "+") + tail.subSequence(0, length); return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
} }
boolean mayCompound(CharsRef nextStem, int nextPartLength, WordCase originalCase) { boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) {
boolean patternsOk = boolean patternsOk =
enablingPattern != null enablingPattern != null
? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem) ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word)
: dictionary.checkCompoundPatterns.stream() : dictionary.checkCompoundPatterns.stream()
.noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem)); .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word));
if (!patternsOk) { if (!patternsOk) {
return false; return false;
} }
@ -498,7 +509,7 @@ public class SpellChecker {
if (!spell(chunk)) { if (!spell(chunk)) {
for (String chunkSug : suggest(chunk)) { for (String chunkSug : suggest(chunk)) {
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd); String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
if (!dictionary.isForbiddenWord(replaced.toCharArray(), replaced.length())) { if (spell(replaced)) {
result.add(replaced); result.add(replaced);
} }
} }

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
@ -94,10 +93,6 @@ final class Stemmer {
word = scratchBuffer; word = scratchBuffer;
} }
if (dictionary.isForbiddenWord(word, length)) {
return Collections.emptyList();
}
List<CharsRef> list = new ArrayList<>(); List<CharsRef> list = new ArrayList<>();
RootProcessor processor = RootProcessor processor =
(stem, formID, stemException) -> { (stem, formID, stemException) -> {

View File

@ -172,6 +172,18 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("onlyincompound2"); doTest("onlyincompound2");
} }
// Delegates to doTest with the fixture name "forbiddenword".
// NOTE(review): presumably doTest loads the test resources sharing this base name - confirm.
public void testForbiddenWord() throws Exception {
doTest("forbiddenword");
}
// Delegates to doTest with the fixture name "opentaal_forbiddenword1".
// NOTE(review): presumably doTest loads the test resources sharing this base name - confirm.
public void testForbiddenWord1() throws Exception {
doTest("opentaal_forbiddenword1");
}
// Delegates to doTest with the fixture name "opentaal_forbiddenword2".
// NOTE(review): presumably doTest loads the test resources sharing this base name - confirm.
public void testForbiddenWord2() throws Exception {
doTest("opentaal_forbiddenword2");
}
public void testGermanCompounding() throws Exception { public void testGermanCompounding() throws Exception {
doTest("germancompounding"); doTest("germancompounding");
} }

View File

@ -27,6 +27,5 @@ public class TestDutchIJ extends StemmerTestBase {
public void testStemming() { public void testStemming() {
assertStemsTo("ijs", "ijs"); assertStemsTo("ijs", "ijs");
assertStemsTo("IJs", "ijs"); assertStemsTo("IJs", "ijs");
assertStemsTo("Ijs");
} }
} }

View File

@ -0,0 +1,11 @@
# FORBIDDENWORD flag
# The signed word and its suffixed forms are all forbidden,
# except with root homonyms.
# Useful for forbidding bad suffixed forms or compounds.
FORBIDDENWORD X
COMPOUNDFLAG Y
SFX A Y 1
SFX A 0 s .

View File

@ -0,0 +1,11 @@
10
foo/S
foo/YX
bar/YS
bars/X
foos/X
kg
Kg/X
KG/X
cm
Cm/X

View File

@ -0,0 +1,4 @@
bars
foos
foobar
barfoo

View File

@ -0,0 +1,9 @@
TRY r
FORBIDDENWORD F
COMPOUNDRULE 2
COMPOUNDRULE WW
COMPOUNDRULE WWW
SFX S Y 1
SFX S 0 s .

View File

@ -0,0 +1,5 @@
4
foo/W
word/W
bar/WS
foowordbar/FS

View File

@ -0,0 +1,3 @@
fooword
wordbar
barwordfoo

View File

@ -0,0 +1,5 @@
foowordbar
foowordbars
foowordba
foowordbas
barwodfoo

View File

@ -0,0 +1,7 @@
TRY r
FORBIDDENWORD F
COMPOUNDFLAG W
SFX S Y 1
SFX S 0 s .

View File

@ -0,0 +1,5 @@
3
foo/WS
word/W
bar/WS
foowordbar/FS

View File

@ -0,0 +1,4 @@
fooword
wordbar
barwordfoo
barwordfoos

View File

@ -0,0 +1,5 @@
foowordbar
foowordbars
foowordba
foowordbas
barwodfoo