mirror of https://github.com/apache/lucene.git
LUCENE-9763: Hunspell: fix FORBIDDENWORD support (#2351)
Don't decompound if it's a simple word with a forbidden root, don't look up the word twice, and don't forbid stemming (be like Hunspell).
This commit is contained in:
parent 01e34f8723
commit 019872453d
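In short, the top-level check now tries the word as a simple (non-compound) word first and only falls back to compound analysis when no simple root exists at all; a root carrying the FORBIDDENWORD flag rejects the word immediately instead of being filtered away during stemming, and the word is no longer looked up twice. The sketch below is only a simplified illustration of that tri-state flow under assumed names (SimpleWordCheck, check); it is not the Lucene API, which appears in the actual diff that follows.

// Minimal sketch of the new control flow; the names below are illustrative assumptions.
import java.util.function.Predicate;

final class ForbiddenWordFlowSketch {

  /** Stand-in for the dictionary lookup: null = no simple root, false = forbidden root, true = valid word. */
  interface SimpleWordCheck {
    Boolean check(String word);
  }

  static boolean spell(String word, SimpleWordCheck simple, Predicate<String> checkCompounds) {
    // A single lookup settles the simple-word case; a forbidden root rejects the word
    // outright and, in particular, is never handed to decompounding.
    Boolean simpleResult = simple.check(word);
    if (simpleResult != null) {
      return simpleResult;
    }
    // Only a word with no simple root at all is analyzed as a compound.
    return checkCompounds.test(word);
  }
}

In the patch itself this corresponds to SpellChecker.spell delegating to the new Boolean checkSimpleWord(...) and the extracted checkCompounds(...) shown below, while Dictionary.isForbiddenWord and the early exit in Stemmer are removed.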
@@ -1407,14 +1407,6 @@ public class Dictionary {
         .collect(Collectors.toList());
   }
 
-  boolean isForbiddenWord(char[] word, int length) {
-    if (forbiddenword != FLAG_UNSET) {
-      IntsRef forms = lookupWord(word, 0, length);
-      return forms != null && hasFlag(forms, forbiddenword);
-    }
-    return false;
-  }
-
   boolean hasFlag(IntsRef forms, char flag) {
     int formStep = formStep();
     for (int i = 0; i < forms.length; i += formStep) {
@@ -51,22 +51,22 @@ class GeneratingSuggester {
   }
 
   List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggestions) {
-    List<Weighted<DictEntry>> roots = findSimilarDictionaryEntries(word, originalCase);
+    List<Weighted<Root<String>>> roots = findSimilarDictionaryEntries(word, originalCase);
     List<Weighted<String>> expanded = expandRoots(word, roots);
     TreeSet<Weighted<String>> bySimilarity = rankBySimilarity(word, expanded);
     return getMostRelevantSuggestions(bySimilarity, prevSuggestions);
   }
 
-  private List<Weighted<DictEntry>> findSimilarDictionaryEntries(
+  private List<Weighted<Root<String>>> findSimilarDictionaryEntries(
       String word, WordCase originalCase) {
-    TreeSet<Weighted<DictEntry>> roots = new TreeSet<>();
+    TreeSet<Weighted<Root<String>>> roots = new TreeSet<>();
     processFST(
         dictionary.words,
         (key, forms) -> {
           if (Math.abs(key.length - word.length()) > 4) return;
 
           String root = toString(key);
-          List<DictEntry> entries = filterSuitableEntries(root, forms);
+          List<Root<String>> entries = filterSuitableEntries(root, forms);
           if (entries.isEmpty()) return;
 
           if (originalCase == WordCase.LOWER
@@ -106,8 +106,8 @@ class GeneratingSuggester {
     return new String(chars);
   }
 
-  private List<DictEntry> filterSuitableEntries(String word, IntsRef forms) {
-    List<DictEntry> result = new ArrayList<>();
+  private List<Root<String>> filterSuitableEntries(String word, IntsRef forms) {
+    List<Root<String>> result = new ArrayList<>();
     for (int i = 0; i < forms.length; i += dictionary.formStep()) {
       int entryId = forms.ints[forms.offset + i];
       if (dictionary.hasFlag(entryId, dictionary.forbiddenword)
@@ -116,17 +116,18 @@ class GeneratingSuggester {
           || dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
         continue;
       }
-      result.add(new DictEntry(word, entryId));
+      result.add(new Root<>(word, entryId));
     }
 
     return result;
   }
 
-  private List<Weighted<String>> expandRoots(String misspelled, List<Weighted<DictEntry>> roots) {
+  private List<Weighted<String>> expandRoots(
+      String misspelled, List<Weighted<Root<String>>> roots) {
     int thresh = calcThreshold(misspelled);
 
     TreeSet<Weighted<String>> expanded = new TreeSet<>();
-    for (Weighted<DictEntry> weighted : roots) {
+    for (Weighted<Root<String>> weighted : roots) {
       for (String guess : expandRoot(weighted.word, misspelled)) {
         String lower = dictionary.toLowerCase(guess);
         int sc =
@@ -156,7 +157,7 @@ class GeneratingSuggester {
     return thresh / 3 - 1;
   }
 
-  private List<String> expandRoot(DictEntry root, String misspelled) {
+  private List<String> expandRoot(Root<String> root, String misspelled) {
     List<String> crossProducts = new ArrayList<>();
     Set<String> result = new LinkedHashSet<>();
 
@@ -226,7 +227,7 @@ class GeneratingSuggester {
     return result.stream().limit(MAX_WORDS).collect(Collectors.toList());
  }
 
-  private boolean hasCompatibleFlags(DictEntry root, int affixId) {
+  private boolean hasCompatibleFlags(Root<?> root, int affixId) {
     if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
       return false;
     }
@@ -434,37 +435,4 @@ class GeneratingSuggester {
       return cmp != 0 ? -cmp : word.compareTo(o.word);
     }
   }
-
-  private static class DictEntry implements Comparable<DictEntry> {
-    private final String word;
-    private final int entryId;
-
-    DictEntry(String word, int entryId) {
-      this.word = word;
-      this.entryId = entryId;
-    }
-
-    @Override
-    public String toString() {
-      return word;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-      if (this == o) return true;
-      if (!(o instanceof DictEntry)) return false;
-      DictEntry dictEntry = (DictEntry) o;
-      return entryId == dictEntry.entryId && word.equals(dictEntry.word);
-    }
-
-    @Override
-    public int hashCode() {
-      return Objects.hash(word, entryId);
-    }
-
-    @Override
-    public int compareTo(DictEntry o) {
-      return word.compareTo(o.word);
-    }
-  }
 }
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Objects;
+
+class Root<T extends CharSequence> implements Comparable<Root<T>> {
+  final T word;
+  final int entryId;
+
+  Root(T word, int entryId) {
+    this.word = word;
+    this.entryId = entryId;
+  }
+
+  @Override
+  public String toString() {
+    return word.toString();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) return true;
+    if (!(o instanceof Root)) return false;
+    @SuppressWarnings("unchecked")
+    Root<T> root = (Root<T>) o;
+    return entryId == root.entryId && word.equals(root.word);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(word, entryId);
+  }
+
+  @Override
+  public int compareTo(Root<T> o) {
+    return CharSequence.compare(word, o.word);
+  }
+}
@@ -68,11 +68,12 @@ public class SpellChecker {
     }
 
     char[] wordChars = word.toCharArray();
-    if (dictionary.isForbiddenWord(wordChars, wordChars.length)) {
-      return false;
+    Boolean simpleResult = checkSimpleWord(wordChars, wordChars.length, null);
+    if (simpleResult != null) {
+      return simpleResult;
     }
 
-    if (checkWord(wordChars, wordChars.length, null)) {
+    if (checkCompounds(wordChars, wordChars.length, null)) {
       return true;
     }
 
@@ -105,12 +106,9 @@ public class SpellChecker {
   }
 
   Boolean checkSimpleWord(char[] wordChars, int length, WordCase originalCase) {
-    if (dictionary.isForbiddenWord(wordChars, length)) {
-      return false;
-    }
-
-    if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) {
-      return true;
+    Root<CharsRef> entry = findStem(wordChars, 0, length, originalCase, SIMPLE_WORD);
+    if (entry != null) {
+      return !dictionary.hasFlag(entry.entryId, dictionary.forbiddenword);
     }
 
     return null;
@@ -122,6 +120,10 @@ public class SpellChecker {
       return simpleResult;
     }
 
+    return checkCompounds(wordChars, length, originalCase);
+  }
+
+  private boolean checkCompounds(char[] wordChars, int length, WordCase originalCase) {
     if (dictionary.compoundRules != null
         && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
       return true;
@@ -134,9 +136,10 @@ public class SpellChecker {
     return false;
   }
 
-  private CharsRef findStem(
+  private Root<CharsRef> findStem(
       char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
-    CharsRef[] result = {null};
+    @SuppressWarnings({"rawtypes", "unchecked"})
+    Root<CharsRef>[] result = new Root[1];
     stemmer.doStem(
         wordChars,
         offset,
@@ -145,7 +148,7 @@ public class SpellChecker {
         context,
         (stem, formID, stemException) -> {
           if (acceptsStem(formID)) {
-            result[0] = stem;
+            result[0] = new Root<>(stem, formID);
           }
           return false;
         });
@@ -164,13 +167,15 @@ public class SpellChecker {
       WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
       int breakOffset = word.offset + breakPos;
       if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
-        CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
+        Root<CharsRef> stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
         if (stem == null
            && dictionary.simplifiedTriple
            && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
          stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
        }
-        if (stem != null && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
+        if (stem != null
+            && !dictionary.hasFlag(stem.entryId, dictionary.forbiddenword)
+            && (prev == null || prev.mayCompound(stem, breakPos, originalCase))) {
           CompoundPart part = new CompoundPart(prev, word, breakPos, stem, null);
           if (checkCompoundsAfter(originalCase, part)) {
             return true;
@@ -193,7 +198,8 @@ public class SpellChecker {
       if (expanded != null) {
         WordContext context = prev == null ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
         int breakPos = pos + pattern.endLength();
-        CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
+        Root<CharsRef> stem =
+            findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
         if (stem != null) {
           CompoundPart part = new CompoundPart(prev, expanded, breakPos, stem, pattern);
           if (checkCompoundsAfter(originalCase, part)) {
@@ -210,10 +216,11 @@ public class SpellChecker {
     int breakPos = prev.length;
     int remainingLength = word.length - breakPos;
     int breakOffset = word.offset + breakPos;
-    CharsRef tailStem =
+    Root<CharsRef> tailStem =
         findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
     if (tailStem != null
-        && !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem))
+        && !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword)
+        && !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word))
         && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
        && prev.mayCompound(tailStem, remainingLength, originalCase)) {
      return true;
@@ -232,7 +239,7 @@ public class SpellChecker {
     return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase);
   }
 
-  private boolean equalsIgnoreCase(CharsRef cr1, CharsRef cr2) {
+  private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) {
     return cr1.toString().equalsIgnoreCase(cr2.toString());
   }
 
@@ -243,11 +250,15 @@ public class SpellChecker {
     final CheckCompoundPattern enablingPattern;
 
     CompoundPart(
-        CompoundPart prev, CharsRef tail, int length, CharsRef stem, CheckCompoundPattern enabler) {
+        CompoundPart prev,
+        CharsRef tail,
+        int length,
+        Root<CharsRef> stem,
+        CheckCompoundPattern enabler) {
       this.prev = prev;
       this.tail = tail;
       this.length = length;
-      this.stem = stem;
+      this.stem = stem.word;
       index = prev == null ? 1 : prev.index + 1;
       enablingPattern = enabler;
     }
@@ -257,12 +268,12 @@ public class SpellChecker {
       return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
     }
 
-    boolean mayCompound(CharsRef nextStem, int nextPartLength, WordCase originalCase) {
+    boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) {
       boolean patternsOk =
           enablingPattern != null
-              ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem)
+              ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word)
               : dictionary.checkCompoundPatterns.stream()
-                  .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem));
+                  .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word));
       if (!patternsOk) {
         return false;
       }
@@ -498,7 +509,7 @@ public class SpellChecker {
       if (!spell(chunk)) {
         for (String chunkSug : suggest(chunk)) {
           String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
-          if (!dictionary.isForbiddenWord(replaced.toCharArray(), replaced.length())) {
+          if (spell(replaced)) {
             result.add(replaced);
           }
         }
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
@@ -94,10 +93,6 @@ final class Stemmer {
       word = scratchBuffer;
     }
 
-    if (dictionary.isForbiddenWord(word, length)) {
-      return Collections.emptyList();
-    }
-
     List<CharsRef> list = new ArrayList<>();
     RootProcessor processor =
         (stem, formID, stemException) -> {
@@ -172,6 +172,18 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("onlyincompound2");
   }
 
+  public void testForbiddenWord() throws Exception {
+    doTest("forbiddenword");
+  }
+
+  public void testForbiddenWord1() throws Exception {
+    doTest("opentaal_forbiddenword1");
+  }
+
+  public void testForbiddenWord2() throws Exception {
+    doTest("opentaal_forbiddenword2");
+  }
+
   public void testGermanCompounding() throws Exception {
     doTest("germancompounding");
   }
@@ -27,6 +27,5 @@ public class TestDutchIJ extends StemmerTestBase {
   public void testStemming() {
     assertStemsTo("ijs", "ijs");
     assertStemsTo("IJs", "ijs");
-    assertStemsTo("Ijs");
   }
 }
@@ -0,0 +1,11 @@
+# FORBIDDENWORD flag
+# The signed word, and its suffixed forms are all forbidden,
+# excepts with root homonyms.
+# Useful for forbidding bad suffixed forms or compounds.
+
+
+FORBIDDENWORD X
+COMPOUNDFLAG Y
+
+SFX A Y 1
+SFX A 0 s .
@@ -0,0 +1,11 @@
+10
+foo/S
+foo/YX
+bar/YS
+bars/X
+foos/X
+kg
+Kg/X
+KG/X
+cm
+Cm/X
@@ -0,0 +1,3 @@
+foo
+bar
+
@@ -0,0 +1,4 @@
+bars
+foos
+foobar
+barfoo
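To make this fixture's intent concrete: the roots foo and bar stay valid (the first word list above), while foos and bars, which carry the FORBIDDENWORD flag X in the dictionary, must be rejected, as must the compounds foobar and barfoo, presumably because the only foo entry with the compound flag Y is the forbidden foo/YX. The snippet below is only an illustrative sketch of those expectations; spell is a placeholder for whatever checker is built from this .aff/.dic pair (the patch drives it through doTest("forbiddenword") in SpellCheckerTest), not a concrete Lucene call.

// Hypothetical assertions mirroring the word lists above; `spell` is a placeholder predicate.
import java.util.function.Predicate;

final class ForbiddenWordFixtureSketch {
  static void check(Predicate<String> spell) {
    // Accepted: the plain roots are valid even though forbidden homonyms and forms exist.
    assert spell.test("foo");
    assert spell.test("bar");

    // Rejected: forms flagged FORBIDDENWORD, and compounds built on a forbidden entry.
    assert !spell.test("foos");
    assert !spell.test("bars");
    assert !spell.test("foobar");
    assert !spell.test("barfoo");
  }
}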
@@ -0,0 +1,9 @@
+TRY r
+
+FORBIDDENWORD F
+COMPOUNDRULE 2
+COMPOUNDRULE WW
+COMPOUNDRULE WWW
+
+SFX S Y 1
+SFX S 0 s .
@@ -0,0 +1,5 @@
+4
+foo/W
+word/W
+bar/WS
+foowordbar/FS
@@ -0,0 +1,3 @@
+fooword
+wordbar
+barwordfoo
@@ -0,0 +1 @@
+barwordfoo
@@ -0,0 +1,5 @@
+foowordbar
+foowordbars
+foowordba
+foowordbas
+barwodfoo
@@ -0,0 +1,7 @@
+TRY r
+
+FORBIDDENWORD F
+COMPOUNDFLAG W
+
+SFX S Y 1
+SFX S 0 s .
@@ -0,0 +1,5 @@
+3
+foo/WS
+word/W
+bar/WS
+foowordbar/FS
@@ -0,0 +1,4 @@
+fooword
+wordbar
+barwordfoo
+barwordfoos
@@ -0,0 +1 @@
+barwordfoo
@@ -0,0 +1,5 @@
+foowordbar
+foowordbars
+foowordba
+foowordbas
+barwodfoo