LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (#975)

This commit is contained in:
Peter Gromov 2022-07-05 21:38:03 +02:00 committed by GitHub
parent 3dd9a5487c
commit d537013e70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 1352 additions and 171 deletions

View File

@ -36,6 +36,9 @@ New Features
* LUCENE-10151 Enable timeout support in IndexSearcher. (Deepika Sharma)
* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)
Improvements
---------------------

View File

@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
/**
 * The outcome of analyzing a simple (non-compound) word: the word itself, the dictionary entry of
 * its stem, and the prefixes and suffixes that were applied to that stem.
 */
public final class AffixedWord {
  private final String word;
  private final DictEntry entry;
  private final List<Affix> prefixes;
  private final List<Affix> suffixes;

  /**
   * @param word the analyzed word
   * @param entry the dictionary entry of the stem
   * @param prefixes the applied prefixes; exposed through a read-only view, not copied
   * @param suffixes the applied suffixes; exposed through a read-only view, not copied
   */
  AffixedWord(String word, DictEntry entry, List<Affix> prefixes, List<Affix> suffixes) {
    this.word = word;
    this.entry = entry;
    this.prefixes = Collections.unmodifiableList(prefixes);
    this.suffixes = Collections.unmodifiableList(suffixes);
  }

  /** @return the word being analyzed */
  public String getWord() {
    return word;
  }

  /** @return the dictionary entry for the stem in this analysis */
  public DictEntry getDictEntry() {
    return entry;
  }

  /** @return the list of prefixes applied to the stem, at most two, outermost first */
  public List<Affix> getPrefixes() {
    return prefixes;
  }

  /** @return the list of suffixes applied to the stem, at most two, outermost first */
  public List<Affix> getSuffixes() {
    return suffixes;
  }

  /** Two analyses are equal when their word, stem entry, prefixes and suffixes are all equal. */
  @Override
  public boolean equals(Object o) {
    if (o == this) {
      return true;
    }
    if (o instanceof AffixedWord other) {
      return word.equals(other.word)
          && entry.equals(other.entry)
          && prefixes.equals(other.prefixes)
          && suffixes.equals(other.suffixes);
    }
    return false;
  }

  @Override
  public int hashCode() {
    return Objects.hash(word, entry, prefixes, suffixes);
  }

  @Override
  public String toString() {
    String fields =
        String.join(
            ", ",
            "word=" + word,
            "entry=" + entry,
            "prefixes=" + prefixes,
            "suffixes=" + suffixes);
    return "AffixedWord[" + fields + "]";
  }

  /** An object representing a prefix or a suffix applied to a word stem */
  public static final class Affix {
    final int affixId;
    private final String presentableFlag;

    /**
     * @param dictionary the dictionary that owns the affix; used to look up and print its flag
     * @param affixId the id of the affix within the dictionary's affix data
     */
    Affix(Dictionary dictionary, int affixId) {
      this.affixId = affixId;
      // Resolve the printable flag eagerly so that the object doesn't retain the dictionary.
      char rawFlag = dictionary.affixData(affixId, AFFIX_FLAG);
      this.presentableFlag = dictionary.flagParsingStrategy.printFlag(rawFlag);
    }

    /**
     * @return the corresponding affix flag as it appears in the *.aff file. Depending on the
     *     format, it could be a Unicode character, two ASCII characters, or an integer in decimal
     *     form
     */
    public String getFlag() {
      return presentableFlag;
    }

    /** Affixes are compared by their id only. */
    @Override
    public boolean equals(Object o) {
      if (o == this) {
        return true;
      }
      if (o instanceof Affix other) {
        return other.affixId == affixId;
      }
      return false;
    }

    @Override
    public int hashCode() {
      return affixId;
    }

    @Override
    public String toString() {
      return presentableFlag + "(id=" + affixId + ")";
    }
  }
}

View File

@ -24,26 +24,22 @@ import java.util.List;
*
* @see Dictionary#lookupEntries
*/
public interface DictEntries {
public interface DictEntries extends List<DictEntry> {
/**
* @return a positive number of dictionary entries with the same word. Most often it's 1 (unless
* there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can
* be passed into other methods of this class.
*/
@Override
int size();
/**
* @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
* @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
* {@code ph:}) associated with the homonym at the given entry index, or an empty string
*/
String getMorphologicalData(int entryIndex);
/**
* @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
* @param key the key in the form {@code kk:} by which to filter the morphological fields
* @return the values (of {@code vvvvvv} form) of morphological fields with the given key
* associated with the homonym at the given entry index
*/
List<String> getMorphologicalValues(int entryIndex, String key);
/** Same as {@code get(entryIndex).getMorphologicalData()} */
default String getMorphologicalData(int entryIndex) {
return get(entryIndex).getMorphologicalData();
}
/** Same as {@code get(entryIndex).getMorphologicalValues(key)} */
default List<String> getMorphologicalValues(int entryIndex, String key) {
return get(entryIndex).getMorphologicalValues(key);
}
}

View File

@ -0,0 +1,109 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
/** An object representing *.dic file entry with its word, flags and morphological data. */
public abstract class DictEntry {
  private final String stem;

  /** @param stem the stem word of this entry */
  DictEntry(String stem) {
    this.stem = stem;
  }

  /**
   * @return the entry rendered as the stem, followed by {@code /flags} when there are flags, and
   *     by a space and the morphological data when there is any
   */
  @Override
  public String toString() {
    String result = stem;
    String flags = getFlags();
    if (!flags.isEmpty()) {
      result += "/" + flags;
    }
    String morph = getMorphologicalData();
    if (!morph.isEmpty()) {
      result += " " + morph;
    }
    return result;
  }

  /** Entries are equal when their stem, morphological data and flags strings are all equal. */
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof DictEntry that)) return false;
    return stem.equals(that.stem)
        && getMorphologicalData().equals(that.getMorphologicalData())
        && getFlags().equals(that.getFlags());
  }

  @Override
  public int hashCode() {
    return Objects.hash(stem, getFlags(), getMorphologicalData());
  }

  /** @return the stem word in the dictionary */
  public String getStem() {
    return stem;
  }

  /**
   * @return the flags associated with the dictionary entry, encoded in the same format as in the
   *     *.dic file, but possibly in a different order
   */
  public abstract String getFlags();

  /**
   * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
   *     {@code ph:}) associated with this dictionary entry, or an empty string
   */
  public abstract String getMorphologicalData();

  /**
   * @param key the key in the form {@code kk:} by which to filter the morphological fields
   * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
   *     associated with this dictionary entry; an empty list if there are none
   */
  public List<String> getMorphologicalValues(String key) {
    assert key.length() == 3 && key.charAt(2) == ':'
        : "A morphological data key should consist of two letters followed by a colon, found: "
            + key;
    String data = getMorphologicalData();
    // Cheap pre-check to avoid splitting when the key can't possibly match.
    if (data.isEmpty() || !data.contains(key)) return Collections.emptyList();
    return Arrays.stream(data.split(" "))
        .filter(s -> s.startsWith(key))
        .map(s -> s.substring(3)) // drop the three-character "kk:" key
        .toList();
  }

  /**
   * Creates an entry with the given stem and flags and no morphological data.
   *
   * @param stem the stem word
   * @param flags the flags, as returned by {@link #getFlags()}
   */
  static DictEntry create(String stem, String flags) {
    return new DictEntry(stem) {
      @Override
      public String getFlags() {
        return flags;
      }

      @Override
      public String getMorphologicalData() {
        return "";
      }
    };
  }
}

View File

@ -34,6 +34,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@ -537,31 +538,33 @@ public class Dictionary {
IntsRef forms = lookupWord(root.toCharArray(), 0, root.length());
if (forms == null) return null;
return new DictEntries() {
class DictEntriesImpl extends AbstractList<DictEntry> implements DictEntries {
@Override
public int size() {
return forms.length / (hasCustomMorphData ? 2 : 1);
return forms.length / formStep();
}
@Override
public String getMorphologicalData(int entryIndex) {
if (!hasCustomMorphData) return "";
return morphData.get(forms.ints[forms.offset + entryIndex * 2 + 1]);
public DictEntry get(int entryIndex) {
return dictEntry(
root,
forms.ints[forms.offset + (entryIndex * formStep())],
hasCustomMorphData ? forms.ints[forms.offset + entryIndex * 2 + 1] : 0);
}
}
return new DictEntriesImpl();
}
DictEntry dictEntry(String root, int flagId, int morphDataId) {
return new DictEntry(root) {
@Override
public String getFlags() {
return Dictionary.this.flagParsingStrategy.printFlags(flagLookup.getFlags(flagId));
}
@Override
public List<String> getMorphologicalValues(int entryIndex, String key) {
assert key.length() == 3 && key.charAt(2) == ':'
: "A morphological data key should consist of two letters followed by a semicolon, found: "
+ key;
String fields = getMorphologicalData(entryIndex);
if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList();
return Arrays.stream(fields.split(" "))
.filter(s -> s.startsWith(key))
.map(s -> s.substring(3))
.collect(Collectors.toList());
public String getMorphologicalData() {
return morphDataId == 0 ? "" : morphData.get(morphDataId);
}
};
}
@ -1153,7 +1156,7 @@ public class Dictionary {
} else {
end = line.indexOf(MORPH_SEPARATOR);
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end);
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
if (aliasCount > 0 && !flagPart.isEmpty()) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
@ -1327,6 +1330,12 @@ public class Dictionary {
return false;
}
/**
 * @param affixId the id of an affix, or a negative number if there's none
 * @param flag the encoded flag to look for
 * @return whether the flags referenced by the affix's {@code AFFIX_APPEND} data include the given
 *     flag; always {@code false} for a negative affix id or an unset flag
 */
boolean isFlagAppendedByAffix(int affixId, char flag) {
  boolean lookupPossible = affixId >= 0 && flag != FLAG_UNSET;
  return lookupPossible && hasFlag(affixData(affixId, AFFIX_APPEND), flag);
}
/** Abstraction of the process of parsing flags taken from the affix and dic files */
abstract static class FlagParsingStrategy {
// we don't check the flag count, as Hunspell accepts longer sequences
@ -1354,6 +1363,27 @@ public class Dictionary {
* @return Parsed flags
*/
abstract char[] parseFlags(String rawFlags);
/**
* @return the original string representation of the given flag encoded by {@link #parseFlags}.
*/
abstract String printFlag(char flag);
/**
 * Renders the given encoded flags in a presentable form.
 *
 * @param encodedFlags the flags to print; those not below {@code DEFAULT_FLAGS} are skipped
 * @return the {@link #printFlag} results sorted as strings and concatenated, comma-separated for
 *     {@code NumFlagParsingStrategy} and without any delimiter otherwise
 */
String printFlags(char[] encodedFlags) {
  List<String> rendered = new ArrayList<>();
  for (char encoded : encodedFlags) {
    if (encoded < DEFAULT_FLAGS) {
      rendered.add(printFlag(encoded));
    }
  }
  rendered.sort(null); // null comparator = natural String order
  String separator = "";
  if (this instanceof NumFlagParsingStrategy) {
    separator = ",";
  }
  return String.join(separator, rendered);
}
/**
 * Parse flags from a string resulting from {@link #printFlags}.
 *
 * <p>This base implementation simply delegates to {@link #parseFlags}.
 *
 * @param flagsInUtf the printed flags to parse
 * @return the parsed flags, as returned by {@link #parseFlags}
 */
char[] parseUtfFlags(String flagsInUtf) {
return parseFlags(flagsInUtf);
}
}
/**
@ -1365,6 +1395,11 @@ public class Dictionary {
public char[] parseFlags(String rawFlags) {
return rawFlags.toCharArray();
}
/** Prints the flag as a one-character string holding the flag character itself. */
@Override
String printFlag(char flag) {
return String.valueOf(flag);
}
}
/** Used to read flags as UTF-8 even if the rest of the file is in the default (8-bit) encoding */
@ -1373,6 +1408,16 @@ public class Dictionary {
public char[] parseFlags(String rawFlags) {
return new String(rawFlags.getBytes(DEFAULT_CHARSET), StandardCharsets.UTF_8).toCharArray();
}
/** Prints the flag as a one-character string holding the flag character itself. */
@Override
String printFlag(char flag) {
return String.valueOf(flag);
}
/**
 * Uses the characters of the given string as flags directly, without the byte re-decoding that
 * {@code parseFlags} of this strategy performs on raw file content.
 */
@Override
char[] parseUtfFlags(String flagsInUtf) {
return flagsInUtf.toCharArray();
}
}
/**
@ -1403,6 +1448,11 @@ public class Dictionary {
return result.toString().toCharArray();
}
/** Prints the numeric value of the flag character in decimal form. */
@Override
String printFlag(char flag) {
return String.valueOf((int) flag);
}
}
/**
@ -1430,6 +1480,11 @@ public class Dictionary {
}
return flags;
}
/**
 * Prints the flag as two characters: the one stored in its high byte followed by the one stored
 * in its low byte.
 */
@Override
String printFlag(char flag) {
return new String(new char[] {(char) ((flag & 0xff00) >>> 8), (char) (flag & 0xff)});
}
}
boolean hasFlag(int entryId, char flag) {

View File

@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.Collections;
import java.util.List;
/**
 * A proposal, created by {@link WordFormGenerator#compress}, of dictionary entries to add or edit
 * so that the dictionary generates a given list of words.
 */
public class EntrySuggestion {
  private final List<DictEntry> toEdit;
  private final List<DictEntry> toAdd;
  private final List<String> extraGenerated;

  /**
   * @param toEdit the existing entries that need editing
   * @param toAdd the new entries to add
   * @param extraGenerated the generated words that were not requested
   */
  EntrySuggestion(List<DictEntry> toEdit, List<DictEntry> toAdd, List<String> extraGenerated) {
    // The lists are wrapped into read-only views, not copied.
    this.toEdit = Collections.unmodifiableList(toEdit);
    this.toAdd = Collections.unmodifiableList(toAdd);
    this.extraGenerated = Collections.unmodifiableList(extraGenerated);
  }

  /**
   * @return the existing dictionary entries whose flags would need changing to accommodate the
   *     given word list
   */
  public List<DictEntry> getEntriesToEdit() {
    return toEdit;
  }

  /** @return new dictionary entries to be added to accommodate the given word list */
  public List<DictEntry> getEntriesToAdd() {
    return toAdd;
  }

  /**
   * @return additional words generated by union of {@link #getEntriesToAdd()} and {@link
   *     #getEntriesToEdit()} which weren't in the given list of words
   */
  public List<String> getExtraGeneratedWords() {
    return extraGenerated;
  }

  @Override
  public String toString() {
    StringBuilder text = new StringBuilder("EntrySuggestion{");
    text.append(internalsToString());
    text.append('}');
    return text.toString();
  }

  /** @return the field part of {@link #toString()}, without the surrounding class name */
  String internalsToString() {
    return String.join(
        ", ", "toEdit=" + toEdit, "toAdd=" + toAdd, "extra=" + extraGenerated);
  }
}

View File

@ -176,7 +176,7 @@ public class Hunspell {
offset,
length,
context,
(stem, formID, morphDataId) -> {
(stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
if (checkCase && !acceptCase(originalCase, formID, stem)) {
return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
}
@ -314,6 +314,52 @@ public class Hunspell {
.collect(Collectors.toList());
}
/**
 * Analyzes the given word as a simple (non-compound) one.
 *
 * @param word the word to analyze
 * @return all possible analyses of the given word with stems, prefixes, suffixes and
 *     morphological data. Note that the order of the returned objects might not correspond to the
 *     *.dic file order!
 */
public List<AffixedWord> analyzeSimpleWord(String word) {
  List<AffixedWord> analyses = new ArrayList<>();
  Stemmer.RootProcessor collector =
      (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
        // Negative ids mean "no such affix"; the outer affix always goes first.
        List<AffixedWord.Affix> prefixes = new ArrayList<>();
        for (int prefixId : new int[] {outerPrefix, innerPrefix}) {
          if (prefixId >= 0) {
            prefixes.add(new AffixedWord.Affix(dictionary, prefixId));
          }
        }
        List<AffixedWord.Affix> suffixes = new ArrayList<>();
        for (int suffixId : new int[] {outerSuffix, innerSuffix}) {
          if (suffixId >= 0) {
            suffixes.add(new AffixedWord.Affix(dictionary, suffixId));
          }
        }
        DictEntry entry = dictionary.dictEntry(stem.toString(), formID, morphDataId);
        analyses.add(new AffixedWord(word, entry, prefixes, suffixes));
        return true; // continue to collect the remaining analyses
      };
  stemmer.analyze(word.toCharArray(), word.length(), collector);
  return analyses;
}
/**
 * Expands all dictionary entries with the given root word into all of their word forms, which is
 * equivalent to "unmunch" from the "hunspell-tools" package. The result order is stable but not
 * specified.
 *
 * @param root the root word whose dictionary entries are expanded
 * @see WordFormGenerator for finer-grained APIs
 */
public List<AffixedWord> getAllWordForms(String root) {
  WordFormGenerator generator = new WordFormGenerator(dictionary);
  return generator.getAllWordForms(root, checkCanceled);
}
/**
 * Tries to find a smaller set of dictionary entries (with some flags) that would generate the
 * given words, which is equivalent to "munch" from the "hunspell-tools" package.
 *
 * @param words the words that the suggested entries should generate
 * @see WordFormGenerator#compress for more details and control
 */
public EntrySuggestion compress(List<String> words) {
  WordFormGenerator generator = new WordFormGenerator(dictionary);
  // An empty set is passed as the generator's second argument.
  return generator.compress(words, Set.of(), checkCanceled);
}
private class CompoundPart {
final CompoundPart prev;
final int index, length;
@ -431,7 +477,7 @@ public class Hunspell {
words.add(ref);
Stemmer.RootProcessor stopOnMatching =
(stem, formID, morphDataId) -> {
(stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
ref.ints[0] = formID;
return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
};

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.ArrayUtil;
@ -65,7 +64,18 @@ final class Stemmer {
* @return List of stems for the word
*/
public List<CharsRef> stem(char[] word, int length) {
  List<CharsRef> stems = new ArrayList<>();
  // Only the stem and its morphological data id are needed here; the affix ids are ignored.
  RootProcessor collectStem =
      (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
        stems.add(newStem(stem, morphDataId));
        return true; // never stop early: all stems are wanted
      };
  analyze(word, length, collectStem);
  return stems;
}
void analyze(char[] word, int length, RootProcessor processor) {
if (dictionary.mayNeedInputCleaning()) {
CharsRef scratchSegment = new CharsRef(word, 0, length);
if (dictionary.needsInputCleaning(scratchSegment)) {
@ -77,19 +87,12 @@ final class Stemmer {
word = scratchBuffer;
}
}
List<CharsRef> list = new ArrayList<>();
if (length == 0) {
return list;
return;
}
RootProcessor processor =
(stem, formID, stemException) -> {
list.add(newStem(stem, stemException));
return true;
};
if (!doStem(word, 0, length, WordContext.SIMPLE_WORD, processor)) {
return list;
return;
}
WordCase wordCase = caseOf(word, length);
@ -99,7 +102,6 @@ final class Stemmer {
doStem(variant, 0, varLength, WordContext.SIMPLE_WORD, processor);
varyCase(word, length, wordCase, variationProcessor);
}
return list;
}
interface CaseVariationProcessor {
@ -214,7 +216,7 @@ final class Stemmer {
if (result == null) return true;
String src = new String(word, 0, length);
for (String s : result.collect(Collectors.toList())) {
for (String s : result.toList()) {
if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
return false;
}
@ -239,13 +241,61 @@ final class Stemmer {
if (!isRootCompatibleWithContext(context, -1, entryId)) {
continue;
}
if (!callProcessor(word, offset, length, processor, forms, i)) {
CharsRef charsRef = new CharsRef(word, offset, length);
if (!processor.processRoot(charsRef, entryId, morphDataId(forms, i), -1, -1, -1, -1)) {
return false;
}
}
}
return stem(
word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
StemCandidateProcessor stemProcessor =
new StemCandidateProcessor(context) {
@Override
boolean processStemCandidate(
char[] word,
int offset,
int length,
int lastAffix,
int outerPrefix,
int innerPrefix,
int outerSuffix,
int innerSuffix) {
IntsRef forms = dictionary.lookupWord(word, offset, length);
if (forms == null) return true;
char flag = dictionary.affixData(lastAffix, Dictionary.AFFIX_FLAG);
int prefixId = innerPrefix >= 0 ? innerPrefix : outerPrefix;
for (int i = 0; i < forms.length; i += formStep) {
int entryId = forms.ints[forms.offset + i];
if (dictionary.hasFlag(entryId, flag)
|| dictionary.isFlagAppendedByAffix(prefixId, flag)) {
if (innerPrefix < 0 && outerPrefix >= 0) {
char prefixFlag = dictionary.affixData(outerPrefix, Dictionary.AFFIX_FLAG);
if (!dictionary.hasFlag(entryId, prefixFlag)
&& !dictionary.isFlagAppendedByAffix(lastAffix, prefixFlag)) {
continue;
}
}
if (!isRootCompatibleWithContext(context, lastAffix, entryId)) {
continue;
}
if (!processor.processRoot(
new CharsRef(word, offset, length),
entryId,
morphDataId(forms, i),
outerPrefix,
innerPrefix,
outerSuffix,
innerSuffix)) {
return false;
}
}
}
return true;
}
};
return removeAffixes(word, offset, length, true, -1, -1, -1, stemProcessor);
}
/**
@ -277,9 +327,20 @@ final class Stemmer {
* Dictionary#hasFlag(int, char)}
* @param morphDataId the id of the custom morphological data (0 if none), to be used with
* {@link Dictionary#morphData}
* @param outerPrefix the id of the outer prefix applied to the stem, or -1 if none
* @param innerPrefix the id of the inner prefix applied to the stem, or -1 if none
* @param outerSuffix the id of the outer suffix applied to the stem, or -1 if none
* @param innerSuffix the id of the inner suffix applied to the stem, or -1 if none
* @return whether the processing should be continued
*/
boolean processRoot(CharsRef stem, int formID, int morphDataId);
boolean processRoot(
CharsRef stem,
int formID,
int morphDataId,
int outerPrefix,
int innerPrefix,
int outerSuffix,
int innerSuffix);
}
private String stemException(int morphDataId) {
@ -318,33 +379,23 @@ final class Stemmer {
}
/**
* Generates a list of stems for the provided word
* Generates a list of stems for the provided word. It's called recursively when applying affixes
* one by one, setting {@code (inner/outer)(Suffix/Prefix)} parameters to non-negative values as
* that happens.
*
* @param word Word to generate the stems for
* @param previous previous affix that was removed (so we dont remove same one twice)
* @param prevFlag Flag from a previous stemming step that need to be cross-checked with any
* affixes in this recursive step
* @param prefixId ID of the most inner removed prefix, so that when removing a suffix, it's also
* checked against the word
* @param recursionDepth current recursiondepth
* @param doPrefix true if we should remove prefixes
* @param previousWasPrefix true if the previous removal was a prefix: if we are removing a
* suffix, and it has no continuation requirements, it's ok. but two prefixes
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @return whether the processing should be continued
*/
private boolean stem(
boolean removeAffixes(
char[] word,
int offset,
int length,
WordContext context,
int previous,
char prevFlag,
int prefixId,
int recursionDepth,
boolean doPrefix,
boolean previousWasPrefix,
RootProcessor processor) {
int outerPrefix,
int innerPrefix,
int outerSuffix,
StemCandidateProcessor processor) {
FST.Arc<IntsRef> arc = new FST.Arc<>();
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
@ -366,11 +417,11 @@ final class Stemmer {
for (int j = 0; j < prefixes.length; j++) {
int prefix = prefixes.ints[prefixes.offset + j];
if (prefix == previous) {
if (prefix == outerPrefix) {
continue;
}
if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
if (isAffixCompatible(prefix, true, outerPrefix, outerSuffix, processor.context)) {
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
if (strippedWord == null) {
continue;
@ -381,12 +432,11 @@ final class Stemmer {
strippedWord,
pureAffix ? offset + i : 0,
pureAffix ? length - i : strippedWord.length,
context,
prefix,
previous,
-1,
recursionDepth,
true,
outerPrefix,
innerPrefix,
outerSuffix,
processor)) {
return false;
}
@ -415,12 +465,11 @@ final class Stemmer {
for (int j = 0; j < suffixes.length; j++) {
int suffix = suffixes.ints[suffixes.offset + j];
if (suffix == previous) {
if (suffix == outerSuffix) {
continue;
}
if (isAffixCompatible(
suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
if (isAffixCompatible(suffix, false, outerPrefix, outerSuffix, processor.context)) {
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
if (strippedWord == null) {
continue;
@ -431,12 +480,11 @@ final class Stemmer {
strippedWord,
pureAffix ? offset : 0,
pureAffix ? i : strippedWord.length,
context,
suffix,
previous,
prefixId,
recursionDepth,
false,
outerPrefix,
innerPrefix,
outerSuffix,
processor)) {
return false;
}
@ -487,14 +535,10 @@ final class Stemmer {
}
private boolean isAffixCompatible(
int affix,
char prevFlag,
int recursionDepth,
boolean isPrefix,
boolean previousWasPrefix,
WordContext context) {
int affix, boolean isPrefix, int outerPrefix, int outerSuffix, WordContext context) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
boolean previousWasPrefix = outerSuffix < 0 && outerPrefix >= 0;
if (context.isCompound()) {
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
return false;
@ -513,79 +557,70 @@ final class Stemmer {
return false;
}
if (recursionDepth == 0) {
if (outerPrefix == -1 && outerSuffix == -1) {
return true;
}
if (dictionary.isCrossProduct(affix)) {
// cross check incoming continuation class (flag of previous affix) against list.
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
// cross-check incoming continuation class (flag of previous affix) against this affix's flags
if (previousWasPrefix) return true;
if (outerSuffix >= 0) {
char prevFlag = dictionary.affixData(outerSuffix, Dictionary.AFFIX_FLAG);
return dictionary.hasFlag(append, prevFlag);
}
}
return false;
}
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
* Applies the affix rule to the given word, producing a list of stems if any are found.
* Non-negative {@code (inner/outer)(Suffix/Prefix)} parameters indicate the already applied
* affixes.
*
* @param strippedWord Char array containing the word with the affix removed and the strip added
* @param word Char array containing the word with the affix removed and the strip added
* @param offset where the word actually starts in the array
* @param length the length of the stripped word
* @param affix HunspellAffix representing the affix rule itself
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the
* suffix, unless both are compatible so we must check dictionary form against both to add it
* as a stem!
* @param recursionDepth current recursion depth
* @param affix the id of the affix in {@link Dictionary#affixData}
* @param prefix true if we are removing a prefix (false if it's a suffix)
* @return whether the processing should be continued
*/
private boolean applyAffix(
char[] strippedWord,
char[] word,
int offset,
int length,
WordContext context,
int affix,
int previousAffix,
int prefixId,
int recursionDepth,
boolean prefix,
RootProcessor processor) {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
int outerPrefix,
int innerPrefix,
int outerSuffix,
StemCandidateProcessor processor) {
int prefixId = innerPrefix >= 0 ? innerPrefix : outerPrefix;
int previousAffix = outerSuffix >= 0 ? outerSuffix : prefixId;
int innerSuffix = -1;
if (prefix) {
if (outerPrefix < 0) outerPrefix = affix;
else innerPrefix = affix;
} else {
if (outerSuffix < 0) outerSuffix = affix;
else innerSuffix = affix;
}
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
int entryId = forms.ints[forms.offset + i];
if (dictionary.hasFlag(entryId, flag) || isFlagAppendedByAffix(prefixId, flag)) {
// confusing: in this one exception, we already chained the first prefix against the
// second,
// so it doesnt need to be checked against the word
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
if (!chainedPrefix && prefixId >= 0) {
char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
if (!dictionary.hasFlag(entryId, prefixFlag)
&& !isFlagAppendedByAffix(affix, prefixFlag)) {
continue;
}
}
if (!isRootCompatibleWithContext(context, affix, entryId)) {
continue;
}
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
if (!skipLookup
&& !processor.processStemCandidate(
word, offset, length, affix, outerPrefix, innerPrefix, outerSuffix, innerSuffix)) {
return false;
}
}
}
}
int recursionDepth =
(outerSuffix >= 0 ? 1 : 0) + (innerPrefix >= 0 ? 2 : outerPrefix >= 0 ? 1 : 0) - 1;
if (dictionary.isCrossProduct(affix) && recursionDepth <= 1) {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
boolean doPrefix;
if (recursionDepth == 0) {
if (prefix) {
prefixId = affix;
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStagePrefix(flag);
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
@ -599,33 +634,42 @@ final class Stemmer {
return true;
}
} else {
doPrefix = false;
if (prefix && dictionary.complexPrefixes) {
prefixId = affix;
doPrefix = true;
// we took away the second prefix: go look for another suffix
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageSuffix(flag)) {
return true;
}
} else {
// we took away a prefix, then a suffix: go look for another suffix
doPrefix = false;
}
}
return stem(
strippedWord,
offset,
length,
context,
affix,
flag,
prefixId,
recursionDepth + 1,
doPrefix,
prefix,
processor);
return removeAffixes(
word, offset, length, doPrefix, outerPrefix, innerPrefix, outerSuffix, processor);
}
return true;
}
/**
 * A callback that receives the stem candidates produced by stripping affixes from a word. It
 * also carries the {@link WordContext} that the affix removal code reads when checking whether an
 * affix is compatible.
 */
abstract static class StemCandidateProcessor {
// read by the enclosing class as processor.context during affix compatibility checks
private final WordContext context;
/** @param context the context in which the word is being stemmed */
StemCandidateProcessor(WordContext context) {
this.context = context;
}
/**
 * Processes a stem candidate remaining after some affixes have been stripped.
 *
 * @param word the char array containing the candidate
 * @param offset where the candidate starts in the array
 * @param length the length of the candidate
 * @param lastAffix the id of the affix that was stripped last
 * @param outerPrefix the id of the outer stripped prefix, or -1 if none
 * @param innerPrefix the id of the inner stripped prefix, or -1 if none
 * @param outerSuffix the id of the outer stripped suffix, or -1 if none
 * @param innerSuffix the id of the inner stripped suffix, or -1 if none
 * @return whether the processing should be continued
 */
abstract boolean processStemCandidate(
char[] word,
int offset,
int length,
int lastAffix,
int outerPrefix,
int innerPrefix,
int outerSuffix,
int innerSuffix);
}
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
return false;
@ -633,39 +677,32 @@ final class Stemmer {
if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
char cFlag = context.requiredFlag(dictionary);
return dictionary.hasFlag(entryId, cFlag)
|| isFlagAppendedByAffix(lastAffix, cFlag)
|| dictionary.isFlagAppendedByAffix(lastAffix, cFlag)
|| dictionary.hasFlag(entryId, dictionary.compoundFlag)
|| isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
|| dictionary.isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
}
return true;
}
private boolean callProcessor(
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
CharsRef stem = new CharsRef(word, offset, length);
int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId);
private int morphDataId(IntsRef forms, int i) {
return dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
}
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
char circumfix = dictionary.circumfix;
// if circumfix was previously set by a prefix, we must check this suffix,
// to ensure it has it, and vice versa
if (isSuffix
&& isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
if (isSuffix) {
if (dictionary.isFlagAppendedByAffix(prefixId, circumfix)
!= dictionary.isFlagAppendedByAffix(affix, circumfix)) {
return true;
}
if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
}
if (dictionary.isFlagAppendedByAffix(affix, dictionary.needaffix)) {
return !isSuffix
|| previousAffix < 0
|| isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
|| dictionary.isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
}
return false;
}
/**
 * Tells whether the flags appended by the given affix include {@code flag}. A negative affix id
 * or an unset flag always yields {@code false}.
 */
private boolean isFlagAppendedByAffix(int affixId, char flag) {
  if (affixId < 0) {
    return false;
  }
  if (flag == Dictionary.FLAG_UNSET) {
    return false;
  }
  int appendedFlagsId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
  return dictionary.hasFlag(appendedFlagsId, flag);
}
}

View File

@ -0,0 +1,487 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.hunspell.AffixedWord.Affix;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
/**
* A utility class used for generating possible word forms by adding affixes to stems ({@link
* #getAllWordForms(String, String, Runnable)}), and suggesting stems and flags to generate the
* given set of words ({@link #compress(List, Set, Runnable)}).
*/
public class WordFormGenerator {
private final Dictionary dictionary;
private final Map<Character, List<AffixEntry>> affixes = new HashMap<>();
private final Stemmer stemmer;
/**
 * Creates a generator for the given dictionary: indexes its prefix and suffix entries by flag
 * and prepares a stemmer over the same dictionary.
 */
public WordFormGenerator(Dictionary dictionary) {
  this.dictionary = dictionary;
  this.fillAffixMap(dictionary.prefixes, AffixKind.PREFIX);
  this.fillAffixMap(dictionary.suffixes, AffixKind.SUFFIX);
  this.stemmer = new Stemmer(dictionary);
}
/**
 * Walks every entry of the given affix FST and registers an {@link AffixEntry} for each affix id
 * found there, grouped by the affix flag. Does nothing when {@code fst} is {@code null}.
 *
 * @throws UncheckedIOException wrapping any {@link IOException} raised while enumerating the FST
 */
private void fillAffixMap(FST<IntsRef> fst, AffixKind kind) {
  if (fst == null) {
    return;
  }
  IntsRefFSTEnum<IntsRef> enumerator = new IntsRefFSTEnum<>(fst);
  try {
    for (IntsRefFSTEnum.InputOutput<IntsRef> pair = enumerator.next();
        pair != null;
        pair = enumerator.next()) {
      IntsRef ids = pair.output;
      for (int k = 0; k < ids.length; k++) {
        int affixId = ids.ints[ids.offset + k];
        char affixFlag = dictionary.affixData(affixId, AFFIX_FLAG);
        String affixText = toString(kind, pair.input);
        AffixEntry entry =
            new AffixEntry(
                affixId, affixFlag, kind, affixText, strip(affixId), condition(affixId));
        affixes.computeIfAbsent(affixFlag, key -> new ArrayList<>()).add(entry);
      }
    }
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
/**
 * Converts an FST key into the affix text. Prefix keys are copied as they are; for any other
 * kind the characters are written in reverse order.
 */
private String toString(AffixKind kind, IntsRef input) {
  int size = input.length;
  boolean keepOrder = kind == AffixKind.PREFIX;
  char[] text = new char[size];
  for (int src = 0; src < size; src++) {
    int dst = keepOrder ? src : size - src - 1;
    text[dst] = (char) input.ints[input.offset + src];
  }
  return new String(text);
}
/**
 * Resolves the condition of the given affix: index 0 stands for {@link
 * AffixCondition#ALWAYS_TRUE}, any other index is looked up in the dictionary patterns.
 */
private AffixCondition condition(int affixId) {
  int patternIndex = dictionary.getAffixCondition(affixId);
  if (patternIndex == 0) {
    return AffixCondition.ALWAYS_TRUE;
  }
  return dictionary.patterns.get(patternIndex);
}
/**
 * Returns the strip string of the given affix, cut out of the dictionary's shared strip data
 * using two consecutive entries of the strip offset table.
 */
private String strip(int affixId) {
  int ord = dictionary.affixData(affixId, Dictionary.AFFIX_STRIP_ORD);
  int from = dictionary.stripOffsets[ord];
  int to = dictionary.stripOffsets[ord + 1];
  int count = to - from;
  return new String(dictionary.stripData, from, count);
}
/**
 * Produces every word form derivable from the dictionary entries whose root is {@code root}, in
 * a stable but unspecified order. The behavior corresponds to "unmunch" from the "hunspell-tools"
 * package.
 *
 * @param checkCanceled a hook invoked periodically during generation; it may throw an exception
 *     to interrupt the process
 * @return the distinct generated forms; empty when the dictionary has no entry for {@code root}
 */
public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
  DictEntries entries = dictionary.lookupEntries(root);
  if (entries == null) {
    return new ArrayList<>();
  }
  // A linked set removes duplicates while keeping the first-seen order.
  Set<AffixedWord> distinctForms = new LinkedHashSet<>();
  for (DictEntry entry : entries) {
    List<AffixedWord> formsOfEntry = getAllWordForms(root, entry.getFlags(), checkCanceled);
    distinctForms.addAll(formsOfEntry);
  }
  return new ArrayList<>(distinctForms);
}
/**
 * Produces every word form of {@code stem} as if it were a dictionary entry carrying {@code
 * flags} (written in the dictionary's own flag format), in a stable but unspecified order. The
 * behavior corresponds to "unmunch" from the "hunspell-tools" package.
 *
 * @param checkCanceled a hook invoked periodically during generation; it may throw an exception
 *     to interrupt the process
 * @return the distinct generated forms; an empty list when the flags exclude the stem from
 *     consideration altogether
 */
public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
  LinkedHashSet<Character> flagSet = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags));
  if (!shouldConsiderAtAll(flagSet)) {
    return List.of();
  }

  AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
  checkCanceled.run();

  LinkedHashSet<AffixedWord> forms = new LinkedHashSet<>();
  boolean stemNeedsAffix = flagSet.contains(dictionary.needaffix);
  if (!stemNeedsAffix) {
    // the stem itself is a word only when it isn't marked as requiring an affix
    forms.add(bare);
  }
  forms.addAll(expand(bare, flagSet, checkCanceled));
  return new ArrayList<>(forms);
}
/**
 * Checks that the derived word is usable: neither the word itself nor any stem candidate seen
 * while removing its affixes is a forbidden word, and one of those candidates equals the stem of
 * the entry the word was derived from.
 */
private boolean canStemToOriginal(AffixedWord derived) {
  char[] wordChars = derived.getWord().toCharArray();
  if (isForbiddenWord(wordChars, 0, wordChars.length)) {
    return false;
  }

  String expectedStem = derived.getDictEntry().getStem();
  var tracker =
      new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
        boolean foundStem = false;
        boolean foundForbidden = false;

        @Override
        boolean processStemCandidate(
            char[] chars,
            int offset,
            int length,
            int lastAffix,
            int outerPrefix,
            int innerPrefix,
            int outerSuffix,
            int innerSuffix) {
          if (isForbiddenWord(chars, offset, length)) {
            foundForbidden = true;
            return false;
          }
          // compare lengths first to avoid allocating a string for obvious mismatches
          if (length == expectedStem.length()
              && expectedStem.equals(new String(chars, offset, length))) {
            foundStem = true;
          }
          return !foundStem;
        }
      };
  stemmer.removeAffixes(wordChars, 0, wordChars.length, true, -1, -1, -1, tracker);
  if (tracker.foundForbidden) {
    return false;
  }
  return tracker.foundStem;
}
/**
 * Tells whether the dictionary contains the given character range as a word with at least one
 * form carrying the forbidden-word flag. Always {@code false} when that flag is unset.
 */
private boolean isForbiddenWord(char[] chars, int offset, int length) {
  if (dictionary.forbiddenword == FLAG_UNSET) {
    return false;
  }
  IntsRef forms = dictionary.lookupWord(chars, offset, length);
  if (forms == null) {
    return false;
  }
  for (int pos = 0; pos < forms.length; pos += dictionary.formStep()) {
    int entryId = forms.ints[forms.offset + pos];
    if (dictionary.hasFlag(entryId, dictionary.forbiddenword)) {
      return true;
    }
  }
  return false;
}
/** Collects the given flags into a set that keeps their first-occurrence order. */
private static LinkedHashSet<Character> toSet(char[] flags) {
  LinkedHashSet<Character> ordered = new LinkedHashSet<>();
  for (int i = 0; i < flags.length; i++) {
    ordered.add(flags[i]);
  }
  return ordered;
}
/**
 * Applies every affix registered under the given flags to {@code stem} and gathers the resulting
 * words. Cross-product affixes are expanded recursively with the applied flag replaced by the
 * flags that the affix appends.
 */
private LinkedHashSet<AffixedWord> expand(
    AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
  LinkedHashSet<AffixedWord> forms = new LinkedHashSet<>();
  for (Character flag : flags) {
    List<AffixEntry> candidates = affixes.get(flag);
    if (candidates == null) {
      continue;
    }
    for (AffixEntry affix : candidates) {
      checkCanceled.run();
      AffixedWord derived = affix.apply(stem, dictionary);
      if (derived == null) {
        continue;
      }
      LinkedHashSet<Character> appended = appendFlags(affix);
      if (!shouldConsiderAtAll(appended)) {
        continue;
      }
      if (canStemToOriginal(derived)) {
        forms.add(derived);
      }
      if (dictionary.isCrossProduct(affix.id)) {
        LinkedHashSet<Character> nextFlags = updateFlags(flags, flag, appended);
        forms.addAll(expand(derived, nextFlags, checkCanceled));
      }
    }
  }
  return forms;
}
/**
 * Returns {@code false} when the flags contain any of the compound-begin, compound-middle,
 * compound-end, forbidden-word or only-in-compound flags of the dictionary.
 */
private boolean shouldConsiderAtAll(Set<Character> flags) {
  boolean excluded =
      flags.contains(dictionary.compoundBegin)
          || flags.contains(dictionary.compoundMiddle)
          || flags.contains(dictionary.compoundEnd)
          || flags.contains(dictionary.forbiddenword)
          || flags.contains(dictionary.onlyincompound);
  return !excluded;
}
/**
 * Builds a new flag set from {@code flags} without {@code toRemove}, followed by the flags in
 * {@code toAppend}. The arguments are left untouched.
 */
private LinkedHashSet<Character> updateFlags(
    Set<Character> flags, Character toRemove, Set<Character> toAppend) {
  LinkedHashSet<Character> updated = new LinkedHashSet<>(flags);
  updated.removeIf(existing -> existing.equals(toRemove));
  for (Character extra : toAppend) {
    updated.add(extra);
  }
  return updated;
}
/**
 * Returns the flags appended by the given affix, or a fresh empty set when the affix has no
 * append data (id 0).
 */
private LinkedHashSet<Character> appendFlags(AffixEntry affix) {
  char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
  if (appendId <= 0) {
    return new LinkedHashSet<>();
  }
  char[] appended = dictionary.flagLookup.getFlags(appendId);
  return toSet(appended);
}
/**
 * Tries to find a smaller set of dictionary entries (stems with flags) that generates the given
 * words; this corresponds to "munch" from the "hunspell-tools" package. The algorithm attempts
 * to keep low the number of dictionary entries to add or change, the number of flags involved,
 * and the number of additionally generated words that were not requested. All words are taken
 * in the dictionary format and case: no ICONV/OCONV/IGNORE conversions are applied.
 *
 * @param words the list of words to generate
 * @param forbidden the set of words to avoid generating
 * @param checkCanceled a hook invoked periodically during the computation; it may throw an
 *     exception to interrupt the process
 * @return the suggested dictionary entries along with the overgenerated words, or {@code null}
 *     if the algorithm couldn't generate anything (in particular for an empty {@code words})
 * @throws IllegalArgumentException if some word occurs both in {@code words} and {@code
 *     forbidden}
 */
public EntrySuggestion compress(
    List<String> words, Set<String> forbidden, Runnable checkCanceled) {
  if (words.isEmpty()) {
    return null;
  }
  for (String word : words) {
    if (forbidden.contains(word)) {
      throw new IllegalArgumentException("'words' and 'forbidden' shouldn't intersect");
    }
  }
  WordCompressor compressor = new WordCompressor(words, forbidden, checkCanceled);
  return compressor.compress();
}
/**
 * A single affix rule: its id in the dictionary, its flag, whether it is a prefix or a suffix,
 * the text it adds, the text it strips and the condition the stripped stem must satisfy.
 */
private record AffixEntry(
    int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {

  /**
   * Applies this affix to the given word.
   *
   * @return the derived word, or {@code null} if the affix can't be combined with the affixes
   *     already present, the word doesn't start/end with the strip text, or the condition
   *     rejects the stripped word
   */
  AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
    if (!isCompatibleWithPreviousAffixes(stem, dictionary)) {
      return null;
    }

    String word = stem.getWord();
    boolean isPrefix = kind == AffixKind.PREFIX;
    String stripped;
    if (isPrefix) {
      if (!word.startsWith(strip)) {
        return null;
      }
      stripped = word.substring(strip.length());
    } else {
      if (!word.endsWith(strip)) {
        return null;
      }
      stripped = word.substring(0, word.length() - strip.length());
    }
    if (!condition.acceptsStem(stripped)) {
      return null;
    }

    String applied = isPrefix ? affix + stripped : stripped + affix;
    List<Affix> prefixes = new ArrayList<>(stem.getPrefixes());
    List<Affix> suffixes = new ArrayList<>(stem.getSuffixes());
    // the newest affix always goes to the front of its list
    List<Affix> target = isPrefix ? prefixes : suffixes;
    target.add(0, new Affix(dictionary, id));
    return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
  }

  /**
   * Checks the stacking rules: at most two affixes of the same kind, a second prefix only with
   * complex prefixes enabled, no suffix once a prefix has been applied, and a second affix of
   * the same kind only if the first one appends this affix's flag.
   */
  private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) {
    boolean isPrefix = kind == AffixKind.PREFIX;
    List<Affix> sameKind = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
    if (sameKind.size() == 2) {
      return false;
    }
    if (isPrefix) {
      if (sameKind.size() == 1 && !dictionary.complexPrefixes) {
        return false;
      }
    } else if (!stem.getPrefixes().isEmpty()) {
      return false;
    }
    return sameKind.size() != 1
        || dictionary.isFlagAppendedByAffix(sameKind.get(0).affixId, flag);
  }
}
private class WordCompressor {
private final Comparator<State> solutionFitness =
Comparator.comparingInt((State s) -> s.forbidden)
.thenComparingInt(s -> s.underGenerated)
.thenComparingInt(s -> s.stemToFlags.size())
.thenComparingInt(s -> s.overGenerated);
private final Set<String> forbidden;
private final Runnable checkCanceled;
private final Set<String> wordSet;
private final Set<String> existingStems;
private final Map<String, Set<FlagSet>> stemToPossibleFlags = new HashMap<>();
private final Map<String, Integer> stemCounts = new LinkedHashMap<>();
/**
 * Collects the stem candidates for every requested word: how many times each candidate was met,
 * which affix flag combinations lead to it, and which candidates already exist in the
 * dictionary. Each word is also registered as a candidate stem of itself.
 */
WordCompressor(List<String> words, Set<String> forbidden, Runnable checkCanceled) {
  this.forbidden = forbidden;
  this.checkCanceled = checkCanceled;
  this.wordSet = new HashSet<>(words);

  Stemmer.StemCandidateProcessor collector =
      new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
        @Override
        boolean processStemCandidate(
            char[] word,
            int offset,
            int length,
            int lastAffix,
            int outerPrefix,
            int innerPrefix,
            int outerSuffix,
            int innerSuffix) {
          String candidate = new String(word, offset, length);
          stemCounts.merge(candidate, 1, Integer::sum);

          // negative ids denote absent affixes and contribute no flag
          Set<Character> flags = new LinkedHashSet<>();
          int[] affixIds = {outerPrefix, innerPrefix, outerSuffix, innerSuffix};
          for (int affixId : affixIds) {
            if (affixId >= 0) {
              flags.add(dictionary.affixData(affixId, AFFIX_FLAG));
            }
          }

          Set<FlagSet> known =
              stemToPossibleFlags.computeIfAbsent(candidate, key -> new LinkedHashSet<>());
          known.add(new FlagSet(flags, dictionary));
          return true;
        }
      };

  for (String word : words) {
    checkCanceled.run();
    stemCounts.merge(word, 1, Integer::sum);
    stemToPossibleFlags.computeIfAbsent(word, key -> new LinkedHashSet<>());
    char[] chars = word.toCharArray();
    stemmer.removeAffixes(chars, 0, chars.length, true, -1, -1, -1, collector);
  }

  Set<String> known = new HashSet<>();
  for (String stem : stemCounts.keySet()) {
    if (dictionary.lookupEntries(stem) != null) {
      known.add(stem);
    }
  }
  this.existingStems = known;
}
/**
 * Runs a best-first search over {@link State}s ordered by {@code solutionFitness}, starting from
 * a state with no stems, where every requested word counts as under-generated.
 *
 * @return the suggestion built from the best state found that generates all requested words, or
 *     {@code null} if the queue is exhausted without finding such a state
 */
EntrySuggestion compress() {
// The trailing reversed() flips the whole chain: stems already present in the dictionary come
// first, and within each group the stems with higher counts come first.
Comparator<String> stemSorter =
Comparator.comparing((String s) -> existingStems.contains(s))
.thenComparing(stemCounts::get)
.reversed();
List<String> sortedStems = stemCounts.keySet().stream().sorted(stemSorter).toList();
PriorityQueue<State> queue = new PriorityQueue<>(solutionFitness);
queue.offer(new State(Map.of(), wordSet.size(), 0, 0));
State result = null;
while (!queue.isEmpty()) {
State state = queue.poll();
if (state.underGenerated == 0) {
// A complete solution: remember it if it beats the best one so far, and stop searching as soon
// as a complete solution generates no forbidden words. Complete states are never expanded.
if (result == null || solutionFitness.compare(state, result) < 0) result = state;
if (state.forbidden == 0) break;
continue;
}
// Successors, part 1: add one more stem (initially without flags).
for (String stem : sortedStems) {
if (!state.stemToFlags.containsKey(stem)) {
queue.offer(addStem(state, stem));
}
}
// Successors, part 2: give an already chosen stem one more of its possible flag sets.
for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
for (FlagSet flags : stemToPossibleFlags.get(entry.getKey())) {
if (!entry.getValue().contains(flags)) {
queue.offer(addFlags(state, entry.getKey(), flags));
}
}
}
}
return result == null ? null : toSuggestion(result);
}
/**
 * Converts a search state into a suggestion: one entry per chosen stem carrying all its flags,
 * plus, when the dictionary has a forbidden-word flag, an entry with that flag for every
 * generated word that the caller forbade. The remaining non-requested generated words are
 * reported as extra.
 */
EntrySuggestion toSuggestion(State state) {
  List<DictEntry> toEdit = new ArrayList<>();
  List<DictEntry> toAdd = new ArrayList<>();
  for (Map.Entry<String, Set<FlagSet>> chosen : state.stemToFlags.entrySet()) {
    Set<Character> allFlags = FlagSet.flatten(chosen.getValue());
    addEntry(toEdit, toAdd, chosen.getKey(), allFlags);
  }

  List<String> extraGenerated = new ArrayList<>();
  List<String> generated = allGenerated(state.stemToFlags).distinct().sorted().toList();
  for (String word : generated) {
    if (wordSet.contains(word)) {
      continue;
    }
    boolean canBeForbidden = forbidden.contains(word) && dictionary.forbiddenword != FLAG_UNSET;
    if (canBeForbidden) {
      addEntry(toEdit, toAdd, word, Set.of(dictionary.forbiddenword));
    } else {
      extraGenerated.add(word);
    }
  }
  return new EntrySuggestion(toEdit, toAdd, extraGenerated);
}
/**
 * Creates an entry for the stem with the given flags and appends it to {@code toEdit} if the
 * stem already exists in the dictionary, otherwise to {@code toAdd}.
 */
private void addEntry(
    List<DictEntry> toEdit, List<DictEntry> toAdd, String stem, Set<Character> flags) {
  DictEntry entry = DictEntry.create(stem, toFlagString(flags));
  if (existingStems.contains(stem)) {
    toEdit.add(entry);
  } else {
    toAdd.add(entry);
  }
}
/** Derives a state that additionally contains {@code stem} with no flag sets assigned yet. */
private State addStem(State state, String stem) {
  LinkedHashMap<String, Set<FlagSet>> extended = new LinkedHashMap<>(state.stemToFlags);
  extended.put(stem, Set.of());
  State successor = newState(extended);
  return successor;
}
/**
 * Derives a state in which {@code stem} (already present in {@code state}) gets one more flag
 * set. The maps and sets of the original state are copied, not modified.
 */
private State addFlags(State state, String stem, FlagSet flags) {
  Set<FlagSet> widened = new LinkedHashSet<>(state.stemToFlags.get(stem));
  widened.add(flags);
  LinkedHashMap<String, Set<FlagSet>> updated = new LinkedHashMap<>(state.stemToFlags);
  updated.put(stem, widened);
  return newState(updated);
}
/**
 * Creates a state for the given stem-to-flags assignment, counting the requested words it fails
 * to generate, the generated words that were not requested, and the generated words that are
 * forbidden.
 */
private State newState(Map<String, Set<FlagSet>> stemToFlags) {
  Set<String> generated = allGenerated(stemToFlags).collect(Collectors.toSet());

  int missing = 0;
  for (String requested : wordSet) {
    if (!generated.contains(requested)) {
      missing++;
    }
  }

  int unrequested = 0;
  int prohibited = 0;
  for (String word : generated) {
    if (!wordSet.contains(word)) {
      unrequested++;
    }
    if (forbidden.contains(word)) {
      prohibited++;
    }
  }
  return new State(stemToFlags, missing, unrequested, prohibited);
}
// Remembers the words generated for each stem/flag-sets pair, so that states sharing an entry
// don't expand it again.
private final Map<StemWithFlags, List<String>> expansionCache = new HashMap<>();

/** The key of {@code expansionCache}: a stem together with the flag sets assigned to it. */
private record StemWithFlags(String stem, Set<FlagSet> flags) {}

/**
 * Streams the words generated by all entries of the given assignment. The same word may occur
 * several times if more than one entry generates it.
 */
private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
  Function<StemWithFlags, List<String>> expandToWords =
      key -> {
        Set<Character> flags = FlagSet.flatten(key.flags);
        return expand(key.stem, flags).stream().map(AffixedWord::getWord).toList();
      };
  return stemToFlags.entrySet().stream()
      .map(entry -> new StemWithFlags(entry.getKey(), entry.getValue()))
      .map(key -> expansionCache.computeIfAbsent(key, expandToWords))
      .flatMap(List::stream);
}
/** Generates all word forms of the stem as if it carried exactly the given flags. */
private List<AffixedWord> expand(String stem, Set<Character> flagSet) {
  String printedFlags = toFlagString(flagSet);
  return getAllWordForms(stem, printedFlags, checkCanceled);
}
/** Prints the flags, sorted, using the flag format of the dictionary. */
private String toFlagString(Set<Character> flagSet) {
  char[] sortedFlags = Dictionary.toSortedCharArray(flagSet);
  return dictionary.flagParsingStrategy.printFlags(sortedFlags);
}
}
/**
 * A combination of affix flags, together with the dictionary whose flag format is used to print
 * it.
 */
private record FlagSet(Set<Character> flags, Dictionary dictionary) {

  /** Merges the flags of all given flag sets into a single set. */
  static Set<Character> flatten(Set<FlagSet> flagSets) {
    Set<Character> merged = new HashSet<>();
    for (FlagSet flagSet : flagSets) {
      merged.addAll(flagSet.flags);
    }
    return merged;
  }

  /** Prints the flags, sorted, in the flag format of the dictionary. */
  @Override
  public String toString() {
    char[] sortedFlags = Dictionary.toSortedCharArray(flags);
    return dictionary.flagParsingStrategy.printFlags(sortedFlags);
  }
}
/**
 * A node of the compression search: an assignment of flag sets to stems plus the counters used
 * to compare such assignments.
 *
 * @param stemToFlags the chosen stems, each mapped to the flag sets assigned to it
 * @param underGenerated the number of requested words that the assignment does not generate
 * @param overGenerated the number of generated words that were not requested
 * @param forbidden the number of generated words that belong to the forbidden set
 */
private record State(
Map<String, Set<FlagSet>> stemToFlags,
int underGenerated,
int overGenerated,
int forbidden) {}
}

View File

@ -28,6 +28,7 @@ import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
@ -275,7 +276,9 @@ public class TestDictionary extends LuceneTestCase {
DictEntries simpleNoun = dic.lookupEntries("simplenoun");
assertEquals(1, simpleNoun.size());
assertEquals(Collections.emptyList(), simpleNoun.getMorphologicalValues(0, "aa:"));
assertEquals(Collections.singletonList("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
assertEquals(List.of("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
assertEquals(List.of("42"), simpleNoun.get(0).getMorphologicalValues("fr:"));
assertEquals("A", simpleNoun.get(0).getFlags());
DictEntries lay = dic.lookupEntries("lay");
String actual =

View File

@ -24,8 +24,13 @@ import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
import java.io.IOException;
import java.text.ParseException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;
@ -72,9 +77,134 @@ public class TestHunspell extends LuceneTestCase {
@Test
public void testStemmingApi() throws Exception {
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
Hunspell hunspell = loadNoTimeout("simple");
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
}
/**
 * Checks that an unknown word has no analyses and that a word with both a prefix and a suffix is
 * analyzed into its stem and affix flags.
 */
@Test
public void testAnalysisApi() throws Exception {
  Hunspell hunspell = loadNoTimeout("base");
  // JUnit expects (expected, actual); the reverse order mislabels the values on failure
  assertEquals(List.of(), hunspell.analyzeSimpleWord("nonexistent"));
  AffixedWord word = hunspell.analyzeSimpleWord("recreated").get(0);
  checkAffixedWord(word, "create", List.of("A"), List.of("D"));
}
/** Checks the analysis of a word that has one prefix and two stacked suffixes. */
@Test
public void testAnalysisSeveralSuffixes() throws Exception {
  List<AffixedWord> analyses =
      loadNoTimeout("needaffix5").analyzeSimpleWord("pseudoprefoopseudosufbar");
  AffixedWord first = analyses.get(0);
  checkAffixedWord(first, "foo", List.of("C"), List.of("B", "A"));
}
/** Checks that affix flags are reported in their printed form in the "flaglong" dictionary. */
@Test
public void testAnalysisFlagLong() throws Exception {
  Hunspell hunspell = loadNoTimeout("flaglong");
  List<AffixedWord> analyses = hunspell.analyzeSimpleWord("foos");
  checkAffixedWord(analyses.get(0), "foo", List.of(), List.of("Y1"));
}
/** Checks that affix flags are reported in their printed form in the "flagnum" dictionary. */
@Test
public void testAnalysisFlagNum() throws Exception {
  Hunspell hunspell = loadNoTimeout("flagnum");
  List<AffixedWord> analyses = hunspell.analyzeSimpleWord("foos");
  checkAffixedWord(analyses.get(0), "foo", List.of(), List.of("65000"));
}
/**
 * Checks that a word with two homonymous stems yields two analyses, each exposing the
 * morphological data of its own dictionary entry.
 */
@Test
public void testAnalysisMorphData() throws Exception {
  List<AffixedWord> words = loadNoTimeout("morphdata").analyzeSimpleWord("works");
  assertEquals(2, words.size());

  // the order of the analyses isn't fixed, so find out which one is the verb
  boolean verbComesFirst =
      words.get(0).getDictEntry().getMorphologicalData().contains("verb");
  AffixedWord verb = words.get(verbComesFirst ? 0 : 1);
  AffixedWord noun = words.get(verbComesFirst ? 1 : 0);
  assertNotNull(verb);
  assertNotNull(noun);

  checkAffixedWord(verb, "work", List.of(), List.of("A"));
  checkAffixedWord(noun, "work", List.of(), List.of("B"));

  DictEntry nounEntry = noun.getDictEntry();
  DictEntry verbEntry = verb.getDictEntry();
  assertEquals(List.of("worknoun"), nounEntry.getMorphologicalValues("st:"));
  assertEquals(List.of("workverb"), verbEntry.getMorphologicalValues("st:"));
  assertEquals("st:worknoun", nounEntry.getMorphologicalData());
  assertEquals("st:workverb", verbEntry.getMorphologicalData());
}
/** Asserts the stem of the analysis and the printed flags of its prefixes and suffixes. */
private void checkAffixedWord(
    AffixedWord word, String stem, List<String> prefixFlags, List<String> suffixFlags) {
  assertEquals(stem, word.getDictEntry().getStem());

  List<String> actualPrefixFlags =
      word.getPrefixes().stream().map(AffixedWord.Affix::getFlag).toList();
  assertEquals(prefixFlags, actualPrefixFlags);

  List<String> actualSuffixFlags =
      word.getSuffixes().stream().map(AffixedWord.Affix::getFlag).toList();
  assertEquals(suffixFlags, actualSuffixFlags);
}
/**
 * Loads the dictionary from {@code name.aff} and {@code name.dic} and wraps it into a speller
 * with {@link TimeoutPolicy#NO_TIMEOUT} and a no-op cancellation check.
 */
private Hunspell loadNoTimeout(String name) throws Exception {
  String affixFile = name + ".aff";
  String dictionaryFile = name + ".dic";
  Dictionary dictionary = loadDictionary(false, affixFile, dictionaryFile);
  return new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
}
/**
 * Checks the word forms generated for "create" in the "base" dictionary, and the expansion of a
 * stem with explicitly given flags, including a stem absent from the dictionary.
 */
@Test
public void testExpandRootApi() throws Exception {
  Hunspell h = loadNoTimeout("base");

  List<String> createFormsBase =
      List.of("create", "created", "creates", "creating", "creation", "creations");
  Stream<String> withPrefixes =
      createFormsBase.stream().flatMap(s -> Stream.of(s, "pro" + s, "re" + s));
  List<String> expected = Stream.concat(withPrefixes, Stream.of("creative")).sorted().toList();

  List<AffixedWord> forms =
      TestSpellChecking.checkExpansionGeneratesCorrectWords(h, "create", "base");
  Map<String, AffixedWord> expanded =
      forms.stream().collect(Collectors.toMap(AffixedWord::getWord, w -> w));
  assertEquals(expected, expanded.keySet().stream().sorted().toList());

  checkAffixedWord(expanded.get("created"), "create", List.of(), List.of("D"));
  checkAffixedWord(expanded.get("recreated"), "create", List.of("A"), List.of("D"));

  WordFormGenerator generator = new WordFormGenerator(h.dictionary);

  Set<String> overrideFlag =
      generator.getAllWordForms("create", "U", () -> {}).stream()
          .map(AffixedWord::getWord)
          .collect(Collectors.toSet());
  assertEquals(Set.of("create", "uncreate"), overrideFlag);

  Set<String> nonExistentRoot =
      generator.getAllWordForms("form", "S", () -> {}).stream()
          .map(AffixedWord::getWord)
          .collect(Collectors.toSet());
  assertEquals(Set.of("form", "forms"), nonExistentRoot);
}
/** Checks the entries suggested for several word lists in the "base" and "compress" dictionaries. */
@Test
public void testCompressingApi() throws Exception {
  Hunspell base = loadNoTimeout("base");

  checkCompression(
      base,
      "toEdit=[create/DGNS], toAdd=[], extra=[]",
      "create",
      "created",
      "creates",
      "creating",
      "creation");
  checkCompression(base, "toEdit=[created], toAdd=[creates], extra=[]", "creates", "created");
  checkCompression(base, "toEdit=[], toAdd=[creation/S], extra=[]", "creation", "creations");
  checkCompression(base, "toEdit=[], toAdd=[abc, def], extra=[]", "abc", "def");
  checkCompression(base, "toEdit=[], toAdd=[form/S], extra=[]", "form", "forms");

  Hunspell compress = loadNoTimeout("compress");
  checkCompression(compress, "toEdit=[], toAdd=[form/X], extra=[forms]", "form", "formx");
}
/** Checks that four related words are covered by a single entry with two flags. */
@Test
public void testCompressingIsMinimal() throws Exception {
  Hunspell compress = loadNoTimeout("compress");
  String expected = "toEdit=[], toAdd=[form/GS], extra=[]";
  checkCompression(compress, expected, "formings", "forming", "form", "forms");
}
/** Checks how a set of forbidden words changes the suggested entries. */
@Test
public void testCompressingWithProhibition() throws Exception {
  Hunspell compress = loadNoTimeout("compress");
  WordFormGenerator gen = new WordFormGenerator(compress.dictionary);

  EntrySuggestion withoutFormx =
      gen.compress(List.of("form", "forms"), Set.of("formx"), () -> {});
  assertEquals("toEdit=[], toAdd=[form/S], extra=[]", withoutFormx.internalsToString());

  EntrySuggestion withoutForms =
      gen.compress(List.of("form", "formx"), Set.of("forms"), () -> {});
  assertEquals("toEdit=[], toAdd=[form, formx], extra=[]", withoutForms.internalsToString());
}
/** Compresses the given words and compares the printed suggestion with the expected text. */
private void checkCompression(Hunspell h, String expected, String... words) {
  EntrySuggestion suggestion = h.compress(List.of(words));
  assertEquals(expected, suggestion.internalsToString());
}
}

View File

@ -21,8 +21,12 @@ import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.IOUtils;
@ -97,6 +101,10 @@ public class TestSpellChecking extends LuceneTestCase {
doTest("compoundflag");
}
/** Runs the standard spell-checking expectations against the "flagutf8" fixture. */
public void testFlagUtf8() throws Exception {
  String fixtureName = "flagutf8";
  doTest(fixtureName);
}
public void testCheckCompoundCase() throws Exception {
doTest("checkcompoundcase");
}
@ -230,13 +238,15 @@ public class TestSpellChecking extends LuceneTestCase {
}
/**
 * Locates the {@code name.aff} resource next to this class and checks the spell-checker
 * expectations for the fixture files sharing that base name.
 */
protected void doTest(String name) throws Exception {
  //noinspection ConstantConditions
  Path affixFile = Path.of(getClass().getResource(name + ".aff").toURI());
  Path basePath = affixFile.getParent().resolve(name);
  checkSpellCheckerExpectations(basePath);
}
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));
Path dicFile = Path.of(basePath + ".dic");
InputStream dictStream = Files.newInputStream(dicFile);
Hunspell speller;
try {
@ -273,5 +283,80 @@ public class TestSpellChecking extends LuceneTestCase {
} else {
assertFalse(".sug file without .wrong file!", Files.exists(sug));
}
Set<String> everythingGenerated = expandWholeDictionary(dicFile, speller);
if (everythingGenerated != null && !speller.dictionary.mayNeedInputCleaning()) {
checkGoodSugWordsAreGenerated(speller, good, sug, everythingGenerated);
}
}
/**
 * Expands every entry of the given {@code .dic} file (skipping its first line) and collects the
 * lower-cased generated words.
 *
 * @return all generated words, or {@code null} if some line had an empty word or a word
 *     containing a backslash and therefore was not expanded
 */
private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
  Set<String> generatedWords = new HashSet<>();
  boolean complete = true;
  try (Stream<String> lines = Files.lines(dic, speller.dictionary.decoder.charset())) {
    List<String> entries = lines.skip(1).toList();
    for (String entry : entries) {
      // the word ends at the first whitespace or at the slash that introduces the flags
      int end = 0;
      while (end < entry.length()
          && !Character.isWhitespace(entry.charAt(end))
          && entry.charAt(end) != '/') {
        end++;
      }
      String word = entry.substring(0, end).trim();
      if (word.isEmpty() || word.contains("\\")) {
        complete = false;
        continue;
      }

      for (AffixedWord form : checkExpansionGeneratesCorrectWords(speller, word, dic.toString())) {
        generatedWords.add(form.getWord().toLowerCase(Locale.ROOT));
      }
    }
  }
  if (!complete) {
    return null;
  }
  return generatedWords;
}
/**
 * Asserts that every lower-case word from the {@code good} and {@code sug} files that the
 * speller can analyze as a simple word is among the generated words. Suggestions containing a
 * space are ignored.
 */
private static void checkGoodSugWordsAreGenerated(
    Hunspell speller, Path good, Path sug, Set<String> everythingGenerated) throws IOException {
  Set<String> goodWords = new HashSet<>();
  if (Files.exists(good)) {
    for (String line : Files.readAllLines(good)) {
      goodWords.add(line.trim());
    }
  }
  if (Files.exists(sug)) {
    for (String line : Files.readAllLines(sug)) {
      for (String part : line.split(", ")) {
        String suggestion = part.trim();
        if (!suggestion.contains(" ")) {
          goodWords.add(suggestion);
        }
      }
    }
  }

  goodWords.removeAll(everythingGenerated);
  goodWords.removeIf(
      s -> !s.equals(s.toLowerCase(Locale.ROOT)) || speller.analyzeSimpleWord(s).isEmpty());
  assertTrue("Some *.good/sug words weren't generated: " + goodWords, goodWords.isEmpty());
}
/**
 * Expands the stem and fails if any generated word is rejected by the speller or can't be
 * analyzed as a simple word. If the stem itself is among the generated words, also checks that
 * compressing all generated words suggests editing an entry with that very stem.
 *
 * @return the generated word forms
 */
static List<AffixedWord> checkExpansionGeneratesCorrectWords(
    Hunspell hunspell, String stem, String baseName) {
  List<AffixedWord> expanded = hunspell.getAllWordForms(stem);

  Set<AffixedWord> misspelled = new HashSet<>();
  for (AffixedWord form : expanded) {
    String text = form.getWord();
    boolean accepted = hunspell.spell(text) && !hunspell.analyzeSimpleWord(text).isEmpty();
    if (!accepted) {
      misspelled.add(form);
    }
  }
  if (!misspelled.isEmpty()) {
    fail("Misspelled words generated in " + baseName + ": " + misspelled);
  }

  boolean stemGenerated = expanded.stream().anyMatch(e -> e.getWord().equals(stem));
  if (!stemGenerated) {
    return expanded;
  }

  List<String> generatedWords = expanded.stream().map(AffixedWord::getWord).toList();
  EntrySuggestion suggestion = hunspell.compress(generatedWords);
  if (suggestion != null) {
    String message =
        "Compression suggests a different stem from the original "
            + stem
            + " in "
            + baseName
            + ":"
            + suggestion;
    boolean sameStemSuggested =
        suggestion.getEntriesToEdit().stream().anyMatch(e -> e.getStem().equals(stem));
    assertTrue(message, sameStemSuggested);
  }
  return expanded;
}
}

View File

@ -0,0 +1,14 @@
FORBIDDENWORD *
SFX G Y 1
SFX G 0 ing/S .
SFX J Y 1
SFX J 0 ings .
SFX S Y 1
SFX S 0 s .
SFX X Y 2
SFX X 0 s .
SFX X 0 x .

View File

@ -0,0 +1,15 @@
# UTF-8 flags
FLAG UTF-8
SFX A Y 1
SFX A 0 s/ÖüÜ .
#SFX A 0 s/ÖüÖÜ .
SFX Ö Y 1
SFX Ö 0 bar .
SFX ü Y 1
SFX ü 0 baz .
PFX Ü Y 1
PFX Ü 0 un .

View File

@ -0,0 +1,8 @@
foo
foos
foosbar
foosbaz
unfoo
unfoos
unfoosbar
unfoosbaz

View File

@ -9,3 +9,6 @@ COMPOUNDFLAG Y
SFX A Y 1
SFX A 0 s .
SFX s N 1
SFX s 0 os .

View File

@ -1,4 +1,4 @@
11
14
foo/S
foo/YX
bar/YS
@ -11,3 +11,5 @@ cm
Cm/X
SIPS/X
Sip/A
iPod/s
iPodos/X