mirror of https://github.com/apache/lucene.git
LUCENE-10626: Hunspell: add tools to aid dictionary editing: analysis introspection, stem expansion and stem/flag suggestion (#975)
This commit is contained in:
parent
3dd9a5487c
commit
d537013e70
|
@ -36,6 +36,9 @@ New Features
|
|||
|
||||
* LUCENE-10151 Enable timeout support in IndexSearcher. (Deepika Sharma)
|
||||
|
||||
* LUCENE-10626 Hunspell: add tools to aid dictionary editing:
|
||||
analysis introspection, stem expansion and stem/flag suggestion (Peter Gromov)
|
||||
|
||||
Improvements
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
/** An object representing the analysis result of a simple (non-compound) word */
|
||||
public final class AffixedWord {
|
||||
private final String word;
|
||||
private final DictEntry entry;
|
||||
private final List<Affix> prefixes;
|
||||
private final List<Affix> suffixes;
|
||||
|
||||
AffixedWord(String word, DictEntry entry, List<Affix> prefixes, List<Affix> suffixes) {
|
||||
this.word = word;
|
||||
this.entry = entry;
|
||||
this.prefixes = Collections.unmodifiableList(prefixes);
|
||||
this.suffixes = Collections.unmodifiableList(suffixes);
|
||||
}
|
||||
|
||||
/** @return the word being analyzed */
|
||||
public String getWord() {
|
||||
return word;
|
||||
}
|
||||
|
||||
/** @return the dictionary entry for the stem in this analysis */
|
||||
public DictEntry getDictEntry() {
|
||||
return entry;
|
||||
}
|
||||
|
||||
/** @return the list of prefixes applied to the stem, at most two, outermost first */
|
||||
public List<Affix> getPrefixes() {
|
||||
return prefixes;
|
||||
}
|
||||
|
||||
/** @return the list of suffixes applied to the stem, at most two, outermost first */
|
||||
public List<Affix> getSuffixes() {
|
||||
return suffixes;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof AffixedWord that)) return false;
|
||||
return word.equals(that.word)
|
||||
&& entry.equals(that.entry)
|
||||
&& prefixes.equals(that.prefixes)
|
||||
&& suffixes.equals(that.suffixes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(word, entry, prefixes, suffixes);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "AffixedWord["
|
||||
+ ("word=" + word + ", ")
|
||||
+ ("entry=" + entry + ", ")
|
||||
+ ("prefixes=" + prefixes + ", ")
|
||||
+ ("suffixes=" + suffixes)
|
||||
+ "]";
|
||||
}
|
||||
|
||||
/** An object representing a prefix or a suffix applied to a word stem */
|
||||
public static final class Affix {
|
||||
final int affixId;
|
||||
private final String presentableFlag;
|
||||
|
||||
Affix(Dictionary dictionary, int affixId) {
|
||||
this.affixId = affixId;
|
||||
char encodedFlag = dictionary.affixData(affixId, AFFIX_FLAG);
|
||||
presentableFlag = dictionary.flagParsingStrategy.printFlag(encodedFlag);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the corresponding affix flag as it appears in the *.aff file. Depending on the
|
||||
* format, it could be a Unicode character, two ASCII characters, or an integer in decimal
|
||||
* form
|
||||
*/
|
||||
public String getFlag() {
|
||||
return presentableFlag;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
return this == o || o instanceof Affix a && affixId == a.affixId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return affixId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return presentableFlag + "(id=" + affixId + ")";
|
||||
}
|
||||
}
|
||||
}
|
|
@ -24,26 +24,22 @@ import java.util.List;
|
|||
*
|
||||
* @see Dictionary#lookupEntries
|
||||
*/
|
||||
public interface DictEntries {
|
||||
public interface DictEntries extends List<DictEntry> {
|
||||
/**
|
||||
* @return a positive number of dictionary entries with the same word. Most often it's 1 (unless
|
||||
* there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can
|
||||
* be passed into other methods of this class.
|
||||
*/
|
||||
@Override
|
||||
int size();
|
||||
|
||||
/**
|
||||
* @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
|
||||
* @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
|
||||
* {@code ph:}) associated with the homonym at the given entry index, or an empty string
|
||||
*/
|
||||
String getMorphologicalData(int entryIndex);
|
||||
|
||||
/**
|
||||
* @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
|
||||
* @param key the key in the form {@code kk:} by which to filter the morphological fields
|
||||
* @return the values (of {@code vvvvvv} form) of morphological fields with the given key
|
||||
* associated with the homonym at the given entry index
|
||||
*/
|
||||
List<String> getMorphologicalValues(int entryIndex, String key);
|
||||
/** Same as {@code get(entryIndex).getMorphologicalData()} */
|
||||
default String getMorphologicalData(int entryIndex) {
|
||||
return get(entryIndex).getMorphologicalData();
|
||||
}
|
||||
|
||||
/** Same as {@code get(entryIndex).getMorphologicalValues(key)} */
|
||||
default List<String> getMorphologicalValues(int entryIndex, String key) {
|
||||
return get(entryIndex).getMorphologicalValues(key);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
 * An object representing a single *.dic file entry: its stem word, its flags and its morphological
 * data.
 */
public abstract class DictEntry {
  private final String stem;

  DictEntry(String stem) {
    this.stem = stem;
  }

  /**
   * @return the stem, followed by {@code /flags} when the flags are non-empty, followed by a space
   *     and the morphological data when that is non-empty
   */
  @Override
  public String toString() {
    String result = stem;
    String flags = getFlags();
    if (!flags.isEmpty()) {
      result += "/" + flags;
    }
    String morph = getMorphologicalData();
    if (!morph.isEmpty()) {
      result += " " + morph;
    }
    return result;
  }

  /** Entries are equal when their stem, morphological data and flags strings are all equal. */
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof DictEntry that)) return false;
    return stem.equals(that.stem)
        && getMorphologicalData().equals(that.getMorphologicalData())
        && getFlags().equals(that.getFlags());
  }

  @Override
  public int hashCode() {
    return Objects.hash(stem, getFlags(), getMorphologicalData());
  }

  /** @return the stem word in the dictionary */
  public String getStem() {
    return stem;
  }

  /**
   * @return the flags associated with the dictionary entry, encoded in the same format as in the
   *     *.dic file, but possibly in a different order
   */
  public abstract String getFlags();

  /**
   * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
   *     {@code ph:}) associated with this entry, or an empty string
   */
  public abstract String getMorphologicalData();

  /**
   * @param key the key in the form {@code kk:} by which to filter the morphological fields
   * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
   *     associated with this entry, in the order they occur in {@link #getMorphologicalData()};
   *     an empty list if there are none
   */
  public List<String> getMorphologicalValues(String key) {
    // The separator checked here is ':' (a colon), so the message must say so as well.
    assert key.length() == 3 && key.charAt(2) == ':'
        : "A morphological data key should consist of two letters followed by a colon, found: "
            + key;

    String data = getMorphologicalData();
    // Cheap pre-check that avoids splitting when the key cannot possibly occur.
    if (data.isEmpty() || !data.contains(key)) return Collections.emptyList();

    return Arrays.stream(data.split(" "))
        .filter(s -> s.startsWith(key))
        .map(s -> s.substring(3))
        .toList();
  }

  /** Creates an entry with the given stem and flags and with empty morphological data. */
  static DictEntry create(String stem, String flags) {
    return new DictEntry(stem) {
      @Override
      public String getFlags() {
        return flags;
      }

      @Override
      public String getMorphologicalData() {
        return "";
      }
    };
  }
}
|
|
@ -34,6 +34,7 @@ import java.nio.file.Files;
|
|||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.text.ParseException;
|
||||
import java.util.AbstractList;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
@ -537,31 +538,33 @@ public class Dictionary {
|
|||
IntsRef forms = lookupWord(root.toCharArray(), 0, root.length());
|
||||
if (forms == null) return null;
|
||||
|
||||
return new DictEntries() {
|
||||
class DictEntriesImpl extends AbstractList<DictEntry> implements DictEntries {
|
||||
@Override
|
||||
public int size() {
|
||||
return forms.length / (hasCustomMorphData ? 2 : 1);
|
||||
return forms.length / formStep();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getMorphologicalData(int entryIndex) {
|
||||
if (!hasCustomMorphData) return "";
|
||||
return morphData.get(forms.ints[forms.offset + entryIndex * 2 + 1]);
|
||||
public DictEntry get(int entryIndex) {
|
||||
return dictEntry(
|
||||
root,
|
||||
forms.ints[forms.offset + (entryIndex * formStep())],
|
||||
hasCustomMorphData ? forms.ints[forms.offset + entryIndex * 2 + 1] : 0);
|
||||
}
|
||||
}
|
||||
return new DictEntriesImpl();
|
||||
}
|
||||
|
||||
DictEntry dictEntry(String root, int flagId, int morphDataId) {
|
||||
return new DictEntry(root) {
|
||||
@Override
|
||||
public String getFlags() {
|
||||
return Dictionary.this.flagParsingStrategy.printFlags(flagLookup.getFlags(flagId));
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getMorphologicalValues(int entryIndex, String key) {
|
||||
assert key.length() == 3 && key.charAt(2) == ':'
|
||||
: "A morphological data key should consist of two letters followed by a semicolon, found: "
|
||||
+ key;
|
||||
|
||||
String fields = getMorphologicalData(entryIndex);
|
||||
if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList();
|
||||
|
||||
return Arrays.stream(fields.split(" "))
|
||||
.filter(s -> s.startsWith(key))
|
||||
.map(s -> s.substring(3))
|
||||
.collect(Collectors.toList());
|
||||
public String getMorphologicalData() {
|
||||
return morphDataId == 0 ? "" : morphData.get(morphDataId);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -1153,7 +1156,7 @@ public class Dictionary {
|
|||
} else {
|
||||
end = line.indexOf(MORPH_SEPARATOR);
|
||||
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
|
||||
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end);
|
||||
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end).strip();
|
||||
if (aliasCount > 0 && !flagPart.isEmpty()) {
|
||||
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||
}
|
||||
|
@ -1327,6 +1330,12 @@ public class Dictionary {
|
|||
return false;
|
||||
}
|
||||
|
||||
boolean isFlagAppendedByAffix(int affixId, char flag) {
|
||||
if (affixId < 0 || flag == FLAG_UNSET) return false;
|
||||
int appendId = affixData(affixId, AFFIX_APPEND);
|
||||
return hasFlag(appendId, flag);
|
||||
}
|
||||
|
||||
/** Abstraction of the process of parsing flags taken from the affix and dic files */
|
||||
abstract static class FlagParsingStrategy {
|
||||
// we don't check the flag count, as Hunspell accepts longer sequences
|
||||
|
@ -1354,6 +1363,27 @@ public class Dictionary {
|
|||
* @return Parsed flags
|
||||
*/
|
||||
abstract char[] parseFlags(String rawFlags);
|
||||
|
||||
/**
|
||||
* @return the original string representation of the given flag encoded by {@link #parseFlags}.
|
||||
*/
|
||||
abstract String printFlag(char flag);
|
||||
|
||||
/** @return a presentable sorted concatenation of {@link #printFlag} results */
|
||||
String printFlags(char[] encodedFlags) {
|
||||
List<String> printed = new ArrayList<>();
|
||||
for (char c : encodedFlags) {
|
||||
if (c >= DEFAULT_FLAGS) continue;
|
||||
printed.add(printFlag(c));
|
||||
}
|
||||
String delimiter = this instanceof NumFlagParsingStrategy ? "," : "";
|
||||
return printed.stream().sorted().collect(Collectors.joining(delimiter));
|
||||
}
|
||||
|
||||
/** Parse flags from a string resulting from {@link #printFlags} */
|
||||
char[] parseUtfFlags(String flagsInUtf) {
|
||||
return parseFlags(flagsInUtf);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1365,6 +1395,11 @@ public class Dictionary {
|
|||
public char[] parseFlags(String rawFlags) {
|
||||
return rawFlags.toCharArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
String printFlag(char flag) {
|
||||
return String.valueOf(flag);
|
||||
}
|
||||
}
|
||||
|
||||
/** Used to read flags as UTF-8 even if the rest of the file is in the default (8-bit) encoding */
|
||||
|
@ -1373,6 +1408,16 @@ public class Dictionary {
|
|||
public char[] parseFlags(String rawFlags) {
|
||||
return new String(rawFlags.getBytes(DEFAULT_CHARSET), StandardCharsets.UTF_8).toCharArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
String printFlag(char flag) {
|
||||
return String.valueOf(flag);
|
||||
}
|
||||
|
||||
@Override
|
||||
char[] parseUtfFlags(String flagsInUtf) {
|
||||
return flagsInUtf.toCharArray();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1403,6 +1448,11 @@ public class Dictionary {
|
|||
|
||||
return result.toString().toCharArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
String printFlag(char flag) {
|
||||
return String.valueOf((int) flag);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1430,6 +1480,11 @@ public class Dictionary {
|
|||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
@Override
|
||||
String printFlag(char flag) {
|
||||
return new String(new char[] {(char) ((flag & 0xff00) >>> 8), (char) (flag & 0xff)});
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasFlag(int entryId, char flag) {
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Suggestion to add/edit dictionary entries to generate a given list of words created by {@link
|
||||
* WordFormGenerator#compress}.
|
||||
*/
|
||||
public class EntrySuggestion {
|
||||
private final List<DictEntry> toEdit, toAdd;
|
||||
private final List<String> extraGenerated;
|
||||
|
||||
EntrySuggestion(List<DictEntry> toEdit, List<DictEntry> toAdd, List<String> extraGenerated) {
|
||||
this.toEdit = Collections.unmodifiableList(toEdit);
|
||||
this.toAdd = Collections.unmodifiableList(toAdd);
|
||||
this.extraGenerated = Collections.unmodifiableList(extraGenerated);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the existing dictionary entries whose flags would need changing to accommodate the
|
||||
* given word list
|
||||
*/
|
||||
public List<DictEntry> getEntriesToEdit() {
|
||||
return toEdit;
|
||||
}
|
||||
|
||||
/** @return new dictionary entries to be added to accommodate the given word list */
|
||||
public List<DictEntry> getEntriesToAdd() {
|
||||
return toAdd;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return additional words generated by union of {@link #getEntriesToAdd()} and {@link
|
||||
* #getEntriesToEdit()} which weren't in the given list of words
|
||||
*/
|
||||
public List<String> getExtraGeneratedWords() {
|
||||
return extraGenerated;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "EntrySuggestion{" + internalsToString() + '}';
|
||||
}
|
||||
|
||||
String internalsToString() {
|
||||
return "toEdit=" + toEdit + ", toAdd=" + toAdd + ", extra=" + extraGenerated;
|
||||
}
|
||||
}
|
|
@ -176,7 +176,7 @@ public class Hunspell {
|
|||
offset,
|
||||
length,
|
||||
context,
|
||||
(stem, formID, morphDataId) -> {
|
||||
(stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
|
||||
if (checkCase && !acceptCase(originalCase, formID, stem)) {
|
||||
return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
|
||||
}
|
||||
|
@ -314,6 +314,52 @@ public class Hunspell {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
/**
|
||||
* @return all possible analyses of the given word with stems, prefixes, suffixed and
|
||||
* morphological data. Note that the order of the returned objects might not correspond to the
|
||||
* *.dic file order!
|
||||
*/
|
||||
public List<AffixedWord> analyzeSimpleWord(String word) {
|
||||
List<AffixedWord> result = new ArrayList<>();
|
||||
stemmer.analyze(
|
||||
word.toCharArray(),
|
||||
word.length(),
|
||||
(stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
|
||||
List<AffixedWord.Affix> prefixes = new ArrayList<>();
|
||||
List<AffixedWord.Affix> suffixes = new ArrayList<>();
|
||||
if (outerPrefix >= 0) prefixes.add(new AffixedWord.Affix(dictionary, outerPrefix));
|
||||
if (innerPrefix >= 0) prefixes.add(new AffixedWord.Affix(dictionary, innerPrefix));
|
||||
if (outerSuffix >= 0) suffixes.add(new AffixedWord.Affix(dictionary, outerSuffix));
|
||||
if (innerSuffix >= 0) suffixes.add(new AffixedWord.Affix(dictionary, innerSuffix));
|
||||
|
||||
DictEntry entry = dictionary.dictEntry(stem.toString(), formID, morphDataId);
|
||||
result.add(new AffixedWord(word, entry, prefixes, suffixes));
|
||||
return true;
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate all word forms for all dictionary entries with the given root word. The result order
|
||||
* is stable but not specified. This is equivalent to "unmunch" from the "hunspell-tools" package.
|
||||
*
|
||||
* @see WordFormGenerator for finer-grained APIs
|
||||
*/
|
||||
public List<AffixedWord> getAllWordForms(String root) {
|
||||
return new WordFormGenerator(dictionary).getAllWordForms(root, checkCanceled);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a list of words, try to produce a smaller set of dictionary entries (with some flags)
|
||||
* that would generate these words. This is equivalent to "munch" from the "hunspell-tools"
|
||||
* package.
|
||||
*
|
||||
* @see WordFormGenerator#compress for more details and control
|
||||
*/
|
||||
public EntrySuggestion compress(List<String> words) {
|
||||
return new WordFormGenerator(dictionary).compress(words, Set.of(), checkCanceled);
|
||||
}
|
||||
|
||||
private class CompoundPart {
|
||||
final CompoundPart prev;
|
||||
final int index, length;
|
||||
|
@ -431,7 +477,7 @@ public class Hunspell {
|
|||
words.add(ref);
|
||||
|
||||
Stemmer.RootProcessor stopOnMatching =
|
||||
(stem, formID, morphDataId) -> {
|
||||
(stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
|
||||
ref.ints[0] = formID;
|
||||
return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
|
||||
};
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.hunspell;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
@ -65,7 +64,18 @@ final class Stemmer {
|
|||
* @return List of stems for the word
|
||||
*/
|
||||
public List<CharsRef> stem(char[] word, int length) {
|
||||
List<CharsRef> list = new ArrayList<>();
|
||||
analyze(
|
||||
word,
|
||||
length,
|
||||
(stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
|
||||
list.add(newStem(stem, morphDataId));
|
||||
return true;
|
||||
});
|
||||
return list;
|
||||
}
|
||||
|
||||
void analyze(char[] word, int length, RootProcessor processor) {
|
||||
if (dictionary.mayNeedInputCleaning()) {
|
||||
CharsRef scratchSegment = new CharsRef(word, 0, length);
|
||||
if (dictionary.needsInputCleaning(scratchSegment)) {
|
||||
|
@ -77,19 +87,12 @@ final class Stemmer {
|
|||
word = scratchBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
List<CharsRef> list = new ArrayList<>();
|
||||
if (length == 0) {
|
||||
return list;
|
||||
return;
|
||||
}
|
||||
|
||||
RootProcessor processor =
|
||||
(stem, formID, stemException) -> {
|
||||
list.add(newStem(stem, stemException));
|
||||
return true;
|
||||
};
|
||||
if (!doStem(word, 0, length, WordContext.SIMPLE_WORD, processor)) {
|
||||
return list;
|
||||
return;
|
||||
}
|
||||
|
||||
WordCase wordCase = caseOf(word, length);
|
||||
|
@ -99,7 +102,6 @@ final class Stemmer {
|
|||
doStem(variant, 0, varLength, WordContext.SIMPLE_WORD, processor);
|
||||
varyCase(word, length, wordCase, variationProcessor);
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
interface CaseVariationProcessor {
|
||||
|
@ -214,7 +216,7 @@ final class Stemmer {
|
|||
if (result == null) return true;
|
||||
|
||||
String src = new String(word, 0, length);
|
||||
for (String s : result.collect(Collectors.toList())) {
|
||||
for (String s : result.toList()) {
|
||||
if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -239,13 +241,61 @@ final class Stemmer {
|
|||
if (!isRootCompatibleWithContext(context, -1, entryId)) {
|
||||
continue;
|
||||
}
|
||||
if (!callProcessor(word, offset, length, processor, forms, i)) {
|
||||
CharsRef charsRef = new CharsRef(word, offset, length);
|
||||
if (!processor.processRoot(charsRef, entryId, morphDataId(forms, i), -1, -1, -1, -1)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return stem(
|
||||
word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
|
||||
StemCandidateProcessor stemProcessor =
|
||||
new StemCandidateProcessor(context) {
|
||||
@Override
|
||||
boolean processStemCandidate(
|
||||
char[] word,
|
||||
int offset,
|
||||
int length,
|
||||
int lastAffix,
|
||||
int outerPrefix,
|
||||
int innerPrefix,
|
||||
int outerSuffix,
|
||||
int innerSuffix) {
|
||||
IntsRef forms = dictionary.lookupWord(word, offset, length);
|
||||
if (forms == null) return true;
|
||||
|
||||
char flag = dictionary.affixData(lastAffix, Dictionary.AFFIX_FLAG);
|
||||
int prefixId = innerPrefix >= 0 ? innerPrefix : outerPrefix;
|
||||
for (int i = 0; i < forms.length; i += formStep) {
|
||||
int entryId = forms.ints[forms.offset + i];
|
||||
if (dictionary.hasFlag(entryId, flag)
|
||||
|| dictionary.isFlagAppendedByAffix(prefixId, flag)) {
|
||||
if (innerPrefix < 0 && outerPrefix >= 0) {
|
||||
char prefixFlag = dictionary.affixData(outerPrefix, Dictionary.AFFIX_FLAG);
|
||||
if (!dictionary.hasFlag(entryId, prefixFlag)
|
||||
&& !dictionary.isFlagAppendedByAffix(lastAffix, prefixFlag)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (!isRootCompatibleWithContext(context, lastAffix, entryId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!processor.processRoot(
|
||||
new CharsRef(word, offset, length),
|
||||
entryId,
|
||||
morphDataId(forms, i),
|
||||
outerPrefix,
|
||||
innerPrefix,
|
||||
outerSuffix,
|
||||
innerSuffix)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
return removeAffixes(word, offset, length, true, -1, -1, -1, stemProcessor);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -277,9 +327,20 @@ final class Stemmer {
|
|||
* Dictionary#hasFlag(int, char)}
|
||||
* @param morphDataId the id of the custom morphological data (0 if none), to be used with
|
||||
* {@link Dictionary#morphData}
|
||||
* @param outerPrefix the id of the outer prefix applied to the stem, or -1 if none
|
||||
* @param innerPrefix the id of the inner prefix applied to the stem, or -1 if none
|
||||
* @param outerSuffix the id of the outer suffix applied to the stem, or -1 if none
|
||||
* @param innerSuffix the id of the inner suffix applied to the stem, or -1 if none
|
||||
* @return whether the processing should be continued
|
||||
*/
|
||||
boolean processRoot(CharsRef stem, int formID, int morphDataId);
|
||||
boolean processRoot(
|
||||
CharsRef stem,
|
||||
int formID,
|
||||
int morphDataId,
|
||||
int outerPrefix,
|
||||
int innerPrefix,
|
||||
int outerSuffix,
|
||||
int innerSuffix);
|
||||
}
|
||||
|
||||
private String stemException(int morphDataId) {
|
||||
|
@ -318,33 +379,23 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Generates a list of stems for the provided word
|
||||
* Generates a list of stems for the provided word. It's called recursively when applying affixes
|
||||
* one by one, setting {@code (inner/outer)(Suffix/Prefix)} parameters to non-negative values as
|
||||
* that happens.
|
||||
*
|
||||
* @param word Word to generate the stems for
|
||||
* @param previous previous affix that was removed (so we dont remove same one twice)
|
||||
* @param prevFlag Flag from a previous stemming step that need to be cross-checked with any
|
||||
* affixes in this recursive step
|
||||
* @param prefixId ID of the most inner removed prefix, so that when removing a suffix, it's also
|
||||
* checked against the word
|
||||
* @param recursionDepth current recursiondepth
|
||||
* @param doPrefix true if we should remove prefixes
|
||||
* @param previousWasPrefix true if the previous removal was a prefix: if we are removing a
|
||||
* suffix, and it has no continuation requirements, it's ok. but two prefixes
|
||||
* (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
|
||||
* @return whether the processing should be continued
|
||||
*/
|
||||
private boolean stem(
|
||||
boolean removeAffixes(
|
||||
char[] word,
|
||||
int offset,
|
||||
int length,
|
||||
WordContext context,
|
||||
int previous,
|
||||
char prevFlag,
|
||||
int prefixId,
|
||||
int recursionDepth,
|
||||
boolean doPrefix,
|
||||
boolean previousWasPrefix,
|
||||
RootProcessor processor) {
|
||||
int outerPrefix,
|
||||
int innerPrefix,
|
||||
int outerSuffix,
|
||||
StemCandidateProcessor processor) {
|
||||
FST.Arc<IntsRef> arc = new FST.Arc<>();
|
||||
if (doPrefix && dictionary.prefixes != null) {
|
||||
FST<IntsRef> fst = dictionary.prefixes;
|
||||
|
@ -366,11 +417,11 @@ final class Stemmer {
|
|||
|
||||
for (int j = 0; j < prefixes.length; j++) {
|
||||
int prefix = prefixes.ints[prefixes.offset + j];
|
||||
if (prefix == previous) {
|
||||
if (prefix == outerPrefix) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
|
||||
if (isAffixCompatible(prefix, true, outerPrefix, outerSuffix, processor.context)) {
|
||||
char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
|
||||
if (strippedWord == null) {
|
||||
continue;
|
||||
|
@ -381,12 +432,11 @@ final class Stemmer {
|
|||
strippedWord,
|
||||
pureAffix ? offset + i : 0,
|
||||
pureAffix ? length - i : strippedWord.length,
|
||||
context,
|
||||
prefix,
|
||||
previous,
|
||||
-1,
|
||||
recursionDepth,
|
||||
true,
|
||||
outerPrefix,
|
||||
innerPrefix,
|
||||
outerSuffix,
|
||||
processor)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -415,12 +465,11 @@ final class Stemmer {
|
|||
|
||||
for (int j = 0; j < suffixes.length; j++) {
|
||||
int suffix = suffixes.ints[suffixes.offset + j];
|
||||
if (suffix == previous) {
|
||||
if (suffix == outerSuffix) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isAffixCompatible(
|
||||
suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
|
||||
if (isAffixCompatible(suffix, false, outerPrefix, outerSuffix, processor.context)) {
|
||||
char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
|
||||
if (strippedWord == null) {
|
||||
continue;
|
||||
|
@ -431,12 +480,11 @@ final class Stemmer {
|
|||
strippedWord,
|
||||
pureAffix ? offset : 0,
|
||||
pureAffix ? i : strippedWord.length,
|
||||
context,
|
||||
suffix,
|
||||
previous,
|
||||
prefixId,
|
||||
recursionDepth,
|
||||
false,
|
||||
outerPrefix,
|
||||
innerPrefix,
|
||||
outerSuffix,
|
||||
processor)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -487,14 +535,10 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
private boolean isAffixCompatible(
|
||||
int affix,
|
||||
char prevFlag,
|
||||
int recursionDepth,
|
||||
boolean isPrefix,
|
||||
boolean previousWasPrefix,
|
||||
WordContext context) {
|
||||
int affix, boolean isPrefix, int outerPrefix, int outerSuffix, WordContext context) {
|
||||
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
boolean previousWasPrefix = outerSuffix < 0 && outerPrefix >= 0;
|
||||
if (context.isCompound()) {
|
||||
if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
|
||||
return false;
|
||||
|
@ -513,79 +557,70 @@ final class Stemmer {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (recursionDepth == 0) {
|
||||
if (outerPrefix == -1 && outerSuffix == -1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (dictionary.isCrossProduct(affix)) {
|
||||
// cross check incoming continuation class (flag of previous affix) against list.
|
||||
return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
|
||||
// cross-check incoming continuation class (flag of previous affix) against this affix's flags
|
||||
if (previousWasPrefix) return true;
|
||||
if (outerSuffix >= 0) {
|
||||
char prevFlag = dictionary.affixData(outerSuffix, Dictionary.AFFIX_FLAG);
|
||||
return dictionary.hasFlag(append, prevFlag);
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies the affix rule to the given word, producing a list of stems if any are found
|
||||
* Applies the affix rule to the given word, producing a list of stems if any are found.
|
||||
* Non-negative {@code (inner/outer)(Suffix/Prefix)} parameters indicate the already applied
|
||||
* affixes.
|
||||
*
|
||||
* @param strippedWord Char array containing the word with the affix removed and the strip added
|
||||
* @param word Char array containing the word with the affix removed and the strip added
|
||||
* @param offset where the word actually starts in the array
|
||||
* @param length the length of the stripped word
|
||||
* @param affix HunspellAffix representing the affix rule itself
|
||||
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the
|
||||
* suffix, unless both are compatible so we must check dictionary form against both to add it
|
||||
* as a stem!
|
||||
* @param recursionDepth current recursion depth
|
||||
* @param affix the id of the affix in {@link Dictionary#affixData}
|
||||
* @param prefix true if we are removing a prefix (false if it's a suffix)
|
||||
* @return whether the processing should be continued
|
||||
*/
|
||||
private boolean applyAffix(
|
||||
char[] strippedWord,
|
||||
char[] word,
|
||||
int offset,
|
||||
int length,
|
||||
WordContext context,
|
||||
int affix,
|
||||
int previousAffix,
|
||||
int prefixId,
|
||||
int recursionDepth,
|
||||
boolean prefix,
|
||||
RootProcessor processor) {
|
||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||
int outerPrefix,
|
||||
int innerPrefix,
|
||||
int outerSuffix,
|
||||
StemCandidateProcessor processor) {
|
||||
int prefixId = innerPrefix >= 0 ? innerPrefix : outerPrefix;
|
||||
int previousAffix = outerSuffix >= 0 ? outerSuffix : prefixId;
|
||||
|
||||
int innerSuffix = -1;
|
||||
if (prefix) {
|
||||
if (outerPrefix < 0) outerPrefix = affix;
|
||||
else innerPrefix = affix;
|
||||
} else {
|
||||
if (outerSuffix < 0) outerSuffix = affix;
|
||||
else innerSuffix = affix;
|
||||
}
|
||||
|
||||
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
|
||||
IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
|
||||
if (forms != null) {
|
||||
for (int i = 0; i < forms.length; i += formStep) {
|
||||
int entryId = forms.ints[forms.offset + i];
|
||||
if (dictionary.hasFlag(entryId, flag) || isFlagAppendedByAffix(prefixId, flag)) {
|
||||
// confusing: in this one exception, we already chained the first prefix against the
|
||||
// second,
|
||||
// so it doesnt need to be checked against the word
|
||||
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
|
||||
if (!chainedPrefix && prefixId >= 0) {
|
||||
char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
|
||||
if (!dictionary.hasFlag(entryId, prefixFlag)
|
||||
&& !isFlagAppendedByAffix(affix, prefixFlag)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (!isRootCompatibleWithContext(context, affix, entryId)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
|
||||
if (!skipLookup
|
||||
&& !processor.processStemCandidate(
|
||||
word, offset, length, affix, outerPrefix, innerPrefix, outerSuffix, innerSuffix)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int recursionDepth =
|
||||
(outerSuffix >= 0 ? 1 : 0) + (innerPrefix >= 0 ? 2 : outerPrefix >= 0 ? 1 : 0) - 1;
|
||||
if (dictionary.isCrossProduct(affix) && recursionDepth <= 1) {
|
||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||
boolean doPrefix;
|
||||
if (recursionDepth == 0) {
|
||||
if (prefix) {
|
||||
prefixId = affix;
|
||||
doPrefix = dictionary.complexPrefixes && dictionary.isSecondStagePrefix(flag);
|
||||
// we took away the first prefix.
|
||||
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
|
||||
|
@ -599,33 +634,42 @@ final class Stemmer {
|
|||
return true;
|
||||
}
|
||||
} else {
|
||||
doPrefix = false;
|
||||
if (prefix && dictionary.complexPrefixes) {
|
||||
prefixId = affix;
|
||||
doPrefix = true;
|
||||
// we took away the second prefix: go look for another suffix
|
||||
} else if (prefix || dictionary.complexPrefixes || !dictionary.isSecondStageSuffix(flag)) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
// we took away a prefix, then a suffix: go look for another suffix
|
||||
doPrefix = false;
|
||||
}
|
||||
}
|
||||
|
||||
return stem(
|
||||
strippedWord,
|
||||
offset,
|
||||
length,
|
||||
context,
|
||||
affix,
|
||||
flag,
|
||||
prefixId,
|
||||
recursionDepth + 1,
|
||||
doPrefix,
|
||||
prefix,
|
||||
processor);
|
||||
return removeAffixes(
|
||||
word, offset, length, doPrefix, outerPrefix, innerPrefix, outerSuffix, processor);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
abstract static class StemCandidateProcessor {
|
||||
private final WordContext context;
|
||||
|
||||
StemCandidateProcessor(WordContext context) {
|
||||
this.context = context;
|
||||
}
|
||||
|
||||
abstract boolean processStemCandidate(
|
||||
char[] word,
|
||||
int offset,
|
||||
int length,
|
||||
int lastAffix,
|
||||
int outerPrefix,
|
||||
int innerPrefix,
|
||||
int outerSuffix,
|
||||
int innerSuffix);
|
||||
}
|
||||
|
||||
private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
|
||||
if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
|
||||
return false;
|
||||
|
@ -633,39 +677,32 @@ final class Stemmer {
|
|||
if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
|
||||
char cFlag = context.requiredFlag(dictionary);
|
||||
return dictionary.hasFlag(entryId, cFlag)
|
||||
|| isFlagAppendedByAffix(lastAffix, cFlag)
|
||||
|| dictionary.isFlagAppendedByAffix(lastAffix, cFlag)
|
||||
|| dictionary.hasFlag(entryId, dictionary.compoundFlag)
|
||||
|| isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
|
||||
|| dictionary.isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean callProcessor(
|
||||
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
|
||||
CharsRef stem = new CharsRef(word, offset, length);
|
||||
int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
|
||||
return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId);
|
||||
private int morphDataId(IntsRef forms, int i) {
|
||||
return dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
|
||||
}
|
||||
|
||||
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
|
||||
char circumfix = dictionary.circumfix;
|
||||
// if circumfix was previously set by a prefix, we must check this suffix,
|
||||
// to ensure it has it, and vice versa
|
||||
if (isSuffix
|
||||
&& isFlagAppendedByAffix(prefixId, circumfix) != isFlagAppendedByAffix(affix, circumfix)) {
|
||||
if (isSuffix) {
|
||||
if (dictionary.isFlagAppendedByAffix(prefixId, circumfix)
|
||||
!= dictionary.isFlagAppendedByAffix(affix, circumfix)) {
|
||||
return true;
|
||||
}
|
||||
if (isFlagAppendedByAffix(affix, dictionary.needaffix)) {
|
||||
}
|
||||
if (dictionary.isFlagAppendedByAffix(affix, dictionary.needaffix)) {
|
||||
return !isSuffix
|
||||
|| previousAffix < 0
|
||||
|| isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
|
||||
|| dictionary.isFlagAppendedByAffix(previousAffix, dictionary.needaffix);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isFlagAppendedByAffix(int affixId, char flag) {
|
||||
if (affixId < 0 || flag == Dictionary.FLAG_UNSET) return false;
|
||||
int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
|
||||
return dictionary.hasFlag(appendId, flag);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,487 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_APPEND;
|
||||
import static org.apache.lucene.analysis.hunspell.Dictionary.AFFIX_FLAG;
|
||||
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.analysis.hunspell.AffixedWord.Affix;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.IntsRefFSTEnum;
|
||||
|
||||
/**
|
||||
* A utility class used for generating possible word forms by adding affixes to stems ({@link
|
||||
* #getAllWordForms(String, String, Runnable)}), and suggesting stems and flags to generate the
|
||||
* given set of words ({@link #compress(List, Set, Runnable)}).
|
||||
*/
|
||||
public class WordFormGenerator {
|
||||
private final Dictionary dictionary;
|
||||
private final Map<Character, List<AffixEntry>> affixes = new HashMap<>();
|
||||
private final Stemmer stemmer;
|
||||
|
||||
/**
 * Creates a generator for the given dictionary, indexing its prefixes and suffixes by flag and
 * preparing a {@link Stemmer} over the same dictionary.
 */
public WordFormGenerator(Dictionary dictionary) {
  this.dictionary = dictionary;
  this.stemmer = new Stemmer(dictionary);
  fillAffixMap(dictionary.prefixes, AffixKind.PREFIX);
  fillAffixMap(dictionary.suffixes, AffixKind.SUFFIX);
}
|
||||
|
||||
private void fillAffixMap(FST<IntsRef> fst, AffixKind kind) {
|
||||
if (fst == null) return;
|
||||
|
||||
IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(fst);
|
||||
try {
|
||||
while (true) {
|
||||
IntsRefFSTEnum.InputOutput<IntsRef> io = fstEnum.next();
|
||||
if (io == null) break;
|
||||
|
||||
IntsRef affixIds = io.output;
|
||||
for (int j = 0; j < affixIds.length; j++) {
|
||||
int id = affixIds.ints[affixIds.offset + j];
|
||||
char flag = dictionary.affixData(id, AFFIX_FLAG);
|
||||
var entry =
|
||||
new AffixEntry(id, flag, kind, toString(kind, io.input), strip(id), condition(id));
|
||||
affixes.computeIfAbsent(flag, __ -> new ArrayList<>()).add(entry);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private String toString(AffixKind kind, IntsRef input) {
|
||||
char[] affixChars = new char[input.length];
|
||||
for (int i = 0; i < affixChars.length; i++) {
|
||||
affixChars[kind == AffixKind.PREFIX ? i : affixChars.length - i - 1] =
|
||||
(char) input.ints[input.offset + i];
|
||||
}
|
||||
return new String(affixChars);
|
||||
}
|
||||
|
||||
private AffixCondition condition(int affixId) {
|
||||
int condition = dictionary.getAffixCondition(affixId);
|
||||
return condition == 0 ? AffixCondition.ALWAYS_TRUE : dictionary.patterns.get(condition);
|
||||
}
|
||||
|
||||
private String strip(int affixId) {
|
||||
int stripOrd = dictionary.affixData(affixId, Dictionary.AFFIX_STRIP_ORD);
|
||||
int stripStart = dictionary.stripOffsets[stripOrd];
|
||||
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
|
||||
return new String(dictionary.stripData, stripStart, stripEnd - stripStart);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate all word forms for all dictionary entries with the given root word. The result order
|
||||
* is stable but not specified. This is equivalent to "unmunch" from the "hunspell-tools" package.
|
||||
*
|
||||
* @param checkCanceled an object that's periodically called, allowing to interrupt the generation
|
||||
* by throwing an exception
|
||||
*/
|
||||
public List<AffixedWord> getAllWordForms(String root, Runnable checkCanceled) {
|
||||
Set<AffixedWord> result = new LinkedHashSet<>();
|
||||
DictEntries entries = dictionary.lookupEntries(root);
|
||||
if (entries != null) {
|
||||
for (DictEntry entry : entries) {
|
||||
result.addAll(getAllWordForms(root, entry.getFlags(), checkCanceled));
|
||||
}
|
||||
}
|
||||
return new ArrayList<>(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate all word forms for the given root pretending it has the given flags (in the same
|
||||
* format as the dictionary uses). The result order is stable but not specified. This is
|
||||
* equivalent to "unmunch" from the "hunspell-tools" package.
|
||||
*
|
||||
* @param checkCanceled an object that's periodically called, allowing to interrupt the generation
|
||||
* by throwing an exception
|
||||
*/
|
||||
public List<AffixedWord> getAllWordForms(String stem, String flags, Runnable checkCanceled) {
|
||||
var encodedFlags = toSet(dictionary.flagParsingStrategy.parseUtfFlags(flags));
|
||||
if (!shouldConsiderAtAll(encodedFlags)) return List.of();
|
||||
|
||||
LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
|
||||
AffixedWord bare = new AffixedWord(stem, DictEntry.create(stem, flags), List.of(), List.of());
|
||||
checkCanceled.run();
|
||||
if (!encodedFlags.contains(dictionary.needaffix)) {
|
||||
result.add(bare);
|
||||
}
|
||||
result.addAll(expand(bare, encodedFlags, checkCanceled));
|
||||
return new ArrayList<>(result);
|
||||
}
|
||||
|
||||
private boolean canStemToOriginal(AffixedWord derived) {
|
||||
String word = derived.getWord();
|
||||
char[] chars = word.toCharArray();
|
||||
if (isForbiddenWord(chars, 0, chars.length)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String stem = derived.getDictEntry().getStem();
|
||||
var processor =
|
||||
new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
|
||||
boolean foundStem = false;
|
||||
boolean foundForbidden = false;
|
||||
|
||||
@Override
|
||||
boolean processStemCandidate(
|
||||
char[] chars,
|
||||
int offset,
|
||||
int length,
|
||||
int lastAffix,
|
||||
int outerPrefix,
|
||||
int innerPrefix,
|
||||
int outerSuffix,
|
||||
int innerSuffix) {
|
||||
if (isForbiddenWord(chars, offset, length)) {
|
||||
foundForbidden = true;
|
||||
return false;
|
||||
}
|
||||
foundStem |= length == stem.length() && stem.equals(new String(chars, offset, length));
|
||||
return !foundStem;
|
||||
}
|
||||
};
|
||||
stemmer.removeAffixes(chars, 0, chars.length, true, -1, -1, -1, processor);
|
||||
return processor.foundStem && !processor.foundForbidden;
|
||||
}
|
||||
|
||||
private boolean isForbiddenWord(char[] chars, int offset, int length) {
|
||||
if (dictionary.forbiddenword != FLAG_UNSET) {
|
||||
IntsRef forms = dictionary.lookupWord(chars, offset, length);
|
||||
if (forms != null) {
|
||||
for (int i = 0; i < forms.length; i += dictionary.formStep()) {
|
||||
if (dictionary.hasFlag(forms.ints[forms.offset + i], dictionary.forbiddenword)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static LinkedHashSet<Character> toSet(char[] flags) {
|
||||
LinkedHashSet<Character> set = new LinkedHashSet<>();
|
||||
for (char c : flags) {
|
||||
set.add(c);
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
private LinkedHashSet<AffixedWord> expand(
|
||||
AffixedWord stem, LinkedHashSet<Character> flags, Runnable checkCanceled) {
|
||||
LinkedHashSet<AffixedWord> result = new LinkedHashSet<>();
|
||||
for (Character flag : flags) {
|
||||
List<AffixEntry> entries = affixes.get(flag);
|
||||
if (entries == null) continue;
|
||||
|
||||
for (AffixEntry affix : entries) {
|
||||
checkCanceled.run();
|
||||
AffixedWord derived = affix.apply(stem, dictionary);
|
||||
if (derived != null) {
|
||||
LinkedHashSet<Character> append = appendFlags(affix);
|
||||
if (shouldConsiderAtAll(append)) {
|
||||
if (canStemToOriginal(derived)) {
|
||||
result.add(derived);
|
||||
}
|
||||
if (dictionary.isCrossProduct(affix.id)) {
|
||||
result.addAll(expand(derived, updateFlags(flags, flag, append), checkCanceled));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private boolean shouldConsiderAtAll(Set<Character> flags) {
|
||||
return !flags.contains(dictionary.compoundBegin)
|
||||
&& !flags.contains(dictionary.compoundMiddle)
|
||||
&& !flags.contains(dictionary.compoundEnd)
|
||||
&& !flags.contains(dictionary.forbiddenword)
|
||||
&& !flags.contains(dictionary.onlyincompound);
|
||||
}
|
||||
|
||||
private LinkedHashSet<Character> updateFlags(
|
||||
Set<Character> flags, Character toRemove, Set<Character> toAppend) {
|
||||
LinkedHashSet<Character> copy = new LinkedHashSet<>(flags);
|
||||
copy.remove(toRemove);
|
||||
copy.addAll(toAppend);
|
||||
return copy;
|
||||
}
|
||||
|
||||
private LinkedHashSet<Character> appendFlags(AffixEntry affix) {
|
||||
char appendId = dictionary.affixData(affix.id, AFFIX_APPEND);
|
||||
return appendId <= 0 ? new LinkedHashSet<>() : toSet(dictionary.flagLookup.getFlags(appendId));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a list of words, try to produce a smaller set of dictionary entries (with some flags)
|
||||
* that would generate these words. This is equivalent to "munch" from the "hunspell-tools"
|
||||
* package. The algorithm tries to minimize the number of the dictionary entries to add or change,
|
||||
* the number of flags involved, and the number of non-requested additionally generated words. All
|
||||
* the mentioned words are in the dictionary format and case: no ICONV/OCONV/IGNORE conversions
|
||||
* are applied.
|
||||
*
|
||||
* @param words the list of words to generate
|
||||
* @param forbidden the set of words to avoid generating
|
||||
* @param checkCanceled an object that's periodically called, allowing to interrupt the generation
|
||||
* by throwing an exception
|
||||
* @return the information about suggested dictionary entries and overgenerated words, or {@code
|
||||
* null} if the algorithm couldn't generate anything
|
||||
*/
|
||||
public EntrySuggestion compress(
|
||||
List<String> words, Set<String> forbidden, Runnable checkCanceled) {
|
||||
if (words.isEmpty()) return null;
|
||||
if (words.stream().anyMatch(forbidden::contains)) {
|
||||
throw new IllegalArgumentException("'words' and 'forbidden' shouldn't intersect");
|
||||
}
|
||||
|
||||
return new WordCompressor(words, forbidden, checkCanceled).compress();
|
||||
}
|
||||
|
||||
private record AffixEntry(
|
||||
int id, char flag, AffixKind kind, String affix, String strip, AffixCondition condition) {
|
||||
AffixedWord apply(AffixedWord stem, Dictionary dictionary) {
|
||||
if (!isCompatibleWithPreviousAffixes(stem, dictionary)) return null;
|
||||
|
||||
String word = stem.getWord();
|
||||
boolean isPrefix = kind == AffixKind.PREFIX;
|
||||
if (!(isPrefix ? word.startsWith(strip) : word.endsWith(strip))) return null;
|
||||
|
||||
String stripped =
|
||||
isPrefix
|
||||
? word.substring(strip.length())
|
||||
: word.substring(0, word.length() - strip.length());
|
||||
if (!condition.acceptsStem(stripped)) return null;
|
||||
|
||||
String applied = isPrefix ? affix + stripped : stripped + affix;
|
||||
List<Affix> prefixes = new ArrayList<>(stem.getPrefixes());
|
||||
List<Affix> suffixes = new ArrayList<>(stem.getSuffixes());
|
||||
(isPrefix ? prefixes : suffixes).add(0, new Affix(dictionary, id));
|
||||
return new AffixedWord(applied, stem.getDictEntry(), prefixes, suffixes);
|
||||
}
|
||||
|
||||
private boolean isCompatibleWithPreviousAffixes(AffixedWord stem, Dictionary dictionary) {
|
||||
boolean isPrefix = kind == AffixKind.PREFIX;
|
||||
List<Affix> sameAffixes = isPrefix ? stem.getPrefixes() : stem.getSuffixes();
|
||||
if (sameAffixes.size() == 2) return false;
|
||||
if (isPrefix && sameAffixes.size() == 1 && !dictionary.complexPrefixes) return false;
|
||||
if (!isPrefix && !stem.getPrefixes().isEmpty()) return false;
|
||||
if (sameAffixes.size() == 1
|
||||
&& !dictionary.isFlagAppendedByAffix(sameAffixes.get(0).affixId, flag)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
private class WordCompressor {
|
||||
private final Comparator<State> solutionFitness =
|
||||
Comparator.comparingInt((State s) -> s.forbidden)
|
||||
.thenComparingInt(s -> s.underGenerated)
|
||||
.thenComparingInt(s -> s.stemToFlags.size())
|
||||
.thenComparingInt(s -> s.overGenerated);
|
||||
private final Set<String> forbidden;
|
||||
private final Runnable checkCanceled;
|
||||
private final Set<String> wordSet;
|
||||
private final Set<String> existingStems;
|
||||
private final Map<String, Set<FlagSet>> stemToPossibleFlags = new HashMap<>();
|
||||
private final Map<String, Integer> stemCounts = new LinkedHashMap<>();
|
||||
|
||||
WordCompressor(List<String> words, Set<String> forbidden, Runnable checkCanceled) {
|
||||
this.forbidden = forbidden;
|
||||
this.checkCanceled = checkCanceled;
|
||||
wordSet = new HashSet<>(words);
|
||||
|
||||
Stemmer.StemCandidateProcessor processor =
|
||||
new Stemmer.StemCandidateProcessor(WordContext.SIMPLE_WORD) {
|
||||
@Override
|
||||
boolean processStemCandidate(
|
||||
char[] word,
|
||||
int offset,
|
||||
int length,
|
||||
int lastAffix,
|
||||
int outerPrefix,
|
||||
int innerPrefix,
|
||||
int outerSuffix,
|
||||
int innerSuffix) {
|
||||
String candidate = new String(word, offset, length);
|
||||
stemCounts.merge(candidate, 1, Integer::sum);
|
||||
Set<Character> flags = new LinkedHashSet<>();
|
||||
if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
|
||||
if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
|
||||
if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
|
||||
if (innerSuffix >= 0) flags.add(dictionary.affixData(innerSuffix, AFFIX_FLAG));
|
||||
stemToPossibleFlags
|
||||
.computeIfAbsent(candidate, __ -> new LinkedHashSet<>())
|
||||
.add(new FlagSet(flags, dictionary));
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
for (String word : words) {
|
||||
checkCanceled.run();
|
||||
stemCounts.merge(word, 1, Integer::sum);
|
||||
stemToPossibleFlags.computeIfAbsent(word, __ -> new LinkedHashSet<>());
|
||||
stemmer.removeAffixes(word.toCharArray(), 0, word.length(), true, -1, -1, -1, processor);
|
||||
}
|
||||
|
||||
existingStems =
|
||||
stemCounts.keySet().stream()
|
||||
.filter(stem -> dictionary.lookupEntries(stem) != null)
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
EntrySuggestion compress() {
|
||||
Comparator<String> stemSorter =
|
||||
Comparator.comparing((String s) -> existingStems.contains(s))
|
||||
.thenComparing(stemCounts::get)
|
||||
.reversed();
|
||||
List<String> sortedStems = stemCounts.keySet().stream().sorted(stemSorter).toList();
|
||||
PriorityQueue<State> queue = new PriorityQueue<>(solutionFitness);
|
||||
queue.offer(new State(Map.of(), wordSet.size(), 0, 0));
|
||||
State result = null;
|
||||
while (!queue.isEmpty()) {
|
||||
State state = queue.poll();
|
||||
if (state.underGenerated == 0) {
|
||||
if (result == null || solutionFitness.compare(state, result) < 0) result = state;
|
||||
if (state.forbidden == 0) break;
|
||||
continue;
|
||||
}
|
||||
|
||||
for (String stem : sortedStems) {
|
||||
if (!state.stemToFlags.containsKey(stem)) {
|
||||
queue.offer(addStem(state, stem));
|
||||
}
|
||||
}
|
||||
|
||||
for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
|
||||
for (FlagSet flags : stemToPossibleFlags.get(entry.getKey())) {
|
||||
if (!entry.getValue().contains(flags)) {
|
||||
queue.offer(addFlags(state, entry.getKey(), flags));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result == null ? null : toSuggestion(result);
|
||||
}
|
||||
|
||||
EntrySuggestion toSuggestion(State state) {
|
||||
List<DictEntry> toEdit = new ArrayList<>();
|
||||
List<DictEntry> toAdd = new ArrayList<>();
|
||||
for (Map.Entry<String, Set<FlagSet>> entry : state.stemToFlags.entrySet()) {
|
||||
addEntry(toEdit, toAdd, entry.getKey(), FlagSet.flatten(entry.getValue()));
|
||||
}
|
||||
|
||||
List<String> extraGenerated = new ArrayList<>();
|
||||
for (String extra : allGenerated(state.stemToFlags).distinct().sorted().toList()) {
|
||||
if (wordSet.contains(extra)) continue;
|
||||
|
||||
if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) {
|
||||
addEntry(toEdit, toAdd, extra, Set.of(dictionary.forbiddenword));
|
||||
} else {
|
||||
extraGenerated.add(extra);
|
||||
}
|
||||
}
|
||||
|
||||
return new EntrySuggestion(toEdit, toAdd, extraGenerated);
|
||||
}
|
||||
|
||||
private void addEntry(
|
||||
List<DictEntry> toEdit, List<DictEntry> toAdd, String stem, Set<Character> flags) {
|
||||
String flagString = toFlagString(flags);
|
||||
(existingStems.contains(stem) ? toEdit : toAdd).add(DictEntry.create(stem, flagString));
|
||||
}
|
||||
|
||||
private State addStem(State state, String stem) {
|
||||
LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
|
||||
stemToFlags.put(stem, Set.of());
|
||||
return newState(stemToFlags);
|
||||
}
|
||||
|
||||
private State addFlags(State state, String stem, FlagSet flags) {
|
||||
LinkedHashMap<String, Set<FlagSet>> stemToFlags = new LinkedHashMap<>(state.stemToFlags);
|
||||
Set<FlagSet> flagSets = new LinkedHashSet<>(stemToFlags.get(stem));
|
||||
flagSets.add(flags);
|
||||
stemToFlags.put(stem, flagSets);
|
||||
return newState(stemToFlags);
|
||||
}
|
||||
|
||||
private State newState(Map<String, Set<FlagSet>> stemToFlags) {
|
||||
Set<String> allGenerated = allGenerated(stemToFlags).collect(Collectors.toSet());
|
||||
return new State(
|
||||
stemToFlags,
|
||||
(int) wordSet.stream().filter(s -> !allGenerated.contains(s)).count(),
|
||||
(int) allGenerated.stream().filter(s -> !wordSet.contains(s)).count(),
|
||||
(int) allGenerated.stream().filter(s -> forbidden.contains(s)).count());
|
||||
}
|
||||
|
||||
private final Map<StemWithFlags, List<String>> expansionCache = new HashMap<>();
|
||||
|
||||
private record StemWithFlags(String stem, Set<FlagSet> flags) {}
|
||||
|
||||
private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
|
||||
Function<StemWithFlags, List<String>> expandToWords =
|
||||
e -> expand(e.stem, FlagSet.flatten(e.flags)).stream().map(w -> w.getWord()).toList();
|
||||
return stemToFlags.entrySet().stream()
|
||||
.map(e -> new StemWithFlags(e.getKey(), e.getValue()))
|
||||
.flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream());
|
||||
}
|
||||
|
||||
private List<AffixedWord> expand(String stem, Set<Character> flagSet) {
|
||||
return getAllWordForms(stem, toFlagString(flagSet), checkCanceled);
|
||||
}
|
||||
|
||||
private String toFlagString(Set<Character> flagSet) {
|
||||
return dictionary.flagParsingStrategy.printFlags(Dictionary.toSortedCharArray(flagSet));
|
||||
}
|
||||
}
|
||||
|
||||
private record FlagSet(Set<Character> flags, Dictionary dictionary) {
|
||||
static Set<Character> flatten(Set<FlagSet> flagSets) {
|
||||
return flagSets.stream().flatMap(f -> f.flags.stream()).collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return dictionary.flagParsingStrategy.printFlags(Dictionary.toSortedCharArray(flags));
|
||||
}
|
||||
}
|
||||
|
||||
private record State(
|
||||
Map<String, Set<FlagSet>> stemToFlags,
|
||||
int underGenerated,
|
||||
int overGenerated,
|
||||
int forbidden) {}
|
||||
}
|
|
@ -28,6 +28,7 @@ import java.text.ParseException;
|
|||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -275,7 +276,9 @@ public class TestDictionary extends LuceneTestCase {
|
|||
DictEntries simpleNoun = dic.lookupEntries("simplenoun");
|
||||
assertEquals(1, simpleNoun.size());
|
||||
assertEquals(Collections.emptyList(), simpleNoun.getMorphologicalValues(0, "aa:"));
|
||||
assertEquals(Collections.singletonList("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
|
||||
assertEquals(List.of("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
|
||||
assertEquals(List.of("42"), simpleNoun.get(0).getMorphologicalValues("fr:"));
|
||||
assertEquals("A", simpleNoun.get(0).getFlags());
|
||||
|
||||
DictEntries lay = dic.lookupEntries("lay");
|
||||
String actual =
|
||||
|
|
|
@ -24,8 +24,13 @@ import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
|
|||
import java.io.IOException;
|
||||
import java.text.ParseException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.CancellationException;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -72,9 +77,134 @@ public class TestHunspell extends LuceneTestCase {
|
|||
|
||||
@Test
|
||||
public void testStemmingApi() throws Exception {
|
||||
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
|
||||
Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||
Hunspell hunspell = loadNoTimeout("simple");
|
||||
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
|
||||
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnalysisApi() throws Exception {
|
||||
Hunspell hunspell = loadNoTimeout("base");
|
||||
assertEquals(hunspell.analyzeSimpleWord("nonexistent"), List.of());
|
||||
AffixedWord word = hunspell.analyzeSimpleWord("recreated").get(0);
|
||||
checkAffixedWord(word, "create", List.of("A"), List.of("D"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnalysisSeveralSuffixes() throws Exception {
|
||||
Hunspell hunspell = loadNoTimeout("needaffix5");
|
||||
AffixedWord word = hunspell.analyzeSimpleWord("pseudoprefoopseudosufbar").get(0);
|
||||
checkAffixedWord(word, "foo", List.of("C"), List.of("B", "A"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnalysisFlagLong() throws Exception {
|
||||
AffixedWord word = loadNoTimeout("flaglong").analyzeSimpleWord("foos").get(0);
|
||||
checkAffixedWord(word, "foo", List.of(), List.of("Y1"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnalysisFlagNum() throws Exception {
|
||||
AffixedWord word = loadNoTimeout("flagnum").analyzeSimpleWord("foos").get(0);
|
||||
checkAffixedWord(word, "foo", List.of(), List.of("65000"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnalysisMorphData() throws Exception {
|
||||
List<AffixedWord> words = loadNoTimeout("morphdata").analyzeSimpleWord("works");
|
||||
assertEquals(2, words.size());
|
||||
AffixedWord verb =
|
||||
words.get(words.get(0).getDictEntry().getMorphologicalData().contains("verb") ? 0 : 1);
|
||||
AffixedWord noun = words.get(words.get(0) != verb ? 0 : 1);
|
||||
assertNotNull(verb);
|
||||
assertNotNull(noun);
|
||||
checkAffixedWord(verb, "work", List.of(), List.of("A"));
|
||||
checkAffixedWord(noun, "work", List.of(), List.of("B"));
|
||||
|
||||
assertEquals(List.of("worknoun"), noun.getDictEntry().getMorphologicalValues("st:"));
|
||||
assertEquals(List.of("workverb"), verb.getDictEntry().getMorphologicalValues("st:"));
|
||||
assertEquals("st:worknoun", noun.getDictEntry().getMorphologicalData());
|
||||
assertEquals("st:workverb", verb.getDictEntry().getMorphologicalData());
|
||||
}
|
||||
|
||||
private void checkAffixedWord(
|
||||
AffixedWord word, String stem, List<String> prefixFlags, List<String> suffixFlags) {
|
||||
assertEquals(stem, word.getDictEntry().getStem());
|
||||
assertEquals(prefixFlags, word.getPrefixes().stream().map(AffixedWord.Affix::getFlag).toList());
|
||||
assertEquals(suffixFlags, word.getSuffixes().stream().map(AffixedWord.Affix::getFlag).toList());
|
||||
}
|
||||
|
||||
private Hunspell loadNoTimeout(String name) throws Exception {
|
||||
Dictionary dictionary = loadDictionary(false, name + ".aff", name + ".dic");
|
||||
return new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExpandRootApi() throws Exception {
|
||||
Hunspell h = loadNoTimeout("base");
|
||||
String[] createFormsBase = {
|
||||
"create", "created", "creates", "creating", "creation", "creations"
|
||||
};
|
||||
List<String> expected =
|
||||
Stream.concat(
|
||||
Stream.of(createFormsBase).flatMap(s -> Stream.of(s, "pro" + s, "re" + s)),
|
||||
Stream.of("creative"))
|
||||
.sorted()
|
||||
.toList();
|
||||
|
||||
Map<String, AffixedWord> expanded =
|
||||
TestSpellChecking.checkExpansionGeneratesCorrectWords(h, "create", "base").stream()
|
||||
.collect(Collectors.toMap(w -> w.getWord(), w -> w));
|
||||
assertEquals(expected, expanded.keySet().stream().sorted().toList());
|
||||
|
||||
checkAffixedWord(expanded.get("created"), "create", List.of(), List.of("D"));
|
||||
checkAffixedWord(expanded.get("recreated"), "create", List.of("A"), List.of("D"));
|
||||
|
||||
WordFormGenerator generator = new WordFormGenerator(h.dictionary);
|
||||
List<AffixedWord> overrideFlag = generator.getAllWordForms("create", "U", () -> {});
|
||||
assertEquals(
|
||||
Set.of("create", "uncreate"),
|
||||
overrideFlag.stream().map(w -> w.getWord()).collect(Collectors.toSet()));
|
||||
|
||||
List<AffixedWord> nonExistentRoot = generator.getAllWordForms("form", "S", () -> {});
|
||||
assertEquals(
|
||||
Set.of("form", "forms"),
|
||||
nonExistentRoot.stream().map(w -> w.getWord()).collect(Collectors.toSet()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompressingApi() throws Exception {
|
||||
Hunspell h = loadNoTimeout("base");
|
||||
String[] createQuery = {"create", "created", "creates", "creating", "creation"};
|
||||
checkCompression(h, "toEdit=[create/DGNS], toAdd=[], extra=[]", createQuery);
|
||||
checkCompression(h, "toEdit=[created], toAdd=[creates], extra=[]", "creates", "created");
|
||||
checkCompression(h, "toEdit=[], toAdd=[creation/S], extra=[]", "creation", "creations");
|
||||
checkCompression(h, "toEdit=[], toAdd=[abc, def], extra=[]", "abc", "def");
|
||||
checkCompression(h, "toEdit=[], toAdd=[form/S], extra=[]", "form", "forms");
|
||||
|
||||
checkCompression(
|
||||
loadNoTimeout("compress"), "toEdit=[], toAdd=[form/X], extra=[forms]", "form", "formx");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompressingIsMinimal() throws Exception {
|
||||
Hunspell h = loadNoTimeout("compress");
|
||||
checkCompression(
|
||||
h, "toEdit=[], toAdd=[form/GS], extra=[]", "formings", "forming", "form", "forms");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompressingWithProhibition() throws Exception {
|
||||
WordFormGenerator gen = new WordFormGenerator(loadNoTimeout("compress").dictionary);
|
||||
assertEquals(
|
||||
"toEdit=[], toAdd=[form/S], extra=[]",
|
||||
gen.compress(List.of("form", "forms"), Set.of("formx"), () -> {}).internalsToString());
|
||||
assertEquals(
|
||||
"toEdit=[], toAdd=[form, formx], extra=[]",
|
||||
gen.compress(List.of("form", "formx"), Set.of("forms"), () -> {}).internalsToString());
|
||||
}
|
||||
|
||||
private void checkCompression(Hunspell h, String expected, String... words) {
|
||||
assertEquals(expected, h.compress(List.of(words)).internalsToString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,8 +21,12 @@ import java.io.InputStream;
|
|||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.text.ParseException;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
@ -97,6 +101,10 @@ public class TestSpellChecking extends LuceneTestCase {
|
|||
doTest("compoundflag");
|
||||
}
|
||||
|
||||
public void testFlagUtf8() throws Exception {
|
||||
doTest("flagutf8");
|
||||
}
|
||||
|
||||
public void testCheckCompoundCase() throws Exception {
|
||||
doTest("checkcompoundcase");
|
||||
}
|
||||
|
@ -230,13 +238,15 @@ public class TestSpellChecking extends LuceneTestCase {
|
|||
}
|
||||
|
||||
protected void doTest(String name) throws Exception {
|
||||
//noinspection ConstantConditions
|
||||
checkSpellCheckerExpectations(
|
||||
Path.of(getClass().getResource(name + ".aff").toURI()).getParent().resolve(name));
|
||||
}
|
||||
|
||||
static void checkSpellCheckerExpectations(Path basePath) throws IOException, ParseException {
|
||||
InputStream affixStream = Files.newInputStream(Path.of(basePath.toString() + ".aff"));
|
||||
InputStream dictStream = Files.newInputStream(Path.of(basePath.toString() + ".dic"));
|
||||
Path dicFile = Path.of(basePath + ".dic");
|
||||
InputStream dictStream = Files.newInputStream(dicFile);
|
||||
|
||||
Hunspell speller;
|
||||
try {
|
||||
|
@ -273,5 +283,80 @@ public class TestSpellChecking extends LuceneTestCase {
|
|||
} else {
|
||||
assertFalse(".sug file without .wrong file!", Files.exists(sug));
|
||||
}
|
||||
|
||||
Set<String> everythingGenerated = expandWholeDictionary(dicFile, speller);
|
||||
if (everythingGenerated != null && !speller.dictionary.mayNeedInputCleaning()) {
|
||||
checkGoodSugWordsAreGenerated(speller, good, sug, everythingGenerated);
|
||||
}
|
||||
}
|
||||
|
||||
private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
|
||||
Set<String> everythingGenerated = new HashSet<>();
|
||||
boolean generatedEverything = true;
|
||||
try (Stream<String> lines = Files.lines(dic, speller.dictionary.decoder.charset())) {
|
||||
for (String line : lines.skip(1).toList()) {
|
||||
int len = (int) line.chars().takeWhile(c -> !Character.isWhitespace(c) && c != '/').count();
|
||||
String word = line.substring(0, len).trim();
|
||||
if (word.isEmpty() || word.contains("\\")) {
|
||||
generatedEverything = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
List<AffixedWord> expanded =
|
||||
checkExpansionGeneratesCorrectWords(speller, word, dic.toString());
|
||||
expanded.forEach(w -> everythingGenerated.add(w.getWord().toLowerCase(Locale.ROOT)));
|
||||
}
|
||||
}
|
||||
return generatedEverything ? everythingGenerated : null;
|
||||
}
|
||||
|
||||
private static void checkGoodSugWordsAreGenerated(
|
||||
Hunspell speller, Path good, Path sug, Set<String> everythingGenerated) throws IOException {
|
||||
Set<String> goodWords = new HashSet<>();
|
||||
if (Files.exists(good)) {
|
||||
Files.readAllLines(good).stream().map(String::trim).forEach(goodWords::add);
|
||||
}
|
||||
if (Files.exists(sug)) {
|
||||
Files.readAllLines(sug).stream()
|
||||
.flatMap(line -> Stream.of(line.split(", ")))
|
||||
.map(String::trim)
|
||||
.filter(s -> !s.contains(" "))
|
||||
.forEach(goodWords::add);
|
||||
}
|
||||
|
||||
goodWords.removeAll(everythingGenerated);
|
||||
goodWords.removeIf(s -> !s.equals(s.toLowerCase(Locale.ROOT)));
|
||||
goodWords.removeIf(s -> speller.analyzeSimpleWord(s).isEmpty());
|
||||
|
||||
assertTrue("Some *.good/sug words weren't generated: " + goodWords, goodWords.isEmpty());
|
||||
}
|
||||
|
||||
static List<AffixedWord> checkExpansionGeneratesCorrectWords(
|
||||
Hunspell hunspell, String stem, String baseName) {
|
||||
List<AffixedWord> expanded = hunspell.getAllWordForms(stem);
|
||||
Set<AffixedWord> misspelled = new HashSet<>();
|
||||
for (AffixedWord word : expanded) {
|
||||
if (!hunspell.spell(word.getWord()) || hunspell.analyzeSimpleWord(word.getWord()).isEmpty()) {
|
||||
misspelled.add(word);
|
||||
}
|
||||
}
|
||||
if (!misspelled.isEmpty()) {
|
||||
fail("Misspelled words generated in " + baseName + ": " + misspelled);
|
||||
}
|
||||
|
||||
if (expanded.stream().anyMatch(e -> e.getWord().equals(stem))) {
|
||||
EntrySuggestion suggestion =
|
||||
hunspell.compress(expanded.stream().map(AffixedWord::getWord).toList());
|
||||
if (suggestion != null) {
|
||||
String message =
|
||||
("Compression suggests a different stem from the original " + stem)
|
||||
+ (" in " + baseName + ":" + suggestion);
|
||||
assertTrue(
|
||||
message,
|
||||
suggestion.getEntriesToEdit().stream().anyMatch(e -> e.getStem().equals(stem)));
|
||||
}
|
||||
}
|
||||
|
||||
return expanded;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
FORBIDDENWORD *
|
||||
|
||||
SFX G Y 1
|
||||
SFX G 0 ing/S .
|
||||
|
||||
SFX J Y 1
|
||||
SFX J 0 ings .
|
||||
|
||||
SFX S Y 1
|
||||
SFX S 0 s .
|
||||
|
||||
SFX X Y 2
|
||||
SFX X 0 s .
|
||||
SFX X 0 x .
|
|
@ -0,0 +1,2 @@
|
|||
1
|
||||
word
|
|
@ -0,0 +1,15 @@
|
|||
# UTF-8 flags
|
||||
FLAG UTF-8
|
||||
|
||||
SFX A Y 1
|
||||
SFX A 0 s/ÖüÜ .
|
||||
#SFX A 0 s/ÖüÖÜ .
|
||||
|
||||
SFX Ö Y 1
|
||||
SFX Ö 0 bar .
|
||||
|
||||
SFX ü Y 1
|
||||
SFX ü 0 baz .
|
||||
|
||||
PFX Ü Y 1
|
||||
PFX Ü 0 un .
|
|
@ -0,0 +1,2 @@
|
|||
1
|
||||
foo/AÜ
|
|
@ -0,0 +1,8 @@
|
|||
foo
|
||||
foos
|
||||
foosbar
|
||||
foosbaz
|
||||
unfoo
|
||||
unfoos
|
||||
unfoosbar
|
||||
unfoosbaz
|
|
@ -9,3 +9,6 @@ COMPOUNDFLAG Y
|
|||
|
||||
SFX A Y 1
|
||||
SFX A 0 s .
|
||||
|
||||
SFX s N 1
|
||||
SFX s 0 os .
|
|
@ -1,4 +1,4 @@
|
|||
11
|
||||
14
|
||||
foo/S
|
||||
foo/YX
|
||||
bar/YS
|
||||
|
@ -11,3 +11,5 @@ cm
|
|||
Cm/X
|
||||
SIPS/X
|
||||
Sip/A
|
||||
iPod/s
|
||||
iPodos/X
|
Loading…
Reference in New Issue