mirror of https://github.com/apache/lucene.git
LUCENE-9766: Hunspell: add API for retrieving dictionary morphologica… (#2363)
This commit is contained in:
parent
ee447d1516
commit
f1a1165ac8
|
@ -0,0 +1,49 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * An object representing homonym dictionary entries. Note that the order of entries here may differ
 * from the order in the *.dic file!
 *
 * @see Dictionary#lookupEntries
 */
public interface DictEntries {
  /**
   * Returns the number of homonyms grouped in this object.
   *
   * @return a positive number of dictionary entries with the same word. Most often it's 1 (unless
   *     there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can
   *     be passed into other methods of this class.
   */
  int size();

  /**
   * Returns all morphological fields of a single homonym as one string.
   *
   * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
   * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding
   *     {@code ph:}) associated with the homonym at the given entry index, or an empty string if
   *     the homonym has no such fields
   */
  String getMorphologicalData(int entryIndex);

  /**
   * Returns the values of the morphological fields of a single homonym that have the given key.
   *
   * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive)
   * @param key the key in the form {@code kk:} by which to filter the morphological fields
   * @return the values (of {@code vvvvvv} form) of morphological fields with the given key
   *     associated with the homonym at the given entry index
   */
  List<String> getMorphologicalValues(int entryIndex, String key);
}
|
|
@ -44,7 +44,6 @@ import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
@ -83,7 +82,6 @@ public class Dictionary {
|
||||||
// TODO: really for suffixes we should reverse the automaton and run them backwards
|
// TODO: really for suffixes we should reverse the automaton and run them backwards
|
||||||
private static final String PREFIX_CONDITION_REGEX = "%s.*";
|
private static final String PREFIX_CONDITION_REGEX = "%s.*";
|
||||||
private static final String SUFFIX_CONDITION_REGEX = ".*%s";
|
private static final String SUFFIX_CONDITION_REGEX = ".*%s";
|
||||||
private static final Pattern MORPH_KEY_PATTERN = Pattern.compile("\\s+(?=\\p{Alpha}{2}:)");
|
|
||||||
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
|
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
|
||||||
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
|
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
|
||||||
|
|
||||||
|
@ -136,15 +134,13 @@ public class Dictionary {
|
||||||
private String[] morphAliases;
|
private String[] morphAliases;
|
||||||
private int morphAliasCount = 0;
|
private int morphAliasCount = 0;
|
||||||
|
|
||||||
// st: morphological entries (either directly, or aliased from AM)
|
final List<String> morphData = new ArrayList<>(Collections.singletonList("")); // empty data at 0
|
||||||
private String[] stemExceptions = new String[8];
|
|
||||||
private int stemExceptionCount = 0;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* we set this during sorting, so we know to add an extra FST output. when set, some words have
|
* we set this during sorting, so we know to add an extra int (index in {@link #morphData}) to FST
|
||||||
* exceptional stems, and the last entry is a pointer to stemExceptions
|
* output
|
||||||
*/
|
*/
|
||||||
boolean hasStemExceptions;
|
boolean hasCustomMorphData;
|
||||||
|
|
||||||
boolean ignoreCase;
|
boolean ignoreCase;
|
||||||
boolean checkSharpS;
|
boolean checkSharpS;
|
||||||
|
@ -274,7 +270,7 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
|
|
||||||
int formStep() {
|
int formStep() {
|
||||||
return hasStemExceptions ? 2 : 1;
|
return hasCustomMorphData ? 2 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Looks up Hunspell word forms from the dictionary */
|
/** Looks up Hunspell word forms from the dictionary */
|
||||||
|
@ -543,6 +539,44 @@ public class Dictionary {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param root a string to look up in the dictionary. No case conversion or affix removal is
|
||||||
|
* performed. To get the possible roots of any word, you may call {@link
|
||||||
|
* Hunspell#getRoots(String)}
|
||||||
|
* @return the dictionary entries for the given root, or {@code null} if there's none
|
||||||
|
*/
|
||||||
|
public DictEntries lookupEntries(String root) {
|
||||||
|
IntsRef forms = lookupWord(root.toCharArray(), 0, root.length());
|
||||||
|
if (forms == null) return null;
|
||||||
|
|
||||||
|
return new DictEntries() {
|
||||||
|
@Override
|
||||||
|
public int size() {
|
||||||
|
return forms.length / (hasCustomMorphData ? 2 : 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getMorphologicalData(int entryIndex) {
|
||||||
|
if (!hasCustomMorphData) return "";
|
||||||
|
return morphData.get(forms.ints[forms.offset + entryIndex * 2 + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<String> getMorphologicalValues(int entryIndex, String key) {
|
||||||
|
assert key.length() == 3;
|
||||||
|
assert key.charAt(2) == ':';
|
||||||
|
|
||||||
|
String fields = getMorphologicalData(entryIndex);
|
||||||
|
if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList();
|
||||||
|
|
||||||
|
return Arrays.stream(fields.split(" "))
|
||||||
|
.filter(s -> s.startsWith(key))
|
||||||
|
.map(s -> s.substring(3))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
static String extractLanguageCode(String isoCode) {
|
static String extractLanguageCode(String isoCode) {
|
||||||
int underscore = isoCode.indexOf("_");
|
int underscore = isoCode.indexOf("_");
|
||||||
return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
|
return underscore < 0 ? isoCode : isoCode.substring(0, underscore);
|
||||||
|
@ -1024,11 +1058,13 @@ public class Dictionary {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
line = unescapeEntry(line);
|
line = unescapeEntry(line);
|
||||||
// if we havent seen any stem exceptions, try to parse one
|
// if we haven't seen any custom morphological data, try to parse one
|
||||||
if (!hasStemExceptions) {
|
if (!hasCustomMorphData) {
|
||||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||||
if (morphStart >= 0 && morphStart < line.length()) {
|
if (morphStart >= 0 && morphStart < line.length()) {
|
||||||
hasStemExceptions = hasStemException(line.substring(morphStart + 1));
|
String data = line.substring(morphStart + 1);
|
||||||
|
hasCustomMorphData =
|
||||||
|
splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1156,6 +1192,8 @@ public class Dictionary {
|
||||||
Directory tempDir, String sorted, FlagEnumerator flags) throws IOException {
|
Directory tempDir, String sorted, FlagEnumerator flags) throws IOException {
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
|
|
||||||
|
Map<String, Integer> morphIndices = new HashMap<>();
|
||||||
|
|
||||||
EntryGrouper grouper = new EntryGrouper(flags);
|
EntryGrouper grouper = new EntryGrouper(flags);
|
||||||
|
|
||||||
try (ByteSequencesReader reader =
|
try (ByteSequencesReader reader =
|
||||||
|
@ -1195,20 +1233,17 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
entry = line.substring(0, flagSep);
|
entry = line.substring(0, flagSep);
|
||||||
}
|
}
|
||||||
// we possibly have morphological data
|
|
||||||
int stemExceptionID = 0;
|
int morphDataID = 0;
|
||||||
if (end + 1 < line.length()) {
|
if (end + 1 < line.length()) {
|
||||||
String morphData = line.substring(end + 1);
|
List<String> morphFields = readMorphFields(entry, line.substring(end + 1));
|
||||||
for (String datum : splitMorphData(morphData)) {
|
if (!morphFields.isEmpty()) {
|
||||||
if (datum.startsWith("st:")) {
|
morphFields.sort(Comparator.naturalOrder());
|
||||||
stemExceptionID = addStemException(datum.substring(3));
|
morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields));
|
||||||
} else if (datum.startsWith("ph:") && datum.length() > 3) {
|
|
||||||
addPhoneticRepEntries(entry, datum.substring(3));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
grouper.add(entry, wordForm, stemExceptionID);
|
grouper.add(entry, wordForm, morphDataID);
|
||||||
}
|
}
|
||||||
|
|
||||||
// finalize last entry
|
// finalize last entry
|
||||||
|
@ -1224,10 +1259,29 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private int addStemException(String stemException) {
|
private List<String> readMorphFields(String word, String unparsed) {
|
||||||
stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
|
List<String> morphFields = null;
|
||||||
stemExceptions[stemExceptionCount++] = stemException;
|
for (String datum : splitMorphData(unparsed)) {
|
||||||
return stemExceptionCount; // we use '0' to indicate no exception for the form
|
if (datum.startsWith("ph:")) {
|
||||||
|
addPhoneticRepEntries(word, datum.substring(3));
|
||||||
|
} else {
|
||||||
|
if (morphFields == null) morphFields = new ArrayList<>(1);
|
||||||
|
morphFields.add(datum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return morphFields == null ? Collections.emptyList() : morphFields;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int addMorphFields(Map<String, Integer> indices, String morphFields) {
|
||||||
|
Integer alreadyCached = indices.get(morphFields);
|
||||||
|
if (alreadyCached != null) {
|
||||||
|
return alreadyCached;
|
||||||
|
}
|
||||||
|
|
||||||
|
int index = morphData.size();
|
||||||
|
indices.put(morphFields, index);
|
||||||
|
morphData.add(morphFields);
|
||||||
|
return index;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addPhoneticRepEntries(String word, String ph) {
|
private void addPhoneticRepEntries(String word, String ph) {
|
||||||
|
@ -1278,7 +1332,7 @@ public class Dictionary {
|
||||||
final FSTCompiler<IntsRef> words =
|
final FSTCompiler<IntsRef> words =
|
||||||
new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
|
new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
|
||||||
private final List<char[]> group = new ArrayList<>();
|
private final List<char[]> group = new ArrayList<>();
|
||||||
private final List<Integer> stemExceptionIDs = new ArrayList<>();
|
private final List<Integer> morphDataIDs = new ArrayList<>();
|
||||||
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
|
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
private String currentEntry = null;
|
private String currentEntry = null;
|
||||||
private final FlagEnumerator flagEnumerator;
|
private final FlagEnumerator flagEnumerator;
|
||||||
|
@ -1287,7 +1341,7 @@ public class Dictionary {
|
||||||
this.flagEnumerator = flagEnumerator;
|
this.flagEnumerator = flagEnumerator;
|
||||||
}
|
}
|
||||||
|
|
||||||
void add(String entry, char[] flags, int stemExceptionID) throws IOException {
|
void add(String entry, char[] flags, int morphDataID) throws IOException {
|
||||||
if (!entry.equals(currentEntry)) {
|
if (!entry.equals(currentEntry)) {
|
||||||
if (currentEntry != null) {
|
if (currentEntry != null) {
|
||||||
if (entry.compareTo(currentEntry) < 0) {
|
if (entry.compareTo(currentEntry) < 0) {
|
||||||
|
@ -1299,8 +1353,8 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
|
|
||||||
group.add(flags);
|
group.add(flags);
|
||||||
if (hasStemExceptions) {
|
if (hasCustomMorphData) {
|
||||||
stemExceptionIDs.add(stemExceptionID);
|
morphDataIDs.add(morphDataID);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1322,8 +1376,8 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
|
|
||||||
currentOrds.append(flagEnumerator.add(flags));
|
currentOrds.append(flagEnumerator.add(flags));
|
||||||
if (hasStemExceptions) {
|
if (hasCustomMorphData) {
|
||||||
currentOrds.append(stemExceptionIDs.get(i));
|
currentOrds.append(morphDataIDs.get(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1331,7 +1385,7 @@ public class Dictionary {
|
||||||
words.add(scratchInts.get(), currentOrds.get());
|
words.add(scratchInts.get(), currentOrds.get());
|
||||||
|
|
||||||
group.clear();
|
group.clear();
|
||||||
stemExceptionIDs.clear();
|
morphDataIDs.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1365,10 +1419,6 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
String getStemException(int id) {
|
|
||||||
return stemExceptions[id - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
private void parseMorphAlias(String line) {
|
private void parseMorphAlias(String line) {
|
||||||
if (morphAliases == null) {
|
if (morphAliases == null) {
|
||||||
// first line should be the aliases count
|
// first line should be the aliases count
|
||||||
|
@ -1380,15 +1430,6 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasStemException(String morphData) {
|
|
||||||
for (String datum : splitMorphData(morphData)) {
|
|
||||||
if (datum.startsWith("st:")) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> splitMorphData(String morphData) {
|
private List<String> splitMorphData(String morphData) {
|
||||||
// first see if it's an alias
|
// first see if it's an alias
|
||||||
if (morphAliasCount > 0) {
|
if (morphAliasCount > 0) {
|
||||||
|
@ -1401,9 +1442,13 @@ public class Dictionary {
|
||||||
if (morphData.isBlank()) {
|
if (morphData.isBlank()) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
return Arrays.stream(MORPH_KEY_PATTERN.split(morphData))
|
return Arrays.stream(morphData.split("\\s+"))
|
||||||
.map(String::trim)
|
.filter(
|
||||||
.filter(s -> !s.isBlank())
|
s ->
|
||||||
|
s.length() > 3
|
||||||
|
&& Character.isLetter(s.charAt(0))
|
||||||
|
&& Character.isLetter(s.charAt(1))
|
||||||
|
&& s.charAt(2) == ':')
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -156,7 +156,7 @@ public class Hunspell {
|
||||||
length,
|
length,
|
||||||
originalCase,
|
originalCase,
|
||||||
context,
|
context,
|
||||||
(stem, formID, stemException) -> {
|
(stem, formID, morphDataId) -> {
|
||||||
if (acceptsStem(formID)) {
|
if (acceptsStem(formID)) {
|
||||||
result[0] = new Root<>(stem, formID);
|
result[0] = new Root<>(stem, formID);
|
||||||
}
|
}
|
||||||
|
@ -253,6 +253,24 @@ public class Hunspell {
|
||||||
return cr1.toString().equalsIgnoreCase(cr2.toString());
|
return cr1.toString().equalsIgnoreCase(cr2.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find all roots that could result in the given word after case conversion and adding affixes.
|
||||||
|
* This corresponds to the original {@code hunspell -s} (stemming) functionality.
|
||||||
|
*
|
||||||
|
* <p>Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are
|
||||||
|
* still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic
|
||||||
|
* file (but differ from some existing entries in case). No roots are returned for compound words.
|
||||||
|
*
|
||||||
|
* <p>The returned roots may be used to retrieve morphological data via {@link
|
||||||
|
* Dictionary#lookupEntries}.
|
||||||
|
*/
|
||||||
|
public List<String> getRoots(String word) {
|
||||||
|
return stemmer.stem(word).stream()
|
||||||
|
.map(CharsRef::toString)
|
||||||
|
.distinct()
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
private class CompoundPart {
|
private class CompoundPart {
|
||||||
final CompoundPart prev;
|
final CompoundPart prev;
|
||||||
final int index, length;
|
final int index, length;
|
||||||
|
|
|
@ -343,23 +343,28 @@ final class Stemmer {
|
||||||
* @param stem the text of the found dictionary entry
|
* @param stem the text of the found dictionary entry
|
||||||
* @param formID internal id of the dictionary entry, e.g. to be used in {@link
|
* @param formID internal id of the dictionary entry, e.g. to be used in {@link
|
||||||
* Dictionary#hasFlag(int, char)}
|
* Dictionary#hasFlag(int, char)}
|
||||||
* @param stemException "st:" morphological data if present, {@code null} otherwise
|
* @param morphDataId the id of the custom morphological data (0 if none), to be used with
|
||||||
|
* {@link Dictionary#morphData}
|
||||||
* @return whether the processing should be continued
|
* @return whether the processing should be continued
|
||||||
*/
|
*/
|
||||||
boolean processRoot(CharsRef stem, int formID, String stemException);
|
boolean processRoot(CharsRef stem, int formID, int morphDataId);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String stemException(IntsRef forms, int formIndex) {
|
private String stemException(int morphDataId) {
|
||||||
if (dictionary.hasStemExceptions) {
|
if (morphDataId > 0) {
|
||||||
int exceptionID = forms.ints[forms.offset + formIndex + 1];
|
String data = dictionary.morphData.get(morphDataId);
|
||||||
if (exceptionID > 0) {
|
int start = data.startsWith("st:") ? 0 : data.indexOf(" st:");
|
||||||
return dictionary.getStemException(exceptionID);
|
if (start >= 0) {
|
||||||
|
int nextSpace = data.indexOf(' ', start + 3);
|
||||||
|
return data.substring(start + 3, nextSpace < 0 ? data.length() : nextSpace);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private CharsRef newStem(CharsRef stem, String exception) {
|
private CharsRef newStem(CharsRef stem, int morphDataId) {
|
||||||
|
String exception = stemException(morphDataId);
|
||||||
|
|
||||||
if (dictionary.needsOutputCleaning) {
|
if (dictionary.needsOutputCleaning) {
|
||||||
scratchSegment.setLength(0);
|
scratchSegment.setLength(0);
|
||||||
if (exception != null) {
|
if (exception != null) {
|
||||||
|
@ -759,7 +764,8 @@ final class Stemmer {
|
||||||
private boolean callProcessor(
|
private boolean callProcessor(
|
||||||
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
|
char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
|
||||||
CharsRef stem = new CharsRef(word, offset, length);
|
CharsRef stem = new CharsRef(word, offset, length);
|
||||||
return processor.processRoot(stem, forms.ints[forms.offset + i], stemException(forms, i));
|
int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0;
|
||||||
|
return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId);
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
|
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) {
|
||||||
|
|
|
@ -43,6 +43,11 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
||||||
|
|
||||||
static void init(boolean ignoreCase, String affix, String... dictionaries)
|
static void init(boolean ignoreCase, String affix, String... dictionaries)
|
||||||
throws IOException, ParseException {
|
throws IOException, ParseException {
|
||||||
|
stemmer = new Stemmer(loadDictionary(ignoreCase, affix, dictionaries));
|
||||||
|
}
|
||||||
|
|
||||||
|
static Dictionary loadDictionary(boolean ignoreCase, String affix, String... dictionaries)
|
||||||
|
throws IOException, ParseException {
|
||||||
if (dictionaries.length == 0) {
|
if (dictionaries.length == 0) {
|
||||||
throw new IllegalArgumentException("there must be at least one dictionary");
|
throw new IllegalArgumentException("there must be at least one dictionary");
|
||||||
}
|
}
|
||||||
|
@ -52,7 +57,7 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
||||||
throw new FileNotFoundException("file not found: " + affix);
|
throw new FileNotFoundException("file not found: " + affix);
|
||||||
}
|
}
|
||||||
|
|
||||||
InputStream dictStreams[] = new InputStream[dictionaries.length];
|
InputStream[] dictStreams = new InputStream[dictionaries.length];
|
||||||
for (int i = 0; i < dictionaries.length; i++) {
|
for (int i = 0; i < dictionaries.length; i++) {
|
||||||
dictStreams[i] = StemmerTestBase.class.getResourceAsStream(dictionaries[i]);
|
dictStreams[i] = StemmerTestBase.class.getResourceAsStream(dictionaries[i]);
|
||||||
if (dictStreams[i] == null) {
|
if (dictStreams[i] == null) {
|
||||||
|
@ -61,14 +66,12 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Dictionary dictionary =
|
return new Dictionary(
|
||||||
new Dictionary(
|
|
||||||
new ByteBuffersDirectory(),
|
new ByteBuffersDirectory(),
|
||||||
"dictionary",
|
"dictionary",
|
||||||
affixStream,
|
affixStream,
|
||||||
Arrays.asList(dictStreams),
|
Arrays.asList(dictStreams),
|
||||||
ignoreCase);
|
ignoreCase);
|
||||||
stemmer = new Stemmer(dictionary);
|
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.closeWhileHandlingException(affixStream);
|
IOUtils.closeWhileHandlingException(affixStream);
|
||||||
IOUtils.closeWhileHandlingException(dictStreams);
|
IOUtils.closeWhileHandlingException(dictStreams);
|
||||||
|
@ -80,7 +83,7 @@ public abstract class StemmerTestBase extends LuceneTestCase {
|
||||||
Arrays.sort(expected);
|
Arrays.sort(expected);
|
||||||
|
|
||||||
List<CharsRef> stems = stemmer.stem(s);
|
List<CharsRef> stems = stemmer.stem(s);
|
||||||
String actual[] = new String[stems.size()];
|
String[] actual = new String[stems.size()];
|
||||||
for (int i = 0; i < actual.length; i++) {
|
for (int i = 0; i < actual.length; i++) {
|
||||||
actual[i] = stems.get(i).toString();
|
actual[i] = stems.get(i).toString();
|
||||||
}
|
}
|
||||||
|
|
|
@ -205,6 +205,7 @@ public class TestAllDictionaries extends LuceneTestCase {
|
||||||
+ ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
|
+ ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) + ", ")
|
||||||
+ ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
|
+ ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ")
|
||||||
+ ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
|
+ ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ")
|
||||||
|
+ ("morphData=" + RamUsageTester.humanSizeOf(dic.morphData) + ", ")
|
||||||
+ ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
|
+ ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ")
|
||||||
+ ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
|
+ ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")");
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,10 @@ import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
@ -244,6 +248,35 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
assertEquals(src, new String(strategy.parseFlags(asAscii)));
|
assertEquals(src, new String(strategy.parseFlags(asAscii)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCustomMorphologicalData() throws IOException, ParseException {
|
||||||
|
Dictionary dic = loadDictionary("morphdata.aff", "morphdata.dic");
|
||||||
|
assertNull(dic.lookupEntries("nonexistent"));
|
||||||
|
|
||||||
|
DictEntries simpleNoun = dic.lookupEntries("simplenoun");
|
||||||
|
assertEquals(1, simpleNoun.size());
|
||||||
|
assertEquals(Collections.emptyList(), simpleNoun.getMorphologicalValues(0, "aa:"));
|
||||||
|
assertEquals(Collections.singletonList("42"), simpleNoun.getMorphologicalValues(0, "fr:"));
|
||||||
|
|
||||||
|
DictEntries lay = dic.lookupEntries("lay");
|
||||||
|
String actual =
|
||||||
|
IntStream.range(0, 3)
|
||||||
|
.mapToObj(lay::getMorphologicalData)
|
||||||
|
.sorted()
|
||||||
|
.collect(Collectors.joining("; "));
|
||||||
|
assertEquals("is:past_2 po:verb st:lie; is:present po:verb; po:noun", actual);
|
||||||
|
|
||||||
|
DictEntries sing = dic.lookupEntries("sing");
|
||||||
|
assertEquals(1, sing.size());
|
||||||
|
assertEquals(Arrays.asList("sang", "sung"), sing.getMorphologicalValues(0, "al:"));
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"al:abaléar po:verbo ts:transitiva",
|
||||||
|
dic.lookupEntries("unsupported1").getMorphologicalData(0));
|
||||||
|
|
||||||
|
assertEquals("", dic.lookupEntries("unsupported2").getMorphologicalData(0));
|
||||||
|
}
|
||||||
|
|
||||||
private Directory getDirectory() {
|
private Directory getDirectory() {
|
||||||
return newDirectory();
|
return newDirectory();
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,11 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.util.Collections;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TestStemmer extends StemmerTestBase {
|
public class TestStemmer extends StemmerTestBase {
|
||||||
|
|
||||||
|
@ -58,6 +62,13 @@ public class TestStemmer extends StemmerTestBase {
|
||||||
assertStemsTo("solr", "olr");
|
assertStemsTo("solr", "olr");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHunspellStemmingApi() throws IOException, ParseException {
|
||||||
|
Hunspell hunspell = new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"));
|
||||||
|
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
|
||||||
|
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
|
||||||
|
}
|
||||||
|
|
||||||
// some bogus stuff that should not stem (empty lists)!
|
// some bogus stuff that should not stem (empty lists)!
|
||||||
public void testBogusStems() {
|
public void testBogusStems() {
|
||||||
assertStemsTo("abs");
|
assertStemsTo("abs");
|
||||||
|
|
|
@ -1,6 +1,12 @@
|
||||||
5
|
11
|
||||||
feet/X st:foot
|
feet/X st:foot
|
||||||
work/A st:workverb
|
work/A st:workverb
|
||||||
work/B st:worknoun
|
work/B st:worknoun
|
||||||
notspecial
|
notspecial
|
||||||
simplenoun/A
|
simplenoun/A fr:42
|
||||||
|
sing al:sang al:sung
|
||||||
|
lay po:verb st:lie is:past_2
|
||||||
|
lay po:verb is:present
|
||||||
|
lay po:noun
|
||||||
|
unsupported1 po:verbo ts:transitiva / intransitiva / pronominal VOLG: t i pr al:abaléar
|
||||||
|
unsupported2 [CAT=nc,G=f,N=s]
|
Loading…
Reference in New Issue