diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java new file mode 100644 index 00000000000..d9174dcbc7e --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/DictEntries.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.util.List; + +/** + * An object representing homonym dictionary entries. Note that the order of entries here may differ + * from the order in the *.dic file! + * + * @see Dictionary#lookupEntries + */ +public interface DictEntries { + /** + * @return a positive number of dictionary entries with the same word. Most often it's 1 (unless + * there are homonyms). Entries are indexed from 0 to {@code size() - 1} and these indices can + * be passed into other methods of this class. 
+ */ + int size(); + + /** + * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive) + * @return morphological fields (of {@code kk:vvvvvv} form, sorted, space-separated, excluding + * {@code ph:}) associated with the homonym at the given entry index, or an empty string + */ + String getMorphologicalData(int entryIndex); + + /** + * @param entryIndex an index from 0 (inclusive) to {@link #size()} (exclusive) + * @param key the key in the form {@code kk:} by which to filter the morphological fields + * @return the values (of {@code vvvvvv} form) of morphological fields with the given key + * associated with the homonym at the given entry index + */ + List getMorphologicalValues(int entryIndex, String key); +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 59536fe4205..b65d287621d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -44,7 +44,6 @@ import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; -import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.Directory; @@ -83,7 +82,6 @@ public class Dictionary { // TODO: really for suffixes we should reverse the automaton and run them backwards private static final String PREFIX_CONDITION_REGEX = "%s.*"; private static final String SUFFIX_CONDITION_REGEX = ".*%s"; - private static final Pattern MORPH_KEY_PATTERN = Pattern.compile("\\s+(?=\\p{Alpha}{2}:)"); static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET); @@ -136,15 +134,13 @@ public class Dictionary { private String[] morphAliases; private int morphAliasCount = 0; - // 
st: morphological entries (either directly, or aliased from AM) - private String[] stemExceptions = new String[8]; - private int stemExceptionCount = 0; + final List morphData = new ArrayList<>(Collections.singletonList("")); // empty data at 0 /** - * we set this during sorting, so we know to add an extra FST output. when set, some words have - * exceptional stems, and the last entry is a pointer to stemExceptions + * we set this during sorting, so we know to add an extra int (index in {@link #morphData}) to FST + * output */ - boolean hasStemExceptions; + boolean hasCustomMorphData; boolean ignoreCase; boolean checkSharpS; @@ -274,7 +270,7 @@ public class Dictionary { } int formStep() { - return hasStemExceptions ? 2 : 1; + return hasCustomMorphData ? 2 : 1; } /** Looks up Hunspell word forms from the dictionary */ @@ -543,6 +539,44 @@ public class Dictionary { return false; } + /** + * @param root a string to look up in the dictionary. No case conversion or affix removal is + * performed. To get the possible roots of any word, you may call {@link + * Hunspell#getRoots(String)} + * @return the dictionary entries for the given root, or {@code null} if there's none + */ + public DictEntries lookupEntries(String root) { + IntsRef forms = lookupWord(root.toCharArray(), 0, root.length()); + if (forms == null) return null; + + return new DictEntries() { + @Override + public int size() { + return forms.length / (hasCustomMorphData ? 
2 : 1); + } + + @Override + public String getMorphologicalData(int entryIndex) { + if (!hasCustomMorphData) return ""; + return morphData.get(forms.ints[forms.offset + entryIndex * 2 + 1]); + } + + @Override + public List getMorphologicalValues(int entryIndex, String key) { + assert key.length() == 3; + assert key.charAt(2) == ':'; + + String fields = getMorphologicalData(entryIndex); + if (fields.isEmpty() || !fields.contains(key)) return Collections.emptyList(); + + return Arrays.stream(fields.split(" ")) + .filter(s -> s.startsWith(key)) + .map(s -> s.substring(3)) + .collect(Collectors.toList()); + } + }; + } + static String extractLanguageCode(String isoCode) { int underscore = isoCode.indexOf("_"); return underscore < 0 ? isoCode : isoCode.substring(0, underscore); @@ -1024,11 +1058,13 @@ public class Dictionary { continue; } line = unescapeEntry(line); - // if we havent seen any stem exceptions, try to parse one - if (!hasStemExceptions) { + // if we haven't seen any custom morphological data, try to parse one + if (!hasCustomMorphData) { int morphStart = line.indexOf(MORPH_SEPARATOR); if (morphStart >= 0 && morphStart < line.length()) { - hasStemExceptions = hasStemException(line.substring(morphStart + 1)); + String data = line.substring(morphStart + 1); + hasCustomMorphData = + splitMorphData(data).stream().anyMatch(s -> !s.startsWith("ph:")); } } @@ -1156,6 +1192,8 @@ public class Dictionary { Directory tempDir, String sorted, FlagEnumerator flags) throws IOException { boolean success = false; + Map morphIndices = new HashMap<>(); + EntryGrouper grouper = new EntryGrouper(flags); try (ByteSequencesReader reader = @@ -1195,20 +1233,17 @@ public class Dictionary { } entry = line.substring(0, flagSep); } - // we possibly have morphological data - int stemExceptionID = 0; + + int morphDataID = 0; if (end + 1 < line.length()) { - String morphData = line.substring(end + 1); - for (String datum : splitMorphData(morphData)) { - if (datum.startsWith("st:")) { - 
stemExceptionID = addStemException(datum.substring(3)); - } else if (datum.startsWith("ph:") && datum.length() > 3) { - addPhoneticRepEntries(entry, datum.substring(3)); - } + List morphFields = readMorphFields(entry, line.substring(end + 1)); + if (!morphFields.isEmpty()) { + morphFields.sort(Comparator.naturalOrder()); + morphDataID = addMorphFields(morphIndices, String.join(" ", morphFields)); } } - grouper.add(entry, wordForm, stemExceptionID); + grouper.add(entry, wordForm, morphDataID); } // finalize last entry @@ -1224,10 +1259,29 @@ public class Dictionary { } } - private int addStemException(String stemException) { - stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1); - stemExceptions[stemExceptionCount++] = stemException; - return stemExceptionCount; // we use '0' to indicate no exception for the form + private List readMorphFields(String word, String unparsed) { + List morphFields = null; + for (String datum : splitMorphData(unparsed)) { + if (datum.startsWith("ph:")) { + addPhoneticRepEntries(word, datum.substring(3)); + } else { + if (morphFields == null) morphFields = new ArrayList<>(1); + morphFields.add(datum); + } + } + return morphFields == null ? 
Collections.emptyList() : morphFields; + } + + private int addMorphFields(Map indices, String morphFields) { + Integer alreadyCached = indices.get(morphFields); + if (alreadyCached != null) { + return alreadyCached; + } + + int index = morphData.size(); + indices.put(morphFields, index); + morphData.add(morphFields); + return index; } private void addPhoneticRepEntries(String word, String ph) { @@ -1278,7 +1332,7 @@ public class Dictionary { final FSTCompiler words = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton()); private final List group = new ArrayList<>(); - private final List stemExceptionIDs = new ArrayList<>(); + private final List morphDataIDs = new ArrayList<>(); private final IntsRefBuilder scratchInts = new IntsRefBuilder(); private String currentEntry = null; private final FlagEnumerator flagEnumerator; @@ -1287,7 +1341,7 @@ public class Dictionary { this.flagEnumerator = flagEnumerator; } - void add(String entry, char[] flags, int stemExceptionID) throws IOException { + void add(String entry, char[] flags, int morphDataID) throws IOException { if (!entry.equals(currentEntry)) { if (currentEntry != null) { if (entry.compareTo(currentEntry) < 0) { @@ -1299,8 +1353,8 @@ public class Dictionary { } group.add(flags); - if (hasStemExceptions) { - stemExceptionIDs.add(stemExceptionID); + if (hasCustomMorphData) { + morphDataIDs.add(morphDataID); } } @@ -1322,8 +1376,8 @@ public class Dictionary { } currentOrds.append(flagEnumerator.add(flags)); - if (hasStemExceptions) { - currentOrds.append(stemExceptionIDs.get(i)); + if (hasCustomMorphData) { + currentOrds.append(morphDataIDs.get(i)); } } @@ -1331,7 +1385,7 @@ public class Dictionary { words.add(scratchInts.get(), currentOrds.get()); group.clear(); - stemExceptionIDs.clear(); + morphDataIDs.clear(); } } @@ -1365,10 +1419,6 @@ public class Dictionary { } } - String getStemException(int id) { - return stemExceptions[id - 1]; - } - private void parseMorphAlias(String line) { if 
(morphAliases == null) { // first line should be the aliases count @@ -1380,15 +1430,6 @@ public class Dictionary { } } - private boolean hasStemException(String morphData) { - for (String datum : splitMorphData(morphData)) { - if (datum.startsWith("st:")) { - return true; - } - } - return false; - } - private List splitMorphData(String morphData) { // first see if it's an alias if (morphAliasCount > 0) { @@ -1401,9 +1442,13 @@ public class Dictionary { if (morphData.isBlank()) { return Collections.emptyList(); } - return Arrays.stream(MORPH_KEY_PATTERN.split(morphData)) - .map(String::trim) - .filter(s -> !s.isBlank()) + return Arrays.stream(morphData.split("\\s+")) + .filter( + s -> + s.length() > 3 + && Character.isLetter(s.charAt(0)) + && Character.isLetter(s.charAt(1)) + && s.charAt(2) == ':') .collect(Collectors.toList()); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java index db0e3e4e837..04bd55840ce 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java @@ -156,7 +156,7 @@ public class Hunspell { length, originalCase, context, - (stem, formID, stemException) -> { + (stem, formID, morphDataId) -> { if (acceptsStem(formID)) { result[0] = new Root<>(stem, formID); } @@ -253,6 +253,24 @@ public class Hunspell { return cr1.toString().equalsIgnoreCase(cr2.toString()); } + /** + * Find all roots that could result in the given word after case conversion and adding affixes. + * This corresponds to the original {@code hunspell -s} (stemming) functionality. + * + *

<p>Some affix rules are relaxed in this stemming process: e.g. explicitly forbidden words are + * still returned. Some of the returned roots may be synthetic and not directly occur in the *.dic + * file (but differ from some existing entries in case). No roots are returned for compound words. + * + *
<p>
The returned roots may be used to retrieve morphological data via {@link + * Dictionary#lookupEntries}. + */ + public List getRoots(String word) { + return stemmer.stem(word).stream() + .map(CharsRef::toString) + .distinct() + .collect(Collectors.toList()); + } + private class CompoundPart { final CompoundPart prev; final int index, length; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 8afd9fc5d85..ceb47b2bd98 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -343,23 +343,28 @@ final class Stemmer { * @param stem the text of the found dictionary entry * @param formID internal id of the dictionary entry, e.g. to be used in {@link * Dictionary#hasFlag(int, char)} - * @param stemException "st:" morphological data if present, {@code null} otherwise + * @param morphDataId the id of the custom morphological data (0 if none), to be used with + * {@link Dictionary#morphData} * @return whether the processing should be continued */ - boolean processRoot(CharsRef stem, int formID, String stemException); + boolean processRoot(CharsRef stem, int formID, int morphDataId); } - private String stemException(IntsRef forms, int formIndex) { - if (dictionary.hasStemExceptions) { - int exceptionID = forms.ints[forms.offset + formIndex + 1]; - if (exceptionID > 0) { - return dictionary.getStemException(exceptionID); + private String stemException(int morphDataId) { + if (morphDataId > 0) { + String data = dictionary.morphData.get(morphDataId); + int start = data.startsWith("st:") ? 0 : data.indexOf(" st:"); + if (start >= 0) { + int nextSpace = data.indexOf(' ', start + 3); + return data.substring(start + 3, nextSpace < 0 ? 
data.length() : nextSpace); } } return null; } - private CharsRef newStem(CharsRef stem, String exception) { + private CharsRef newStem(CharsRef stem, int morphDataId) { + String exception = stemException(morphDataId); + if (dictionary.needsOutputCleaning) { scratchSegment.setLength(0); if (exception != null) { @@ -759,7 +764,8 @@ final class Stemmer { private boolean callProcessor( char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) { CharsRef stem = new CharsRef(word, offset, length); - return processor.processRoot(stem, forms.ints[forms.offset + i], stemException(forms, i)); + int morphDataId = dictionary.hasCustomMorphData ? forms.ints[forms.offset + i + 1] : 0; + return processor.processRoot(stem, forms.ints[forms.offset + i], morphDataId); } private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix, int prefixId) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java index d650c8e1a28..5ea1d334557 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/StemmerTestBase.java @@ -43,6 +43,11 @@ public abstract class StemmerTestBase extends LuceneTestCase { static void init(boolean ignoreCase, String affix, String... dictionaries) throws IOException, ParseException { + stemmer = new Stemmer(loadDictionary(ignoreCase, affix, dictionaries)); + } + + static Dictionary loadDictionary(boolean ignoreCase, String affix, String... 
dictionaries) + throws IOException, ParseException { if (dictionaries.length == 0) { throw new IllegalArgumentException("there must be at least one dictionary"); } @@ -52,7 +57,7 @@ public abstract class StemmerTestBase extends LuceneTestCase { throw new FileNotFoundException("file not found: " + affix); } - InputStream dictStreams[] = new InputStream[dictionaries.length]; + InputStream[] dictStreams = new InputStream[dictionaries.length]; for (int i = 0; i < dictionaries.length; i++) { dictStreams[i] = StemmerTestBase.class.getResourceAsStream(dictionaries[i]); if (dictStreams[i] == null) { @@ -61,14 +66,12 @@ public abstract class StemmerTestBase extends LuceneTestCase { } try { - Dictionary dictionary = - new Dictionary( - new ByteBuffersDirectory(), - "dictionary", - affixStream, - Arrays.asList(dictStreams), - ignoreCase); - stemmer = new Stemmer(dictionary); + return new Dictionary( + new ByteBuffersDirectory(), + "dictionary", + affixStream, + Arrays.asList(dictStreams), + ignoreCase); } finally { IOUtils.closeWhileHandlingException(affixStream); IOUtils.closeWhileHandlingException(dictStreams); @@ -80,7 +83,7 @@ public abstract class StemmerTestBase extends LuceneTestCase { Arrays.sort(expected); List stems = stemmer.stem(s); - String actual[] = new String[stems.size()]; + String[] actual = new String[stems.size()]; for (int i = 0; i < actual.length; i++) { actual[i] = stems.get(i).toString(); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java index f64c6d8d509..acef45bb4e1 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java @@ -205,6 +205,7 @@ public class TestAllDictionaries extends LuceneTestCase { + ("strips=" + RamUsageTester.humanSizeOf(dic.stripData) 
+ ", ") + ("conditions=" + RamUsageTester.humanSizeOf(dic.patterns) + ", ") + ("affixData=" + RamUsageTester.humanSizeOf(dic.affixData) + ", ") + + ("morphData=" + RamUsageTester.humanSizeOf(dic.morphData) + ", ") + ("prefixes=" + RamUsageTester.humanSizeOf(dic.prefixes) + ", ") + ("suffixes=" + RamUsageTester.humanSizeOf(dic.suffixes) + ")"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index 6ef783cb5a9..2cc0c8495ac 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -22,6 +22,10 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.text.ParseException; +import java.util.Arrays; +import java.util.Collections; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.util.CharsRef; @@ -244,6 +248,35 @@ public class TestDictionary extends LuceneTestCase { assertEquals(src, new String(strategy.parseFlags(asAscii))); } + @Test + public void testCustomMorphologicalData() throws IOException, ParseException { + Dictionary dic = loadDictionary("morphdata.aff", "morphdata.dic"); + assertNull(dic.lookupEntries("nonexistent")); + + DictEntries simpleNoun = dic.lookupEntries("simplenoun"); + assertEquals(1, simpleNoun.size()); + assertEquals(Collections.emptyList(), simpleNoun.getMorphologicalValues(0, "aa:")); + assertEquals(Collections.singletonList("42"), simpleNoun.getMorphologicalValues(0, "fr:")); + + DictEntries lay = dic.lookupEntries("lay"); + String actual = + IntStream.range(0, 3) + .mapToObj(lay::getMorphologicalData) + .sorted() + .collect(Collectors.joining("; ")); + 
assertEquals("is:past_2 po:verb st:lie; is:present po:verb; po:noun", actual); + + DictEntries sing = dic.lookupEntries("sing"); + assertEquals(1, sing.size()); + assertEquals(Arrays.asList("sang", "sung"), sing.getMorphologicalValues(0, "al:")); + + assertEquals( + "al:abaléar po:verbo ts:transitiva", + dic.lookupEntries("unsupported1").getMorphologicalData(0)); + + assertEquals("", dic.lookupEntries("unsupported2").getMorphologicalData(0)); + } + private Directory getDirectory() { return newDirectory(); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java index 44a62c3a54f..1aa0b140fca 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java @@ -16,7 +16,11 @@ */ package org.apache.lucene.analysis.hunspell; +import java.io.IOException; +import java.text.ParseException; +import java.util.Collections; import org.junit.BeforeClass; +import org.junit.Test; public class TestStemmer extends StemmerTestBase { @@ -58,6 +62,13 @@ public class TestStemmer extends StemmerTestBase { assertStemsTo("solr", "olr"); } + @Test + public void testHunspellStemmingApi() throws IOException, ParseException { + Hunspell hunspell = new Hunspell(loadDictionary(false, "simple.aff", "simple.dic")); + assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache")); + assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo")); + } + // some bogus stuff that should not stem (empty lists)! 
public void testBogusStems() { assertStemsTo("abs"); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/morphdata.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/morphdata.dic index 9b7cc9d50b8..6c5d789ff53 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/morphdata.dic +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/morphdata.dic @@ -1,6 +1,12 @@ -5 +11 feet/X st:foot work/A st:workverb work/B st:worknoun notspecial -simplenoun/A +simplenoun/A fr:42 +sing al:sang al:sung +lay po:verb st:lie is:past_2 +lay po:verb is:present +lay po:noun +unsupported1 po:verbo ts:transitiva / intransitiva / pronominal VOLG: t i pr al:abaléar +unsupported2 [CAT=nc,G=f,N=s] \ No newline at end of file