From 422c89baefda8e8e8a9a3f9b761785895b5e9bb5 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Tue, 19 Jan 2021 09:32:23 +0100 Subject: [PATCH] LUCENE-9676: Hunspell: improve stemming of all-caps words (#2217) Hunspell: improve stemming of all-caps words Repeat Hunspell's logic: * when encountering a mixed- or (inflectable) all-caps dictionary entry, add its title-case analog as a hidden entry * use that hidden entry for stemming case variants for title- and uppercase words, but don't consider it a valid word itself * ...unless there's another explicit dictionary entry of that title case --- lucene/CHANGES.txt | 3 +- .../lucene/analysis/hunspell/Dictionary.java | 212 +++++++++++------- .../lucene/analysis/hunspell/Stemmer.java | 92 +++----- .../lucene/analysis/hunspell/WordCase.java | 61 +++++ .../lucene/analysis/hunspell/TestAllCaps.java | 42 ++++ .../lucene/analysis/hunspell/TestEscaped.java | 2 +- .../lucene/analysis/hunspell/allcaps.aff | 5 + .../lucene/analysis/hunspell/allcaps.dic | 3 + .../lucene/analysis/hunspell/escaped.aff | 1 + 9 files changed, 283 insertions(+), 138 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f21e0829269..9dec819134b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -84,7 +84,8 @@ API Changes Improvements -* LUCENE-9665: Hunspell: support default encoding (Peter Gromov) +* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words + (Peter Gromov) * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). 
(Dawid Weiss) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 81b4beecf4e..34edb73c1a1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -73,6 +73,8 @@ public class Dictionary { static final char[] NOFLAGS = new char[0]; + private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell + private static final String ALIAS_KEY = "AF"; private static final String MORPH_ALIAS_KEY = "AM"; private static final String PREFIX_KEY = "PFX"; @@ -238,10 +240,9 @@ public class Dictionary { readAffixFile(aff2, decoder); // read dictionary entries - IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); - FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o); - readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler); - words = fstCompiler.compile(); + IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder); + String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted); + words = readSortedDictionaries(tempDir, sortedFile); aliases = null; // no longer needed morphAliases = null; // no longer needed success = true; @@ -791,25 +792,13 @@ public class Dictionary { } } - /** - * Reads the dictionary file through the provided InputStreams, building up the words map - * - * @param dictionaries InputStreams to read the dictionary file through - * @param decoder CharsetDecoder used to decode the contents of the file - * @throws IOException Can be thrown while reading from the file - */ - private void readDictionaryFiles( + private IndexOutput mergeDictionaries( Directory tempDir, String tempFileNamePrefix, List dictionaries, - CharsetDecoder decoder, - FSTCompiler words) + 
CharsetDecoder decoder) throws IOException { - BytesRefBuilder flagsScratch = new BytesRefBuilder(); - IntsRefBuilder scratchInts = new IntsRefBuilder(); - StringBuilder sb = new StringBuilder(); - IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT); try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) { for (InputStream dictionary : dictionaries) { @@ -833,32 +822,58 @@ public class Dictionary { hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null; } } - if (needsInputCleaning) { - int flagSep = line.indexOf(FLAG_SEPARATOR); - if (flagSep == -1) { - flagSep = line.indexOf(MORPH_SEPARATOR); - } - if (flagSep == -1) { - CharSequence cleansed = cleanInput(line, sb); - writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); - } else { - String text = line.substring(0, flagSep); - CharSequence cleansed = cleanInput(text, sb); - if (cleansed != sb) { - sb.setLength(0); - sb.append(cleansed); - } - sb.append(line.substring(flagSep)); - writer.write(sb.toString().getBytes(StandardCharsets.UTF_8)); - } - } else { - writer.write(line.getBytes(StandardCharsets.UTF_8)); - } + + writeNormalizedWordEntry(sb, writer, line); } } CodecUtil.writeFooter(unsorted); } + return unsorted; + } + private void writeNormalizedWordEntry( + StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException { + int flagSep = line.indexOf(FLAG_SEPARATOR); + int morphSep = line.indexOf(MORPH_SEPARATOR); + assert morphSep > 0; + assert morphSep > flagSep; + int sep = flagSep < 0 ? 
morphSep : flagSep; + + CharSequence toWrite; + if (needsInputCleaning) { + cleanInput(line, sep, reuse); + reuse.append(line, sep, line.length()); + toWrite = reuse; + } else { + toWrite = line; + } + + String written = toWrite.toString(); + sep = written.length() - (line.length() - sep); + writer.write(written.getBytes(StandardCharsets.UTF_8)); + + WordCase wordCase = WordCase.caseOf(written, sep); + if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) { + addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep)); + } + } + + private void addHiddenCapitalizedWord( + StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep) + throws IOException { + reuse.setLength(0); + reuse.append(Character.toUpperCase(word.charAt(0))); + for (int i = 1; i < word.length(); i++) { + reuse.append(caseFold(word.charAt(i))); + } + reuse.append(FLAG_SEPARATOR); + reuse.append(HIDDEN_FLAG); + reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length()); + writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8)); + } + + private String sortWordsOffline( + Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException { OfflineSorter sorter = new OfflineSorter( tempDir, @@ -908,8 +923,13 @@ public class Dictionary { IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName()); } } + return sorted; + } - boolean success2 = false; + private FST readSortedDictionaries(Directory tempDir, String sorted) throws IOException { + boolean success = false; + + EntryGrouper grouper = new EntryGrouper(); try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) { @@ -917,9 +937,6 @@ public class Dictionary { // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... 
but they must be parsed differently - String currentEntry = null; - IntsRefBuilder currentOrds = new IntsRefBuilder(); - while (true) { BytesRef scratch = reader.next(); if (scratch == null) { @@ -959,42 +976,15 @@ public class Dictionary { } } - int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry); - if (cmp < 0) { - throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); - } else { - encodeFlags(flagsScratch, wordForm); - int ord = flagLookup.add(flagsScratch.get()); - if (ord < 0) { - // already exists in our hash - ord = (-ord) - 1; - } - // finalize current entry, and switch "current" if necessary - if (cmp > 0 && currentEntry != null) { - Util.toUTF32(currentEntry, scratchInts); - words.add(scratchInts.get(), currentOrds.get()); - } - // swap current - if (cmp > 0) { - currentEntry = entry; - currentOrds = new IntsRefBuilder(); // must be this way - } - if (hasStemExceptions) { - currentOrds.append(ord); - currentOrds.append(stemExceptionID); - } else { - currentOrds.append(ord); - } - } + grouper.add(entry, wordForm, stemExceptionID); } // finalize last entry - assert currentEntry != null; - Util.toUTF32(currentEntry, scratchInts); - words.add(scratchInts.get(), currentOrds.get()); - success2 = true; + grouper.flushGroup(); + success = true; + return grouper.words.compile(); } finally { - if (success2) { + if (success) { tempDir.deleteFile(sorted); } else { IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted); @@ -1002,6 +992,72 @@ public class Dictionary { } } + private class EntryGrouper { + final FSTCompiler words = + new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton()); + private final List group = new ArrayList<>(); + private final List stemExceptionIDs = new ArrayList<>(); + private final BytesRefBuilder flagsScratch = new BytesRefBuilder(); + private final IntsRefBuilder scratchInts = new IntsRefBuilder(); + private String currentEntry = null; + + void add(String entry, char[] flags, 
int stemExceptionID) throws IOException { + if (!entry.equals(currentEntry)) { + if (currentEntry != null) { + if (entry.compareTo(currentEntry) < 0) { + throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); + } + flushGroup(); + } + currentEntry = entry; + } + + group.add(flags); + if (hasStemExceptions) { + stemExceptionIDs.add(stemExceptionID); + } + } + + void flushGroup() throws IOException { + IntsRefBuilder currentOrds = new IntsRefBuilder(); + + boolean hasNonHidden = false; + for (char[] flags : group) { + if (!hasHiddenFlag(flags)) { + hasNonHidden = true; + break; + } + } + + for (int i = 0; i < group.size(); i++) { + char[] flags = group.get(i); + if (hasNonHidden && hasHiddenFlag(flags)) { + continue; + } + + encodeFlags(flagsScratch, flags); + int ord = flagLookup.add(flagsScratch.get()); + if (ord < 0) { + ord = -ord - 1; // already exists in our hash + } + currentOrds.append(ord); + if (hasStemExceptions) { + currentOrds.append(stemExceptionIDs.get(i)); + } + } + + Util.toUTF32(currentEntry, scratchInts); + words.add(scratchInts.get(), currentOrds.get()); + + group.clear(); + stemExceptionIDs.clear(); + } + } + + static boolean hasHiddenFlag(char[] flags) { + return hasFlag(flags, HIDDEN_FLAG); + } + static char[] decodeFlags(BytesRef b) { if (b.length == 0) { return CharsRef.EMPTY_CHARS; @@ -1191,9 +1247,13 @@ public class Dictionary { } CharSequence cleanInput(CharSequence input, StringBuilder reuse) { + return cleanInput(input, input.length(), reuse); + } + + private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) { reuse.setLength(0); - for (int i = 0; i < input.length(); i++) { + for (int i = 0; i < prefixLength; i++) { char ch = input.charAt(i); if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 7b50da26323..413e570f2a1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -95,57 +95,30 @@ final class Stemmer { word = scratchBuffer; } - int caseType = caseOf(word, length); - if (caseType == UPPER_CASE) { - // upper: union exact, title, lower + WordCase wordCase = caseOf(word, length); + List list = doStem(word, length, false); + if (wordCase == WordCase.UPPER) { caseFoldTitle(word, length); - caseFoldLower(titleBuffer, length); - List list = doStem(word, length, false); list.addAll(doStem(titleBuffer, length, true)); - list.addAll(doStem(lowerBuffer, length, true)); - return list; - } else if (caseType == TITLE_CASE) { - // title: union exact, lower - caseFoldLower(word, length); - List list = doStem(word, length, false); - list.addAll(doStem(lowerBuffer, length, true)); - return list; - } else { - // exact match only - return doStem(word, length, false); } + if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) { + caseFoldLower(wordCase == WordCase.UPPER ? 
titleBuffer : word, length); + list.addAll(doStem(lowerBuffer, length, true)); + } + return list; } // temporary buffers for case variants private char[] lowerBuffer = new char[8]; private char[] titleBuffer = new char[8]; - private static final int EXACT_CASE = 0; - private static final int TITLE_CASE = 1; - private static final int UPPER_CASE = 2; - /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */ - private int caseOf(char[] word, int length) { + private WordCase caseOf(char[] word, int length) { if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) { - return EXACT_CASE; + return WordCase.MIXED; } - // determine if we are title or lowercase (or something funky, in which it's exact) - boolean seenUpper = false; - boolean seenLower = false; - for (int i = 1; i < length; i++) { - boolean v = Character.isUpperCase(word[i]); - seenUpper |= v; - seenLower |= !v; - } - - if (!seenLower) { - return UPPER_CASE; - } else if (!seenUpper) { - return TITLE_CASE; - } else { - return EXACT_CASE; - } + return WordCase.caseOf(word, length); } /** folds titlecase variant of word to titleBuffer */ @@ -169,25 +142,20 @@ final class Stemmer { IntsRef forms = dictionary.lookupWord(word, 0, length); if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { - boolean checkKeepCase = caseVariant && dictionary.keepcase != -1; - boolean checkNeedAffix = dictionary.needaffix != -1; - boolean checkOnlyInCompound = dictionary.onlyincompound != -1; - if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) { - dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch); - char[] wordFlags = Dictionary.decodeFlags(scratch); - // we are looking for a case variant, but this word does not allow it - if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) { - continue; - } - // we can't add this form, it's a pseudostem requiring an affix - if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) 
dictionary.needaffix)) { - continue; - } - // we can't add this form, it only belongs inside a compound word - if (checkOnlyInCompound - && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) { - continue; - } + dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch); + char[] wordFlags = Dictionary.decodeFlags(scratch); + if (!acceptCase(caseVariant, wordFlags)) { + continue; + } + // we can't add this form, it's a pseudostem requiring an affix + if (dictionary.needaffix != -1 + && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) { + continue; + } + // we can't add this form, it only belongs inside a compound word + if (dictionary.onlyincompound != -1 + && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) { + continue; } stems.add(newStem(word, length, forms, i)); } @@ -200,6 +168,12 @@ final class Stemmer { return stems; } + private boolean acceptCase(boolean caseVariant, char[] wordFlags) { + return caseVariant + ? dictionary.keepcase == -1 || !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase) + : !Dictionary.hasHiddenFlag(wordFlags); + } + /** * Find the unique stem(s) of the provided word * @@ -595,9 +569,7 @@ final class Stemmer { } // we are looking for a case variant, but this word does not allow it - if (caseVariant - && dictionary.keepcase != -1 - && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) { + if (!acceptCase(caseVariant, wordFlags)) { continue; } // we aren't decompounding (yet) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java new file mode 100644 index 00000000000..7d9e2e75873 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +enum WordCase { + UPPER, + TITLE, + LOWER, + MIXED; + + static WordCase caseOf(char[] word, int length) { + boolean capitalized = Character.isUpperCase(word[0]); + + boolean seenUpper = false; + boolean seenLower = false; + for (int i = 1; i < length; i++) { + char ch = word[i]; + seenUpper = seenUpper || Character.isUpperCase(ch); + seenLower = seenLower || Character.isLowerCase(ch); + if (seenUpper && seenLower) break; + } + + return get(capitalized, seenUpper, seenLower); + } + + static WordCase caseOf(CharSequence word, int length) { + boolean capitalized = Character.isUpperCase(word.charAt(0)); + + boolean seenUpper = false; + boolean seenLower = false; + for (int i = 1; i < length; i++) { + char ch = word.charAt(i); + seenUpper = seenUpper || Character.isUpperCase(ch); + seenLower = seenLower || Character.isLowerCase(ch); + if (seenUpper && seenLower) break; + } + + return get(capitalized, seenUpper, seenLower); + } + + private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) { + if (capitalized) { + return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED; + } + return seenUpper ? 
MIXED : LOWER; + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java new file mode 100644 index 00000000000..43c67644b3e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.analysis.hunspell; + +import org.junit.BeforeClass; + +public class TestAllCaps extends StemmerTestBase { + + @BeforeClass + public static void beforeClass() throws Exception { + init("allcaps.aff", "allcaps.dic"); + } + + public void testGood() { + assertStemsTo("OpenOffice.org", "OpenOffice.org"); + assertStemsTo("UNICEF's", "UNICEF"); + + // Hunspell returns these title-cased stems, so for consistency we do, too + assertStemsTo("OPENOFFICE.ORG", "Openoffice.org"); + assertStemsTo("UNICEF'S", "Unicef"); + } + + public void testWrong() { + assertStemsTo("Openoffice.org"); + assertStemsTo("Unicef"); + assertStemsTo("Unicef's"); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java index a5e9fe1d0a0..3038385665f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestEscaped.java @@ -27,7 +27,7 @@ public class TestEscaped extends StemmerTestBase { public void testStemming() { assertStemsTo("works", "work"); assertStemsTo("work", "work"); - assertStemsTo("R2/D2", "R2/D2"); + assertStemsTo("R2/D2", "R2/D2", "R2/d2"); assertStemsTo("R2/D2s", "R2/D2"); assertStemsTo("N/A", "N/A"); assertStemsTo("N/As"); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff new file mode 100644 index 00000000000..57e916bf537 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff @@ -0,0 +1,5 @@ +# check uppercase forms of allcaps word + affix and words with mixed casing +WORDCHARS '. + +SFX S N 1 +SFX S 0 's . 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic new file mode 100644 index 00000000000..7d3cdcc0469 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.dic @@ -0,0 +1,3 @@ +2 +OpenOffice.org +UNICEF/S diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff index b42845175e2..a66ee3695f5 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/escaped.aff @@ -1,4 +1,5 @@ SET UTF-8 +WORDCHARS \/0123456789 SFX A Y 1 SFX A 0 s . +PLUR