LUCENE-9676: Hunspell: improve stemming of all-caps words (#2217)

Hunspell: improve stemming of all-caps words

Repeat Hunspell's logic:
* when encountering a mixed- or (inflectable) all-caps dictionary entry, add its title-case analog as a hidden entry
* use that hidden entry for stemming case variants for title- and uppercase words, but don't consider it a valid word itself
* ...unless there's another explicit dictionary entry of that title case
This commit is contained in:
Peter Gromov 2021-01-19 09:32:23 +01:00 committed by GitHub
parent c1ae6dc07c
commit 422c89baef
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 283 additions and 138 deletions

View File

@ -84,7 +84,8 @@ API Changes
Improvements
* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
(Peter Gromov)
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)

View File

@ -73,6 +73,8 @@ public class Dictionary {
static final char[] NOFLAGS = new char[0];
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
private static final String ALIAS_KEY = "AF";
private static final String MORPH_ALIAS_KEY = "AM";
private static final String PREFIX_KEY = "PFX";
@ -238,10 +240,9 @@ public class Dictionary {
readAffixFile(aff2, decoder);
// read dictionary entries
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
words = fstCompiler.compile();
IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
words = readSortedDictionaries(tempDir, sortedFile);
aliases = null; // no longer needed
morphAliases = null; // no longer needed
success = true;
@ -791,25 +792,13 @@ public class Dictionary {
}
}
/**
* Reads the dictionary file through the provided InputStreams, building up the words map
*
* @param dictionaries InputStreams to read the dictionary file through
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
private void readDictionaryFiles(
private IndexOutput mergeDictionaries(
Directory tempDir,
String tempFileNamePrefix,
List<InputStream> dictionaries,
CharsetDecoder decoder,
FSTCompiler<IntsRef> words)
CharsetDecoder decoder)
throws IOException {
BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
StringBuilder sb = new StringBuilder();
IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
for (InputStream dictionary : dictionaries) {
@ -833,32 +822,58 @@ public class Dictionary {
hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
}
}
if (needsInputCleaning) {
int flagSep = line.indexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
flagSep = line.indexOf(MORPH_SEPARATOR);
}
if (flagSep == -1) {
CharSequence cleansed = cleanInput(line, sb);
writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
} else {
String text = line.substring(0, flagSep);
CharSequence cleansed = cleanInput(text, sb);
if (cleansed != sb) {
sb.setLength(0);
sb.append(cleansed);
}
sb.append(line.substring(flagSep));
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
}
} else {
writer.write(line.getBytes(StandardCharsets.UTF_8));
}
writeNormalizedWordEntry(sb, writer, line);
}
}
CodecUtil.writeFooter(unsorted);
}
return unsorted;
}
/**
 * Normalizes one raw .dic line and writes it to the offline-sort writer.
 *
 * <p>Only the word part (everything before the flag/morph separator) is cleaned; the
 * flags and morphological data are carried over unchanged. For mixed-case words — and
 * all-caps words that carry flags — an additional hidden title-case entry is written
 * (see {@code addHiddenCapitalizedWord}), mirroring Hunspell's ONLYUPCASEFLAG handling.
 *
 * @param reuse scratch buffer, clobbered by this method
 * @param writer destination for the entry bytes (UTF-8)
 * @param line a single raw dictionary line; must contain MORPH_SEPARATOR
 * @throws IOException if the writer fails
 */
private void writeNormalizedWordEntry(
StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
int flagSep = line.indexOf(FLAG_SEPARATOR);
int morphSep = line.indexOf(MORPH_SEPARATOR);
assert morphSep > 0;
assert morphSep > flagSep;
// the word part ends at the flag separator if present, else at the morph separator
int sep = flagSep < 0 ? morphSep : flagSep;
CharSequence toWrite;
if (needsInputCleaning) {
// clean only the word prefix, then append flags/morph data verbatim
cleanInput(line, sep, reuse);
reuse.append(line, sep, line.length());
toWrite = reuse;
} else {
toWrite = line;
}
String written = toWrite.toString();
// cleaning may change the word's length; recompute the separator position
sep = written.length() - (line.length() - sep);
writer.write(written.getBytes(StandardCharsets.UTF_8));
WordCase wordCase = WordCase.caseOf(written, sep);
// hidden title-case analog for mixed-case words, or all-caps words with flags
// (note precedence: MIXED || (UPPER && flagSep > 0))
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
}
}
/**
 * Writes an extra, hidden dictionary entry: the title-case analog of {@code word},
 * marked with HIDDEN_FLAG so it can stem case variants of the word without being
 * accepted as a valid word itself.
 *
 * @param reuse scratch buffer, clobbered by this method
 * @param writer destination for the entry bytes (UTF-8)
 * @param word the word part of the original entry
 * @param afterSep the original entry's remainder, starting at the separator
 * @throws IOException if the writer fails
 */
private void addHiddenCapitalizedWord(
StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
throws IOException {
reuse.setLength(0);
// title-case: upper-case the first character, case-fold the remainder
reuse.append(Character.toUpperCase(word.charAt(0)));
int i = 1;
while (i < word.length()) {
reuse.append(caseFold(word.charAt(i)));
i++;
}
// mark the entry as hidden, then carry over the original flags/morph data
reuse.append(FLAG_SEPARATOR).append(HIDDEN_FLAG);
int from = afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0;
reuse.append(afterSep, from, afterSep.length());
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
}
private String sortWordsOffline(
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
OfflineSorter sorter =
new OfflineSorter(
tempDir,
@ -908,8 +923,13 @@ public class Dictionary {
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
}
}
return sorted;
}
boolean success2 = false;
private FST<IntsRef> readSortedDictionaries(Directory tempDir, String sorted) throws IOException {
boolean success = false;
EntryGrouper grouper = new EntryGrouper();
try (ByteSequencesReader reader =
new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@ -917,9 +937,6 @@ public class Dictionary {
// TODO: the flags themselves can be double-chars (long) or also numeric
// either way the trick is to encode them as char... but they must be parsed differently
String currentEntry = null;
IntsRefBuilder currentOrds = new IntsRefBuilder();
while (true) {
BytesRef scratch = reader.next();
if (scratch == null) {
@ -959,42 +976,15 @@ public class Dictionary {
}
}
int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
if (cmp < 0) {
throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
} else {
encodeFlags(flagsScratch, wordForm);
int ord = flagLookup.add(flagsScratch.get());
if (ord < 0) {
// already exists in our hash
ord = (-ord) - 1;
}
// finalize current entry, and switch "current" if necessary
if (cmp > 0 && currentEntry != null) {
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
}
// swap current
if (cmp > 0) {
currentEntry = entry;
currentOrds = new IntsRefBuilder(); // must be this way
}
if (hasStemExceptions) {
currentOrds.append(ord);
currentOrds.append(stemExceptionID);
} else {
currentOrds.append(ord);
}
}
grouper.add(entry, wordForm, stemExceptionID);
}
// finalize last entry
assert currentEntry != null;
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
success2 = true;
grouper.flushGroup();
success = true;
return grouper.words.compile();
} finally {
if (success2) {
if (success) {
tempDir.deleteFile(sorted);
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
@ -1002,6 +992,72 @@ public class Dictionary {
}
}
/**
 * Groups consecutive identical sorted entries and, when the entry changes, emits one
 * FST arc for the previous word carrying all of its flag-set ordinals (plus stem
 * exception ids when the dictionary has them). Input must arrive in sorted order.
 */
private class EntryGrouper {
final FSTCompiler<IntsRef> words =
new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
// flag sets collected for the current word, one per homonym line
private final List<char[]> group = new ArrayList<>();
// parallel to 'group'; only populated when hasStemExceptions
private final List<Integer> stemExceptionIDs = new ArrayList<>();
private final BytesRefBuilder flagsScratch = new BytesRefBuilder();
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private String currentEntry = null;

/**
 * Accepts the next sorted (entry, flags) pair; flushes the previous group when the
 * entry text changes.
 *
 * @throws IllegalArgumentException if entries arrive out of sorted order
 */
void add(String entry, char[] flags, int stemExceptionID) throws IOException {
if (!entry.equals(currentEntry)) {
if (currentEntry != null) {
if (entry.compareTo(currentEntry) < 0) {
throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
}
flushGroup();
}
currentEntry = entry;
}
group.add(flags);
if (hasStemExceptions) {
stemExceptionIDs.add(stemExceptionID);
}
}

/** Writes the accumulated group for {@code currentEntry} into the FST and resets it. */
void flushGroup() throws IOException {
IntsRefBuilder currentOrds = new IntsRefBuilder();

// If any flag set lacks HIDDEN_FLAG, an explicit entry of this form exists and
// the synthetic hidden (title-case analog) entries are dropped.
boolean hasNonHidden = false;
for (char[] flags : group) {
if (!hasHiddenFlag(flags)) {
hasNonHidden = true;
break;
}
}

for (int i = 0; i < group.size(); i++) {
char[] flags = group.get(i);
if (hasNonHidden && hasHiddenFlag(flags)) {
continue;
}

encodeFlags(flagsScratch, flags);
int ord = flagLookup.add(flagsScratch.get());
if (ord < 0) {
ord = -ord - 1; // already exists in our hash
}

currentOrds.append(ord);
if (hasStemExceptions) {
currentOrds.append(stemExceptionIDs.get(i));
}
}

Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());

group.clear();
stemExceptionIDs.clear();
}
}
/**
 * Returns true if {@code flags} contains HIDDEN_FLAG, i.e. this form is a synthetic
 * title-case analog of a dictionary entry rather than an explicit word.
 */
static boolean hasHiddenFlag(char[] flags) {
return hasFlag(flags, HIDDEN_FLAG);
}
static char[] decodeFlags(BytesRef b) {
if (b.length == 0) {
return CharsRef.EMPTY_CHARS;
@ -1191,9 +1247,13 @@ public class Dictionary {
}
/** Cleans the whole of {@code input}; delegates to the prefix-limited private overload. */
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
return cleanInput(input, input.length(), reuse);
}
private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
reuse.setLength(0);
for (int i = 0; i < input.length(); i++) {
for (int i = 0; i < prefixLength; i++) {
char ch = input.charAt(i);
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {

View File

@ -95,57 +95,30 @@ final class Stemmer {
word = scratchBuffer;
}
int caseType = caseOf(word, length);
if (caseType == UPPER_CASE) {
// upper: union exact, title, lower
WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, length, false);
if (wordCase == WordCase.UPPER) {
caseFoldTitle(word, length);
caseFoldLower(titleBuffer, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(titleBuffer, length, true));
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else if (caseType == TITLE_CASE) {
// title: union exact, lower
caseFoldLower(word, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else {
// exact match only
return doStem(word, length, false);
}
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
list.addAll(doStem(lowerBuffer, length, true));
}
return list;
}
// temporary buffers for case variants
private char[] lowerBuffer = new char[8];
private char[] titleBuffer = new char[8];
private static final int EXACT_CASE = 0;
private static final int TITLE_CASE = 1;
private static final int UPPER_CASE = 2;
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
private int caseOf(char[] word, int length) {
private WordCase caseOf(char[] word, int length) {
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
return EXACT_CASE;
return WordCase.MIXED;
}
// determine if we are title or lowercase (or something funky, in which it's exact)
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
boolean v = Character.isUpperCase(word[i]);
seenUpper |= v;
seenLower |= !v;
}
if (!seenLower) {
return UPPER_CASE;
} else if (!seenUpper) {
return TITLE_CASE;
} else {
return EXACT_CASE;
}
return WordCase.caseOf(word, length);
}
/** folds titlecase variant of word to titleBuffer */
@ -169,25 +142,20 @@ final class Stemmer {
IntsRef forms = dictionary.lookupWord(word, 0, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
boolean checkNeedAffix = dictionary.needaffix != -1;
boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
// we are looking for a case variant, but this word does not allow it
if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (checkOnlyInCompound
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
continue;
}
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
if (!acceptCase(caseVariant, wordFlags)) {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
if (dictionary.needaffix != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (dictionary.onlyincompound != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
continue;
}
stems.add(newStem(word, length, forms, i));
}
@ -200,6 +168,12 @@ final class Stemmer {
return stems;
}
/**
 * Whether a dictionary form may be used as a stem for the current lookup: a case
 * variant must not carry the keepcase flag, while an exact-case lookup rejects
 * hidden entries (title-case analogs), which are not valid words themselves.
 */
private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
return caseVariant
? dictionary.keepcase == -1 || !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)
: !Dictionary.hasHiddenFlag(wordFlags);
}
/**
* Find the unique stem(s) of the provided word
*
@ -595,9 +569,7 @@ final class Stemmer {
}
// we are looking for a case variant, but this word does not allow it
if (caseVariant
&& dictionary.keepcase != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
if (!acceptCase(caseVariant, wordFlags)) {
continue;
}
// we aren't decompounding (yet)

View File

@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
/**
 * The case pattern of a word: fully uppercase, title case (initial capital only),
 * fully lowercase, or anything else ({@link #MIXED}).
 */
enum WordCase {
  UPPER,
  TITLE,
  LOWER,
  MIXED;

  /**
   * Determines the case pattern of the first {@code length} chars of {@code word}.
   *
   * <p>Delegates to the {@link CharSequence} overload via a read-only wrapper to avoid
   * duplicating the scan logic.
   */
  static WordCase caseOf(char[] word, int length) {
    return caseOf(java.nio.CharBuffer.wrap(word, 0, length), length);
  }

  /** Determines the case pattern of the first {@code length} chars of {@code word}. */
  static WordCase caseOf(CharSequence word, int length) {
    boolean capitalized = Character.isUpperCase(word.charAt(0));

    boolean seenUpper = false;
    boolean seenLower = false;
    for (int i = 1; i < length; i++) {
      char ch = word.charAt(i);
      seenUpper = seenUpper || Character.isUpperCase(ch);
      seenLower = seenLower || Character.isLowerCase(ch);
      // once both cases were seen the outcome can no longer change; stop scanning
      if (seenUpper && seenLower) break;
    }

    return get(capitalized, seenUpper, seenLower);
  }

  /** Maps the scan results to the enum constant; a capitalized single char is UPPER. */
  private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
    if (capitalized) {
      return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
    }
    return seenUpper ? MIXED : LOWER;
  }
}

View File

@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import org.junit.BeforeClass;
/** Tests stemming of all-caps words (LUCENE-9676) against allcaps.aff / allcaps.dic. */
public class TestAllCaps extends StemmerTestBase {
@BeforeClass
public static void beforeClass() throws Exception {
init("allcaps.aff", "allcaps.dic");
}

// valid inputs: exact-case entries and uppercase variants stemmed via hidden entries
public void testGood() {
assertStemsTo("OpenOffice.org", "OpenOffice.org");
assertStemsTo("UNICEF's", "UNICEF");
// Hunspell returns these title-cased stems, so for consistency we do, too
assertStemsTo("OPENOFFICE.ORG", "Openoffice.org");
assertStemsTo("UNICEF'S", "Unicef");
}

// the hidden title-case analogs must not be accepted as words themselves
public void testWrong() {
assertStemsTo("Openoffice.org");
assertStemsTo("Unicef");
assertStemsTo("Unicef's");
}
}

View File

@ -27,7 +27,7 @@ public class TestEscaped extends StemmerTestBase {
public void testStemming() {
assertStemsTo("works", "work");
assertStemsTo("work", "work");
assertStemsTo("R2/D2", "R2/D2");
assertStemsTo("R2/D2", "R2/D2", "R2/d2");
assertStemsTo("R2/D2s", "R2/D2");
assertStemsTo("N/A", "N/A");
assertStemsTo("N/As");

View File

@ -0,0 +1,5 @@
# check uppercase forms of allcaps word + affix and words with mixed casing
WORDCHARS '.
SFX S N 1
SFX S 0 's .

View File

@ -0,0 +1,3 @@
2
OpenOffice.org
UNICEF/S

View File

@ -1,4 +1,5 @@
SET UTF-8
WORDCHARS \/0123456789
SFX A Y 1
SFX A 0 s . +PLUR