mirror of https://github.com/apache/lucene.git
LUCENE-9676: Hunspell: improve stemming of all-caps words (#2217)
Hunspell: improve stemming of all-caps words. Repeat Hunspell's logic:

* when encountering a mixed-case or (inflectable) all-caps dictionary entry, add its title-case analog as a hidden entry
* use that hidden entry for stemming case variants of title- and uppercase words, but don't consider it a valid word itself
* ...unless there's another explicit dictionary entry of that title case
This commit is contained in:
parent c1ae6dc07c
commit 422c89baef
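Expressed as code, the hidden-entry idea from the commit message looks roughly like the sketch below. It is reconstructed from the addHiddenCapitalizedWord hunk further down and simplified for illustration: the helper name and String return value are invented here, and plain Character.toLowerCase stands in for the dictionary's caseFold.

static String hiddenTitleCaseEntry(String word, String flags, char flagSeparator, char hiddenFlag) {
  StringBuilder sb = new StringBuilder();
  sb.append(Character.toUpperCase(word.charAt(0)));   // keep the first letter upper-cased
  for (int i = 1; i < word.length(); i++) {
    sb.append(Character.toLowerCase(word.charAt(i))); // fold the rest, e.g. "UNICEF" -> "Unicef"
  }
  sb.append(flagSeparator).append(hiddenFlag);        // HIDDEN_FLAG, called 'ONLYUPCASEFLAG' in Hunspell
  sb.append(flags);                                   // the original affix flags are kept, e.g. "S"
  return sb.toString();
}

The resulting entry can stem case variants such as "UNICEF'S", but is rejected as a standalone word unless the title-case form also appears explicitly in the .dic file.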
@@ -84,7 +84,8 @@ API Changes

Improvements

* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
(Peter Gromov)

* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)
@@ -73,6 +73,8 @@ public class Dictionary {

static final char[] NOFLAGS = new char[0];

private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell

private static final String ALIAS_KEY = "AF";
private static final String MORPH_ALIAS_KEY = "AM";
private static final String PREFIX_KEY = "PFX";
@@ -238,10 +240,9 @@ public class Dictionary {
readAffixFile(aff2, decoder);

// read dictionary entries
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
words = fstCompiler.compile();
IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
words = readSortedDictionaries(tempDir, sortedFile);
aliases = null; // no longer needed
morphAliases = null; // no longer needed
success = true;
@@ -791,25 +792,13 @@ public class Dictionary {
}
}

/**
* Reads the dictionary file through the provided InputStreams, building up the words map
*
* @param dictionaries InputStreams to read the dictionary file through
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
private void readDictionaryFiles(
private IndexOutput mergeDictionaries(
Directory tempDir,
String tempFileNamePrefix,
List<InputStream> dictionaries,
CharsetDecoder decoder,
FSTCompiler<IntsRef> words)
CharsetDecoder decoder)
throws IOException {
BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();

StringBuilder sb = new StringBuilder();

IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
for (InputStream dictionary : dictionaries) {
@@ -833,32 +822,58 @@ public class Dictionary {
hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
}
}
if (needsInputCleaning) {
int flagSep = line.indexOf(FLAG_SEPARATOR);
if (flagSep == -1) {
flagSep = line.indexOf(MORPH_SEPARATOR);
}
if (flagSep == -1) {
CharSequence cleansed = cleanInput(line, sb);
writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
} else {
String text = line.substring(0, flagSep);
CharSequence cleansed = cleanInput(text, sb);
if (cleansed != sb) {
sb.setLength(0);
sb.append(cleansed);
}
sb.append(line.substring(flagSep));
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
}
} else {
writer.write(line.getBytes(StandardCharsets.UTF_8));
}

writeNormalizedWordEntry(sb, writer, line);
}
}
CodecUtil.writeFooter(unsorted);
}
return unsorted;
}

private void writeNormalizedWordEntry(
StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
int flagSep = line.indexOf(FLAG_SEPARATOR);
int morphSep = line.indexOf(MORPH_SEPARATOR);
assert morphSep > 0;
assert morphSep > flagSep;
int sep = flagSep < 0 ? morphSep : flagSep;

CharSequence toWrite;
if (needsInputCleaning) {
cleanInput(line, sep, reuse);
reuse.append(line, sep, line.length());
toWrite = reuse;
} else {
toWrite = line;
}

String written = toWrite.toString();
sep = written.length() - (line.length() - sep);
writer.write(written.getBytes(StandardCharsets.UTF_8));

WordCase wordCase = WordCase.caseOf(written, sep);
if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
}
}

private void addHiddenCapitalizedWord(
StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
throws IOException {
reuse.setLength(0);
reuse.append(Character.toUpperCase(word.charAt(0)));
for (int i = 1; i < word.length(); i++) {
reuse.append(caseFold(word.charAt(i)));
}
reuse.append(FLAG_SEPARATOR);
reuse.append(HIDDEN_FLAG);
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
}

private String sortWordsOffline(
Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
OfflineSorter sorter =
new OfflineSorter(
tempDir,
@@ -908,8 +923,13 @@ public class Dictionary {
IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
}
}
return sorted;
}

boolean success2 = false;
private FST<IntsRef> readSortedDictionaries(Directory tempDir, String sorted) throws IOException {
boolean success = false;

EntryGrouper grouper = new EntryGrouper();

try (ByteSequencesReader reader =
new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@@ -917,9 +937,6 @@ public class Dictionary {
// TODO: the flags themselves can be double-chars (long) or also numeric
// either way the trick is to encode them as char... but they must be parsed differently

String currentEntry = null;
IntsRefBuilder currentOrds = new IntsRefBuilder();

while (true) {
BytesRef scratch = reader.next();
if (scratch == null) {
@@ -959,42 +976,15 @@ public class Dictionary {
}
}

int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
if (cmp < 0) {
throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
} else {
encodeFlags(flagsScratch, wordForm);
int ord = flagLookup.add(flagsScratch.get());
if (ord < 0) {
// already exists in our hash
ord = (-ord) - 1;
}
// finalize current entry, and switch "current" if necessary
if (cmp > 0 && currentEntry != null) {
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
}
// swap current
if (cmp > 0) {
currentEntry = entry;
currentOrds = new IntsRefBuilder(); // must be this way
}
if (hasStemExceptions) {
currentOrds.append(ord);
currentOrds.append(stemExceptionID);
} else {
currentOrds.append(ord);
}
}
grouper.add(entry, wordForm, stemExceptionID);
}

// finalize last entry
assert currentEntry != null;
Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());
success2 = true;
grouper.flushGroup();
success = true;
return grouper.words.compile();
} finally {
if (success2) {
if (success) {
tempDir.deleteFile(sorted);
} else {
IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
@@ -1002,6 +992,72 @@ public class Dictionary {
}
}

private class EntryGrouper {
final FSTCompiler<IntsRef> words =
new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
private final List<char[]> group = new ArrayList<>();
private final List<Integer> stemExceptionIDs = new ArrayList<>();
private final BytesRefBuilder flagsScratch = new BytesRefBuilder();
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private String currentEntry = null;

void add(String entry, char[] flags, int stemExceptionID) throws IOException {
if (!entry.equals(currentEntry)) {
if (currentEntry != null) {
if (entry.compareTo(currentEntry) < 0) {
throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
}
flushGroup();
}
currentEntry = entry;
}

group.add(flags);
if (hasStemExceptions) {
stemExceptionIDs.add(stemExceptionID);
}
}

void flushGroup() throws IOException {
IntsRefBuilder currentOrds = new IntsRefBuilder();

boolean hasNonHidden = false;
for (char[] flags : group) {
if (!hasHiddenFlag(flags)) {
hasNonHidden = true;
break;
}
}

for (int i = 0; i < group.size(); i++) {
char[] flags = group.get(i);
if (hasNonHidden && hasHiddenFlag(flags)) {
continue;
}

encodeFlags(flagsScratch, flags);
int ord = flagLookup.add(flagsScratch.get());
if (ord < 0) {
ord = -ord - 1; // already exists in our hash
}
currentOrds.append(ord);
if (hasStemExceptions) {
currentOrds.append(stemExceptionIDs.get(i));
}
}

Util.toUTF32(currentEntry, scratchInts);
words.add(scratchInts.get(), currentOrds.get());

group.clear();
stemExceptionIDs.clear();
}
}

static boolean hasHiddenFlag(char[] flags) {
return hasFlag(flags, HIDDEN_FLAG);
}

static char[] decodeFlags(BytesRef b) {
if (b.length == 0) {
return CharsRef.EMPTY_CHARS;
@@ -1191,9 +1247,13 @@ public class Dictionary {
}

CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
return cleanInput(input, input.length(), reuse);
}

private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
reuse.setLength(0);

for (int i = 0; i < input.length(); i++) {
for (int i = 0; i < prefixLength; i++) {
char ch = input.charAt(i);

if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
@@ -95,57 +95,30 @@ final class Stemmer {
word = scratchBuffer;
}

int caseType = caseOf(word, length);
if (caseType == UPPER_CASE) {
// upper: union exact, title, lower
WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, length, false);
if (wordCase == WordCase.UPPER) {
caseFoldTitle(word, length);
caseFoldLower(titleBuffer, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(titleBuffer, length, true));
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else if (caseType == TITLE_CASE) {
// title: union exact, lower
caseFoldLower(word, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else {
// exact match only
return doStem(word, length, false);
}
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
list.addAll(doStem(lowerBuffer, length, true));
}
return list;
}

// temporary buffers for case variants
private char[] lowerBuffer = new char[8];
private char[] titleBuffer = new char[8];

private static final int EXACT_CASE = 0;
private static final int TITLE_CASE = 1;
private static final int UPPER_CASE = 2;

/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
private int caseOf(char[] word, int length) {
private WordCase caseOf(char[] word, int length) {
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
return EXACT_CASE;
return WordCase.MIXED;
}

// determine if we are title or lowercase (or something funky, in which it's exact)
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
boolean v = Character.isUpperCase(word[i]);
seenUpper |= v;
seenLower |= !v;
}

if (!seenLower) {
return UPPER_CASE;
} else if (!seenUpper) {
return TITLE_CASE;
} else {
return EXACT_CASE;
}
return WordCase.caseOf(word, length);
}

/** folds titlecase variant of word to titleBuffer */
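Because the hunk above interleaves the removed and added lines, here is a consolidated reading of the new stem() flow after the change. It is reconstructed from the added lines only, so the exact brace placement is inferred; doStem, caseFoldTitle, caseFoldLower, titleBuffer and lowerBuffer are the existing Stemmer members shown in the diff.

WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, length, false);           // always try the exact form
if (wordCase == WordCase.UPPER) {
  caseFoldTitle(word, length);                                // e.g. "UNICEF" -> "Unicef"
  list.addAll(doStem(titleBuffer, length, true));             // may match a hidden title-case entry
}
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
  caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
  list.addAll(doStem(lowerBuffer, length, true));             // lower-cased variant
}
return list;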
@@ -169,25 +142,20 @@ final class Stemmer {
IntsRef forms = dictionary.lookupWord(word, 0, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
boolean checkNeedAffix = dictionary.needaffix != -1;
boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
// we are looking for a case variant, but this word does not allow it
if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (checkOnlyInCompound
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
continue;
}
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
char[] wordFlags = Dictionary.decodeFlags(scratch);
if (!acceptCase(caseVariant, wordFlags)) {
continue;
}
// we can't add this form, it's a pseudostem requiring an affix
if (dictionary.needaffix != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (dictionary.onlyincompound != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
continue;
}
stems.add(newStem(word, length, forms, i));
}
@@ -200,6 +168,12 @@ final class Stemmer {
return stems;
}

private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
return caseVariant
? dictionary.keepcase == -1 || !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)
: !Dictionary.hasHiddenFlag(wordFlags);
}

/**
* Find the unique stem(s) of the provided word
*
@@ -595,9 +569,7 @@ final class Stemmer {
}

// we are looking for a case variant, but this word does not allow it
if (caseVariant
&& dictionary.keepcase != -1
&& Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
if (!acceptCase(caseVariant, wordFlags)) {
continue;
}
// we aren't decompounding (yet)
@@ -0,0 +1,61 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

enum WordCase {
UPPER,
TITLE,
LOWER,
MIXED;

static WordCase caseOf(char[] word, int length) {
boolean capitalized = Character.isUpperCase(word[0]);

boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
char ch = word[i];
seenUpper = seenUpper || Character.isUpperCase(ch);
seenLower = seenLower || Character.isLowerCase(ch);
if (seenUpper && seenLower) break;
}

return get(capitalized, seenUpper, seenLower);
}

static WordCase caseOf(CharSequence word, int length) {
boolean capitalized = Character.isUpperCase(word.charAt(0));

boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
char ch = word.charAt(i);
seenUpper = seenUpper || Character.isUpperCase(ch);
seenLower = seenLower || Character.isLowerCase(ch);
if (seenUpper && seenLower) break;
}

return get(capitalized, seenUpper, seenLower);
}

private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
if (capitalized) {
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
}
return seenUpper ? MIXED : LOWER;
}
}
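As a quick illustration of the classification above (the sample inputs are chosen here for illustration, not taken from the patch):

WordCase.caseOf("UNICEF", 6);      // UPPER - capitalized, no lower-case letters
WordCase.caseOf("Unicef", 6);      // TITLE - capitalized, no further upper-case letters
WordCase.caseOf("unicef", 6);      // LOWER - not capitalized, no upper-case letters
WordCase.caseOf("OpenOffice", 10); // MIXED - both cases appear after the first letter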
@@ -0,0 +1,42 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import org.junit.BeforeClass;

public class TestAllCaps extends StemmerTestBase {

@BeforeClass
public static void beforeClass() throws Exception {
init("allcaps.aff", "allcaps.dic");
}

public void testGood() {
assertStemsTo("OpenOffice.org", "OpenOffice.org");
assertStemsTo("UNICEF's", "UNICEF");

// Hunspell returns these title-cased stems, so for consistency we do, too
assertStemsTo("OPENOFFICE.ORG", "Openoffice.org");
assertStemsTo("UNICEF'S", "Unicef");
}

public void testWrong() {
assertStemsTo("Openoffice.org");
assertStemsTo("Unicef");
assertStemsTo("Unicef's");
}
}
@@ -27,7 +27,7 @@ public class TestEscaped extends StemmerTestBase {
public void testStemming() {
assertStemsTo("works", "work");
assertStemsTo("work", "work");
assertStemsTo("R2/D2", "R2/D2");
assertStemsTo("R2/D2", "R2/D2", "R2/d2");
assertStemsTo("R2/D2s", "R2/D2");
assertStemsTo("N/A", "N/A");
assertStemsTo("N/As");
@@ -0,0 +1,5 @@
# check uppercase forms of allcaps word + affix and words with mixed casing
WORDCHARS '.

SFX S N 1
SFX S 0 's .
@@ -0,0 +1,3 @@
2
OpenOffice.org
UNICEF/S
@@ -1,4 +1,5 @@
SET UTF-8
WORDCHARS \/0123456789

SFX A Y 1
SFX A 0 s . +PLUR