mirror of https://github.com/apache/lucene.git
LUCENE-9676: Hunspell: improve stemming of all-caps words (#2217)
Repeat Hunspell's logic:
* when encountering a mixed-case or (inflectable) all-caps dictionary entry, add its title-case analog as a hidden entry
* use that hidden entry when stemming the case variants of title- and uppercase words, but don't consider it a valid word itself
* ...unless there's another explicit dictionary entry with that title case
parent c1ae6dc07c
commit 422c89baef
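Before the diff, here is a minimal, self-contained sketch of the hidden title-case trick described in the commit message. It is not the patch's actual code: the class and method names (HiddenTitleCaseDemo, addDictionaryEntry, isValidWord, stems) are invented for illustration, and it deliberately ignores affix flags, the "inflectable" condition on all-caps entries, and the lowercase-variant lookup that the real Stemmer also performs.

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

// Toy model: each known spelling maps to a flag saying whether it is a hidden
// (generated) entry or an explicit dictionary entry.
public class HiddenTitleCaseDemo {
  private final Map<String, Boolean> hiddenByWord = new HashMap<>();

  void addDictionaryEntry(String word) {
    hiddenByWord.put(word, false); // an explicit entry always overrides a hidden one
    if (hasUpperCaseAfterFirstChar(word)) { // mixed-case or all-caps entry
      // register the title-case analog as hidden, unless that spelling already exists
      hiddenByWord.putIfAbsent(toTitleCase(word), true);
    }
  }

  // A hidden entry is not a valid word by itself.
  boolean isValidWord(String word) {
    return Boolean.FALSE.equals(hiddenByWord.get(word));
  }

  // An input stems if it matches an explicit entry exactly, or if it is an all-caps
  // word whose title-case variant matches any entry, hidden or not.
  boolean stems(String word) {
    if (isValidWord(word)) {
      return true;
    }
    boolean allCaps =
        word.equals(word.toUpperCase(Locale.ROOT)) && !word.equals(word.toLowerCase(Locale.ROOT));
    return allCaps && hiddenByWord.containsKey(toTitleCase(word));
  }

  private static boolean hasUpperCaseAfterFirstChar(String word) {
    for (int i = 1; i < word.length(); i++) {
      if (Character.isUpperCase(word.charAt(i))) {
        return true;
      }
    }
    return false;
  }

  private static String toTitleCase(String word) {
    return Character.toUpperCase(word.charAt(0)) + word.substring(1).toLowerCase(Locale.ROOT);
  }

  public static void main(String[] args) {
    HiddenTitleCaseDemo dic = new HiddenTitleCaseDemo();
    dic.addDictionaryEntry("OpenOffice.org"); // mixed case, so "Openoffice.org" becomes a hidden entry

    System.out.println(dic.stems("OPENOFFICE.ORG"));      // true: all-caps variant goes through the hidden entry
    System.out.println(dic.stems("OpenOffice.org"));       // true: explicit entry
    System.out.println(dic.isValidWord("Openoffice.org")); // false: the hidden entry is not a word on its own
  }
}

This mirrors what TestAllCaps in the diff asserts: OPENOFFICE.ORG stems (to the title-cased hidden form), while Openoffice.org entered directly is rejected.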
@@ -84,7 +84,8 @@ API Changes

 Improvements

-* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
+* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
+  (Peter Gromov)

 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
   (Dawid Weiss)
@@ -73,6 +73,8 @@ public class Dictionary {

   static final char[] NOFLAGS = new char[0];

+  private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
+
   private static final String ALIAS_KEY = "AF";
   private static final String MORPH_ALIAS_KEY = "AM";
   private static final String PREFIX_KEY = "PFX";
@@ -238,10 +240,9 @@ public class Dictionary {
       readAffixFile(aff2, decoder);

       // read dictionary entries
-      IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
-      FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, o);
-      readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, fstCompiler);
-      words = fstCompiler.compile();
+      IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
+      String sortedFile = sortWordsOffline(tempDir, tempFileNamePrefix, unsorted);
+      words = readSortedDictionaries(tempDir, sortedFile);
       aliases = null; // no longer needed
       morphAliases = null; // no longer needed
       success = true;
@@ -791,25 +792,13 @@ public class Dictionary {
     }
   }

-  /**
-   * Reads the dictionary file through the provided InputStreams, building up the words map
-   *
-   * @param dictionaries InputStreams to read the dictionary file through
-   * @param decoder CharsetDecoder used to decode the contents of the file
-   * @throws IOException Can be thrown while reading from the file
-   */
-  private void readDictionaryFiles(
+  private IndexOutput mergeDictionaries(
       Directory tempDir,
       String tempFileNamePrefix,
       List<InputStream> dictionaries,
-      CharsetDecoder decoder,
-      FSTCompiler<IntsRef> words)
+      CharsetDecoder decoder)
       throws IOException {
-    BytesRefBuilder flagsScratch = new BytesRefBuilder();
-    IntsRefBuilder scratchInts = new IntsRefBuilder();
-
     StringBuilder sb = new StringBuilder();
-
     IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
     try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
       for (InputStream dictionary : dictionaries) {
@@ -833,32 +822,58 @@ public class Dictionary {
             hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
           }
         }
-        if (needsInputCleaning) {
-          int flagSep = line.indexOf(FLAG_SEPARATOR);
-          if (flagSep == -1) {
-            flagSep = line.indexOf(MORPH_SEPARATOR);
-          }
-          if (flagSep == -1) {
-            CharSequence cleansed = cleanInput(line, sb);
-            writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
-          } else {
-            String text = line.substring(0, flagSep);
-            CharSequence cleansed = cleanInput(text, sb);
-            if (cleansed != sb) {
-              sb.setLength(0);
-              sb.append(cleansed);
-            }
-            sb.append(line.substring(flagSep));
-            writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
-          }
-        } else {
-          writer.write(line.getBytes(StandardCharsets.UTF_8));
-        }
+        writeNormalizedWordEntry(sb, writer, line);
       }
     }
     CodecUtil.writeFooter(unsorted);
   }
+    return unsorted;
+  }
+
+  private void writeNormalizedWordEntry(
+      StringBuilder reuse, ByteSequencesWriter writer, String line) throws IOException {
+    int flagSep = line.indexOf(FLAG_SEPARATOR);
+    int morphSep = line.indexOf(MORPH_SEPARATOR);
+    assert morphSep > 0;
+    assert morphSep > flagSep;
+    int sep = flagSep < 0 ? morphSep : flagSep;
+
+    CharSequence toWrite;
+    if (needsInputCleaning) {
+      cleanInput(line, sep, reuse);
+      reuse.append(line, sep, line.length());
+      toWrite = reuse;
+    } else {
+      toWrite = line;
+    }
+
+    String written = toWrite.toString();
+    sep = written.length() - (line.length() - sep);
+    writer.write(written.getBytes(StandardCharsets.UTF_8));
+
+    WordCase wordCase = WordCase.caseOf(written, sep);
+    if (wordCase == WordCase.MIXED || wordCase == WordCase.UPPER && flagSep > 0) {
+      addHiddenCapitalizedWord(reuse, writer, written.substring(0, sep), written.substring(sep));
+    }
+  }
+
+  private void addHiddenCapitalizedWord(
+      StringBuilder reuse, ByteSequencesWriter writer, String word, String afterSep)
+      throws IOException {
+    reuse.setLength(0);
+    reuse.append(Character.toUpperCase(word.charAt(0)));
+    for (int i = 1; i < word.length(); i++) {
+      reuse.append(caseFold(word.charAt(i)));
+    }
+    reuse.append(FLAG_SEPARATOR);
+    reuse.append(HIDDEN_FLAG);
+    reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
+    writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
+  }
+
+  private String sortWordsOffline(
+      Directory tempDir, String tempFileNamePrefix, IndexOutput unsorted) throws IOException {
     OfflineSorter sorter =
         new OfflineSorter(
             tempDir,
@@ -908,8 +923,13 @@ public class Dictionary {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
       }
     }
-    boolean success2 = false;
+    return sorted;
+  }
+
+  private FST<IntsRef> readSortedDictionaries(Directory tempDir, String sorted) throws IOException {
+    boolean success = false;
+
+    EntryGrouper grouper = new EntryGrouper();

     try (ByteSequencesReader reader =
         new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
@@ -917,9 +937,6 @@ public class Dictionary {
       // TODO: the flags themselves can be double-chars (long) or also numeric
       // either way the trick is to encode them as char... but they must be parsed differently

-      String currentEntry = null;
-      IntsRefBuilder currentOrds = new IntsRefBuilder();
-
       while (true) {
         BytesRef scratch = reader.next();
         if (scratch == null) {
@@ -959,42 +976,15 @@ public class Dictionary {
           }
         }

-        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
-        if (cmp < 0) {
-          throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
-        } else {
-          encodeFlags(flagsScratch, wordForm);
-          int ord = flagLookup.add(flagsScratch.get());
-          if (ord < 0) {
-            // already exists in our hash
-            ord = (-ord) - 1;
-          }
-          // finalize current entry, and switch "current" if necessary
-          if (cmp > 0 && currentEntry != null) {
-            Util.toUTF32(currentEntry, scratchInts);
-            words.add(scratchInts.get(), currentOrds.get());
-          }
-          // swap current
-          if (cmp > 0) {
-            currentEntry = entry;
-            currentOrds = new IntsRefBuilder(); // must be this way
-          }
-          if (hasStemExceptions) {
-            currentOrds.append(ord);
-            currentOrds.append(stemExceptionID);
-          } else {
-            currentOrds.append(ord);
-          }
-        }
+        grouper.add(entry, wordForm, stemExceptionID);
       }

       // finalize last entry
-      assert currentEntry != null;
-      Util.toUTF32(currentEntry, scratchInts);
-      words.add(scratchInts.get(), currentOrds.get());
-      success2 = true;
+      grouper.flushGroup();
+      success = true;
+      return grouper.words.compile();
     } finally {
-      if (success2) {
+      if (success) {
         tempDir.deleteFile(sorted);
       } else {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
@@ -1002,6 +992,72 @@ public class Dictionary {
     }
   }

+  private class EntryGrouper {
+    final FSTCompiler<IntsRef> words =
+        new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
+    private final List<char[]> group = new ArrayList<>();
+    private final List<Integer> stemExceptionIDs = new ArrayList<>();
+    private final BytesRefBuilder flagsScratch = new BytesRefBuilder();
+    private final IntsRefBuilder scratchInts = new IntsRefBuilder();
+    private String currentEntry = null;
+
+    void add(String entry, char[] flags, int stemExceptionID) throws IOException {
+      if (!entry.equals(currentEntry)) {
+        if (currentEntry != null) {
+          if (entry.compareTo(currentEntry) < 0) {
+            throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
+          }
+          flushGroup();
+        }
+        currentEntry = entry;
+      }
+
+      group.add(flags);
+      if (hasStemExceptions) {
+        stemExceptionIDs.add(stemExceptionID);
+      }
+    }
+
+    void flushGroup() throws IOException {
+      IntsRefBuilder currentOrds = new IntsRefBuilder();
+
+      boolean hasNonHidden = false;
+      for (char[] flags : group) {
+        if (!hasHiddenFlag(flags)) {
+          hasNonHidden = true;
+          break;
+        }
+      }
+
+      for (int i = 0; i < group.size(); i++) {
+        char[] flags = group.get(i);
+        if (hasNonHidden && hasHiddenFlag(flags)) {
+          continue;
+        }
+
+        encodeFlags(flagsScratch, flags);
+        int ord = flagLookup.add(flagsScratch.get());
+        if (ord < 0) {
+          ord = -ord - 1; // already exists in our hash
+        }
+        currentOrds.append(ord);
+        if (hasStemExceptions) {
+          currentOrds.append(stemExceptionIDs.get(i));
+        }
+      }
+
+      Util.toUTF32(currentEntry, scratchInts);
+      words.add(scratchInts.get(), currentOrds.get());
+
+      group.clear();
+      stemExceptionIDs.clear();
+    }
+  }
+
+  static boolean hasHiddenFlag(char[] flags) {
+    return hasFlag(flags, HIDDEN_FLAG);
+  }
+
   static char[] decodeFlags(BytesRef b) {
     if (b.length == 0) {
       return CharsRef.EMPTY_CHARS;
@@ -1191,9 +1247,13 @@ public class Dictionary {
   }

   CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
+    return cleanInput(input, input.length(), reuse);
+  }
+
+  private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
     reuse.setLength(0);

-    for (int i = 0; i < input.length(); i++) {
+    for (int i = 0; i < prefixLength; i++) {
       char ch = input.charAt(i);

       if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
@@ -95,57 +95,30 @@ final class Stemmer {
       word = scratchBuffer;
     }

-    int caseType = caseOf(word, length);
-    if (caseType == UPPER_CASE) {
-      // upper: union exact, title, lower
+    WordCase wordCase = caseOf(word, length);
+    List<CharsRef> list = doStem(word, length, false);
+    if (wordCase == WordCase.UPPER) {
       caseFoldTitle(word, length);
-      caseFoldLower(titleBuffer, length);
-      List<CharsRef> list = doStem(word, length, false);
       list.addAll(doStem(titleBuffer, length, true));
-      list.addAll(doStem(lowerBuffer, length, true));
-      return list;
-    } else if (caseType == TITLE_CASE) {
-      // title: union exact, lower
-      caseFoldLower(word, length);
-      List<CharsRef> list = doStem(word, length, false);
-      list.addAll(doStem(lowerBuffer, length, true));
-      return list;
-    } else {
-      // exact match only
-      return doStem(word, length, false);
     }
+    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
+      caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
+      list.addAll(doStem(lowerBuffer, length, true));
+    }
+    return list;
   }

   // temporary buffers for case variants
   private char[] lowerBuffer = new char[8];
   private char[] titleBuffer = new char[8];

-  private static final int EXACT_CASE = 0;
-  private static final int TITLE_CASE = 1;
-  private static final int UPPER_CASE = 2;
-
   /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
-  private int caseOf(char[] word, int length) {
+  private WordCase caseOf(char[] word, int length) {
     if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
-      return EXACT_CASE;
+      return WordCase.MIXED;
     }

-    // determine if we are title or lowercase (or something funky, in which it's exact)
-    boolean seenUpper = false;
-    boolean seenLower = false;
-    for (int i = 1; i < length; i++) {
-      boolean v = Character.isUpperCase(word[i]);
-      seenUpper |= v;
-      seenLower |= !v;
-    }
-
-    if (!seenLower) {
-      return UPPER_CASE;
-    } else if (!seenUpper) {
-      return TITLE_CASE;
-    } else {
-      return EXACT_CASE;
-    }
+    return WordCase.caseOf(word, length);
   }

   /** folds titlecase variant of word to titleBuffer */
@@ -169,26 +142,21 @@ final class Stemmer {
     IntsRef forms = dictionary.lookupWord(word, 0, length);
     if (forms != null) {
       for (int i = 0; i < forms.length; i += formStep) {
-        boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
-        boolean checkNeedAffix = dictionary.needaffix != -1;
-        boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
-        if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
         dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
         char[] wordFlags = Dictionary.decodeFlags(scratch);
-        // we are looking for a case variant, but this word does not allow it
-        if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
+        if (!acceptCase(caseVariant, wordFlags)) {
           continue;
         }
         // we can't add this form, it's a pseudostem requiring an affix
-        if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
+        if (dictionary.needaffix != -1
+            && Dictionary.hasFlag(wordFlags, (char) dictionary.needaffix)) {
           continue;
         }
         // we can't add this form, it only belongs inside a compound word
-        if (checkOnlyInCompound
+        if (dictionary.onlyincompound != -1
             && Dictionary.hasFlag(wordFlags, (char) dictionary.onlyincompound)) {
           continue;
         }
-        }
         stems.add(newStem(word, length, forms, i));
       }
     }
@@ -200,6 +168,12 @@ final class Stemmer {
     return stems;
   }

+  private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
+    return caseVariant
+        ? dictionary.keepcase == -1 || !Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)
+        : !Dictionary.hasHiddenFlag(wordFlags);
+  }
+
   /**
    * Find the unique stem(s) of the provided word
    *
@@ -595,9 +569,7 @@ final class Stemmer {
       }

       // we are looking for a case variant, but this word does not allow it
-      if (caseVariant
-          && dictionary.keepcase != -1
-          && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
+      if (!acceptCase(caseVariant, wordFlags)) {
         continue;
       }
       // we aren't decompounding (yet)
New file: WordCase.java

@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+enum WordCase {
+  UPPER,
+  TITLE,
+  LOWER,
+  MIXED;
+
+  static WordCase caseOf(char[] word, int length) {
+    boolean capitalized = Character.isUpperCase(word[0]);
+
+    boolean seenUpper = false;
+    boolean seenLower = false;
+    for (int i = 1; i < length; i++) {
+      char ch = word[i];
+      seenUpper = seenUpper || Character.isUpperCase(ch);
+      seenLower = seenLower || Character.isLowerCase(ch);
+      if (seenUpper && seenLower) break;
+    }
+
+    return get(capitalized, seenUpper, seenLower);
+  }
+
+  static WordCase caseOf(CharSequence word, int length) {
+    boolean capitalized = Character.isUpperCase(word.charAt(0));
+
+    boolean seenUpper = false;
+    boolean seenLower = false;
+    for (int i = 1; i < length; i++) {
+      char ch = word.charAt(i);
+      seenUpper = seenUpper || Character.isUpperCase(ch);
+      seenLower = seenLower || Character.isLowerCase(ch);
+      if (seenUpper && seenLower) break;
+    }
+
+    return get(capitalized, seenUpper, seenLower);
+  }
+
+  private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
+    if (capitalized) {
+      return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
+    }
+    return seenUpper ? MIXED : LOWER;
+  }
+}
New file: TestAllCaps.java

@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import org.junit.BeforeClass;
+
+public class TestAllCaps extends StemmerTestBase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    init("allcaps.aff", "allcaps.dic");
+  }
+
+  public void testGood() {
+    assertStemsTo("OpenOffice.org", "OpenOffice.org");
+    assertStemsTo("UNICEF's", "UNICEF");
+
+    // Hunspell returns these title-cased stems, so for consistency we do, too
+    assertStemsTo("OPENOFFICE.ORG", "Openoffice.org");
+    assertStemsTo("UNICEF'S", "Unicef");
+  }
+
+  public void testWrong() {
+    assertStemsTo("Openoffice.org");
+    assertStemsTo("Unicef");
+    assertStemsTo("Unicef's");
+  }
+}
@@ -27,7 +27,7 @@ public class TestEscaped extends StemmerTestBase {
   public void testStemming() {
     assertStemsTo("works", "work");
     assertStemsTo("work", "work");
-    assertStemsTo("R2/D2", "R2/D2");
+    assertStemsTo("R2/D2", "R2/D2", "R2/d2");
     assertStemsTo("R2/D2s", "R2/D2");
     assertStemsTo("N/A", "N/A");
     assertStemsTo("N/As");
New file: allcaps.aff

@@ -0,0 +1,5 @@
+# check uppercase forms of allcaps word + affix and words with mixed casing
+WORDCHARS '.
+
+SFX S N 1
+SFX S 0 's .
New file: allcaps.dic

@@ -0,0 +1,3 @@
+2
+OpenOffice.org
+UNICEF/S
Affix test resource (evidently the .aff file used by TestEscaped):

@@ -1,4 +1,5 @@
 SET UTF-8
+WORDCHARS \/0123456789

 SFX A Y 1
 SFX A 0 s . +PLUR