mirror of https://github.com/apache/lucene.git
LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)
This commit is contained in:
parent
cf5db8d651
commit
d7968130c3
|
@ -86,8 +86,8 @@ API Changes
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
|
* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
|
||||||
BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
|
BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
|
||||||
|
|
||||||
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
|
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
|
||||||
(Dawid Weiss)
|
(Dawid Weiss)
|
||||||
|
|
|
@ -0,0 +1,105 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
|
||||||
|
class CompoundRule {
|
||||||
|
private final char[] data;
|
||||||
|
private final Dictionary dictionary;
|
||||||
|
|
||||||
|
CompoundRule(String rule, Dictionary dictionary) {
|
||||||
|
this.dictionary = dictionary;
|
||||||
|
StringBuilder parsedFlags = new StringBuilder();
|
||||||
|
int pos = 0;
|
||||||
|
while (pos < rule.length()) {
|
||||||
|
int lParen = rule.indexOf("(", pos);
|
||||||
|
if (lParen < 0) {
|
||||||
|
parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos)));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos, lParen)));
|
||||||
|
int rParen = rule.indexOf(')', lParen + 1);
|
||||||
|
if (rParen < 0) {
|
||||||
|
throw new IllegalArgumentException("Unmatched parentheses: " + rule);
|
||||||
|
}
|
||||||
|
|
||||||
|
parsedFlags.append(
|
||||||
|
dictionary.flagParsingStrategy.parseFlags(rule.substring(lParen + 1, rParen)));
|
||||||
|
pos = rParen + 1;
|
||||||
|
if (pos < rule.length() && (rule.charAt(pos) == '?' || rule.charAt(pos) == '*')) {
|
||||||
|
parsedFlags.append(rule.charAt(pos++));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
data = parsedFlags.toString().toCharArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean mayMatch(List<IntsRef> words, BytesRef scratch) {
|
||||||
|
return match(words, 0, 0, scratch, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean fullyMatches(List<IntsRef> words, BytesRef scratch) {
|
||||||
|
return match(words, 0, 0, scratch, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean match(
|
||||||
|
List<IntsRef> words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) {
|
||||||
|
if (patternIndex >= data.length) {
|
||||||
|
return wordIndex >= words.size();
|
||||||
|
}
|
||||||
|
if (wordIndex >= words.size() && !fully) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
char flag = data[patternIndex];
|
||||||
|
if (patternIndex < data.length - 1 && data[patternIndex + 1] == '*') {
|
||||||
|
int startWI = wordIndex;
|
||||||
|
while (wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch)) {
|
||||||
|
wordIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (wordIndex >= startWI) {
|
||||||
|
if (match(words, patternIndex + 2, wordIndex, scratch, fully)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
wordIndex--;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean currentWordMatches =
|
||||||
|
wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch);
|
||||||
|
|
||||||
|
if (patternIndex < data.length - 1 && data[patternIndex + 1] == '?') {
|
||||||
|
if (currentWordMatches && match(words, patternIndex + 2, wordIndex + 1, scratch, fully)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return match(words, patternIndex + 2, wordIndex, scratch, fully);
|
||||||
|
}
|
||||||
|
|
||||||
|
return currentWordMatches && match(words, patternIndex + 1, wordIndex + 1, scratch, fully);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return new String(data);
|
||||||
|
}
|
||||||
|
}
|
|
@ -92,6 +92,8 @@ public class Dictionary {
|
||||||
private static final String LANG_KEY = "LANG";
|
private static final String LANG_KEY = "LANG";
|
||||||
private static final String BREAK_KEY = "BREAK";
|
private static final String BREAK_KEY = "BREAK";
|
||||||
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
|
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
|
||||||
|
private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
|
||||||
|
private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
|
||||||
private static final String KEEPCASE_KEY = "KEEPCASE";
|
private static final String KEEPCASE_KEY = "KEEPCASE";
|
||||||
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
|
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
|
||||||
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
|
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
|
||||||
|
@ -136,7 +138,7 @@ public class Dictionary {
|
||||||
static final int AFFIX_APPEND = 3;
|
static final int AFFIX_APPEND = 3;
|
||||||
|
|
||||||
// Default flag parsing strategy
|
// Default flag parsing strategy
|
||||||
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
|
FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
|
||||||
|
|
||||||
// AF entries
|
// AF entries
|
||||||
private String[] aliases;
|
private String[] aliases;
|
||||||
|
@ -163,6 +165,8 @@ public class Dictionary {
|
||||||
int needaffix = -1; // needaffix flag, or -1 if one is not defined
|
int needaffix = -1; // needaffix flag, or -1 if one is not defined
|
||||||
int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
|
int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
|
||||||
int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
|
int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
|
||||||
|
int compoundMin = 3;
|
||||||
|
List<CompoundRule> compoundRules; // nullable
|
||||||
|
|
||||||
// ignored characters (dictionary, affix, inputs)
|
// ignored characters (dictionary, affix, inputs)
|
||||||
private char[] ignore;
|
private char[] ignore;
|
||||||
|
@ -419,6 +423,18 @@ public class Dictionary {
|
||||||
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
|
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
|
||||||
}
|
}
|
||||||
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
|
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
|
||||||
|
} else if (line.startsWith(COMPOUNDMIN_KEY)) {
|
||||||
|
String[] parts = line.split("\\s+");
|
||||||
|
if (parts.length != 2) {
|
||||||
|
throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
|
||||||
|
}
|
||||||
|
compoundMin = Math.max(1, Integer.parseInt(parts[1]));
|
||||||
|
} else if (line.startsWith(COMPOUNDRULE_KEY)) {
|
||||||
|
String[] parts = line.split("\\s+");
|
||||||
|
if (parts.length != 2) {
|
||||||
|
throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
|
||||||
|
}
|
||||||
|
this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -442,6 +458,21 @@ public class Dictionary {
|
||||||
stripOffsets[currentIndex] = currentOffset;
|
stripOffsets[currentIndex] = currentOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
|
||||||
|
throws IOException, ParseException {
|
||||||
|
String line;
|
||||||
|
List<CompoundRule> compoundRules = new ArrayList<>();
|
||||||
|
for (int i = 0; i < num; i++) {
|
||||||
|
line = reader.readLine();
|
||||||
|
String[] parts = line.split("\\s+");
|
||||||
|
if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
|
||||||
|
throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
|
||||||
|
}
|
||||||
|
compoundRules.add(new CompoundRule(parts[1], this));
|
||||||
|
}
|
||||||
|
return compoundRules;
|
||||||
|
}
|
||||||
|
|
||||||
private Breaks parseBreaks(LineNumberReader reader, String line)
|
private Breaks parseBreaks(LineNumberReader reader, String line)
|
||||||
throws IOException, ParseException {
|
throws IOException, ParseException {
|
||||||
Set<String> starting = new LinkedHashSet<>();
|
Set<String> starting = new LinkedHashSet<>();
|
||||||
|
@ -910,7 +941,7 @@ public class Dictionary {
|
||||||
reuse.append(caseFold(word.charAt(i)));
|
reuse.append(caseFold(word.charAt(i)));
|
||||||
}
|
}
|
||||||
reuse.append(FLAG_SEPARATOR);
|
reuse.append(FLAG_SEPARATOR);
|
||||||
reuse.append(HIDDEN_FLAG);
|
flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
|
||||||
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
||||||
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
|
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
|
||||||
}
|
}
|
||||||
|
@ -1188,18 +1219,21 @@ public class Dictionary {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean isForbiddenWord(char[] word, BytesRef scratch) {
|
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
|
||||||
if (forbiddenword != -1) {
|
if (forbiddenword != -1) {
|
||||||
IntsRef forms = lookupWord(word, 0, word.length);
|
IntsRef forms = lookupWord(word, 0, length);
|
||||||
if (forms != null) {
|
return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) {
|
||||||
int formStep = formStep();
|
int formStep = formStep();
|
||||||
for (int i = 0; i < forms.length; i += formStep) {
|
for (int i = 0; i < forms.length; i += formStep) {
|
||||||
if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) {
|
if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1227,6 +1261,8 @@ public class Dictionary {
|
||||||
* @return Parsed flags
|
* @return Parsed flags
|
||||||
*/
|
*/
|
||||||
abstract char[] parseFlags(String rawFlags);
|
abstract char[] parseFlags(String rawFlags);
|
||||||
|
|
||||||
|
abstract void appendFlag(char flag, StringBuilder to);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1238,6 +1274,11 @@ public class Dictionary {
|
||||||
public char[] parseFlags(String rawFlags) {
|
public char[] parseFlags(String rawFlags) {
|
||||||
return rawFlags.toCharArray();
|
return rawFlags.toCharArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
void appendFlag(char flag, StringBuilder to) {
|
||||||
|
to.append(flag);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1266,6 +1307,14 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
void appendFlag(char flag, StringBuilder to) {
|
||||||
|
if (to.length() > 0) {
|
||||||
|
to.append(",");
|
||||||
|
}
|
||||||
|
to.append((int) flag);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1300,6 +1349,16 @@ public class Dictionary {
|
||||||
builder.getChars(0, builder.length(), flags, 0);
|
builder.getChars(0, builder.length(), flags, 0);
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
void appendFlag(char flag, StringBuilder to) {
|
||||||
|
to.append((char) (flag >> 8));
|
||||||
|
to.append((char) (flag & 0xff));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean hasCompounding() {
|
||||||
|
return compoundRules != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean hasFlag(int entryId, char flag, BytesRef scratch) {
|
boolean hasFlag(int entryId, char flag, BytesRef scratch) {
|
||||||
|
|
|
@ -16,7 +16,10 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
|
* A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
|
||||||
|
@ -37,26 +40,100 @@ public class SpellChecker {
|
||||||
public boolean spell(String word) {
|
public boolean spell(String word) {
|
||||||
if (word.isEmpty()) return true;
|
if (word.isEmpty()) return true;
|
||||||
|
|
||||||
char[] wordChars = word.toCharArray();
|
if (dictionary.needsInputCleaning) {
|
||||||
if (dictionary.isForbiddenWord(wordChars, scratch)) {
|
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isNumber(word)) {
|
if (isNumber(word)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
|
char[] wordChars = word.toCharArray();
|
||||||
|
if (checkWord(wordChars, wordChars.length, false)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
|
WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
|
||||||
|
if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dictionary.breaks.isNotEmpty()
|
||||||
|
&& !hasTooManyBreakOccurrences(word)
|
||||||
|
&& !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
|
||||||
return tryBreaks(word);
|
return tryBreaks(word);
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
|
||||||
|
char[] caseVariant = wordChars;
|
||||||
|
if (wordCase == WordCase.UPPER) {
|
||||||
|
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
|
||||||
|
if (checkWord(caseVariant, wordChars.length, true)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
|
||||||
|
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dictionary.hasCompounding()) {
|
||||||
|
return checkCompounds(wordChars, 0, length, new ArrayList<>());
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
|
||||||
|
if (words.size() >= 100) return false;
|
||||||
|
|
||||||
|
int limit = length - dictionary.compoundMin + 1;
|
||||||
|
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
|
||||||
|
IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
|
||||||
|
if (forms != null) {
|
||||||
|
words.add(forms);
|
||||||
|
|
||||||
|
if (dictionary.compoundRules != null
|
||||||
|
&& dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words, scratch))) {
|
||||||
|
if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
words.remove(words.size() - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean checkLastCompoundPart(
|
||||||
|
char[] wordChars, int start, int length, List<IntsRef> words) {
|
||||||
|
IntsRef forms = dictionary.lookupWord(wordChars, start, length);
|
||||||
|
if (forms == null) return false;
|
||||||
|
|
||||||
|
words.add(forms);
|
||||||
|
boolean result =
|
||||||
|
dictionary.compoundRules != null
|
||||||
|
&& dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
|
||||||
|
words.remove(words.size() - 1);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
private static boolean isNumber(String s) {
|
private static boolean isNumber(String s) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while (i < s.length()) {
|
while (i < s.length()) {
|
||||||
|
|
|
@ -112,8 +112,8 @@ final class Stemmer {
|
||||||
private char[] titleBuffer = new char[8];
|
private char[] titleBuffer = new char[8];
|
||||||
|
|
||||||
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
|
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
|
||||||
private WordCase caseOf(char[] word, int length) {
|
WordCase caseOf(char[] word, int length) {
|
||||||
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
|
if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
|
||||||
return WordCase.MIXED;
|
return WordCase.MIXED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,22 +121,24 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** folds titlecase variant of word to titleBuffer */
|
/** folds titlecase variant of word to titleBuffer */
|
||||||
private void caseFoldTitle(char[] word, int length) {
|
char[] caseFoldTitle(char[] word, int length) {
|
||||||
titleBuffer = ArrayUtil.grow(titleBuffer, length);
|
titleBuffer = ArrayUtil.grow(titleBuffer, length);
|
||||||
System.arraycopy(word, 0, titleBuffer, 0, length);
|
System.arraycopy(word, 0, titleBuffer, 0, length);
|
||||||
for (int i = 1; i < length; i++) {
|
for (int i = 1; i < length; i++) {
|
||||||
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
|
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
|
||||||
}
|
}
|
||||||
|
return titleBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** folds lowercase variant of word (title cased) to lowerBuffer */
|
/** folds lowercase variant of word (title cased) to lowerBuffer */
|
||||||
private void caseFoldLower(char[] word, int length) {
|
char[] caseFoldLower(char[] word, int length) {
|
||||||
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
|
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
|
||||||
System.arraycopy(word, 0, lowerBuffer, 0, length);
|
System.arraycopy(word, 0, lowerBuffer, 0, length);
|
||||||
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
|
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
|
||||||
|
return lowerBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
|
List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
|
||||||
List<CharsRef> stems = new ArrayList<>();
|
List<CharsRef> stems = new ArrayList<>();
|
||||||
IntsRef forms = dictionary.lookupWord(word, 0, length);
|
IntsRef forms = dictionary.lookupWord(word, 0, length);
|
||||||
if (forms != null) {
|
if (forms != null) {
|
||||||
|
|
|
@ -23,7 +23,7 @@ enum WordCase {
|
||||||
MIXED;
|
MIXED;
|
||||||
|
|
||||||
static WordCase caseOf(char[] word, int length) {
|
static WordCase caseOf(char[] word, int length) {
|
||||||
boolean capitalized = Character.isUpperCase(word[0]);
|
boolean startsWithLower = Character.isLowerCase(word[0]);
|
||||||
|
|
||||||
boolean seenUpper = false;
|
boolean seenUpper = false;
|
||||||
boolean seenLower = false;
|
boolean seenLower = false;
|
||||||
|
@ -34,11 +34,11 @@ enum WordCase {
|
||||||
if (seenUpper && seenLower) break;
|
if (seenUpper && seenLower) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return get(capitalized, seenUpper, seenLower);
|
return get(startsWithLower, seenUpper, seenLower);
|
||||||
}
|
}
|
||||||
|
|
||||||
static WordCase caseOf(CharSequence word, int length) {
|
static WordCase caseOf(CharSequence word, int length) {
|
||||||
boolean capitalized = Character.isUpperCase(word.charAt(0));
|
boolean startsWithLower = Character.isLowerCase(word.charAt(0));
|
||||||
|
|
||||||
boolean seenUpper = false;
|
boolean seenUpper = false;
|
||||||
boolean seenLower = false;
|
boolean seenLower = false;
|
||||||
|
@ -49,11 +49,11 @@ enum WordCase {
|
||||||
if (seenUpper && seenLower) break;
|
if (seenUpper && seenLower) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return get(capitalized, seenUpper, seenLower);
|
return get(startsWithLower, seenUpper, seenLower);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
|
private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
|
||||||
if (capitalized) {
|
if (!startsWithLower) {
|
||||||
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
|
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
|
||||||
}
|
}
|
||||||
return seenUpper ? MIXED : LOWER;
|
return seenUpper ? MIXED : LOWER;
|
||||||
|
|
|
@ -43,6 +43,38 @@ public class SpellCheckerTest extends StemmerTestBase {
|
||||||
doTest("breakoff");
|
doTest("breakoff");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCompoundrule() throws Exception {
|
||||||
|
doTest("compoundrule");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCompoundrule2() throws Exception {
|
||||||
|
doTest("compoundrule2");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCompoundrule3() throws Exception {
|
||||||
|
doTest("compoundrule3");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCompoundrule4() throws Exception {
|
||||||
|
doTest("compoundrule4");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCompoundrule5() throws Exception {
|
||||||
|
doTest("compoundrule5");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCompoundrule6() throws Exception {
|
||||||
|
doTest("compoundrule6");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCompoundrule7() throws Exception {
|
||||||
|
doTest("compoundrule7");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCompoundrule8() throws Exception {
|
||||||
|
doTest("compoundrule8");
|
||||||
|
}
|
||||||
|
|
||||||
protected void doTest(String name) throws Exception {
|
protected void doTest(String name) throws Exception {
|
||||||
InputStream affixStream =
|
InputStream affixStream =
|
||||||
Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
|
Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
|
import java.util.Random;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
@ -33,6 +34,7 @@ import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FSTCompiler;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.Outputs;
|
import org.apache.lucene.util.fst.Outputs;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TestDictionary extends LuceneTestCase {
|
public class TestDictionary extends LuceneTestCase {
|
||||||
|
|
||||||
|
@ -268,6 +270,27 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8"));
|
assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFlagSerialization() {
|
||||||
|
Random r = random();
|
||||||
|
char[] flags = new char[r.nextInt(10)];
|
||||||
|
for (int i = 0; i < flags.length; i++) {
|
||||||
|
flags[i] = (char) r.nextInt(Character.MAX_VALUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] flagLines = {"FLAG long", "FLAG UTF-8", "FLAG num"};
|
||||||
|
for (String flagLine : flagLines) {
|
||||||
|
Dictionary.FlagParsingStrategy strategy = Dictionary.getFlagParsingStrategy(flagLine);
|
||||||
|
StringBuilder serialized = new StringBuilder();
|
||||||
|
for (char flag : flags) {
|
||||||
|
strategy.appendFlag(flag, serialized);
|
||||||
|
}
|
||||||
|
|
||||||
|
char[] deserialized = strategy.parseFlags(serialized.toString());
|
||||||
|
assertEquals(new String(flags), new String(deserialized));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private Directory getDirectory() {
|
private Directory getDirectory() {
|
||||||
return newDirectory();
|
return newDirectory();
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
COMPOUNDMIN 1
|
||||||
|
COMPOUNDRULE 1
|
||||||
|
COMPOUNDRULE ABC
|
|
@ -0,0 +1,5 @@
|
||||||
|
3
|
||||||
|
a/A
|
||||||
|
b/B
|
||||||
|
c/BC
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
abc
|
||||||
|
acc
|
|
@ -0,0 +1,39 @@
|
||||||
|
ba
|
||||||
|
aaabaaa
|
||||||
|
bbaaa
|
||||||
|
aaaaba
|
||||||
|
bbbbbaa
|
||||||
|
aa
|
||||||
|
aaa
|
||||||
|
aaaa
|
||||||
|
ab
|
||||||
|
aab
|
||||||
|
aaab
|
||||||
|
aaaab
|
||||||
|
abb
|
||||||
|
aabb
|
||||||
|
aaabbb
|
||||||
|
bb
|
||||||
|
bbb
|
||||||
|
bbbb
|
||||||
|
aaab
|
||||||
|
abcc
|
||||||
|
abbc
|
||||||
|
abbcc
|
||||||
|
aabc
|
||||||
|
aabcc
|
||||||
|
aabbc
|
||||||
|
aabbcc
|
||||||
|
aaabbbccc
|
||||||
|
ac
|
||||||
|
aac
|
||||||
|
aacc
|
||||||
|
aaaccc
|
||||||
|
bc
|
||||||
|
bcc
|
||||||
|
bbc
|
||||||
|
bbcc
|
||||||
|
bbbccc
|
||||||
|
cc
|
||||||
|
ccc
|
||||||
|
cccccc
|
|
@ -0,0 +1,3 @@
|
||||||
|
COMPOUNDMIN 1
|
||||||
|
COMPOUNDRULE 1
|
||||||
|
COMPOUNDRULE A*B*C*
|
|
@ -0,0 +1,5 @@
|
||||||
|
3
|
||||||
|
a/A
|
||||||
|
b/B
|
||||||
|
c/C
|
||||||
|
|
|
@ -0,0 +1,37 @@
|
||||||
|
aa
|
||||||
|
aaa
|
||||||
|
aaaa
|
||||||
|
ab
|
||||||
|
aab
|
||||||
|
aaab
|
||||||
|
aaaab
|
||||||
|
abb
|
||||||
|
aabb
|
||||||
|
aaabbb
|
||||||
|
bb
|
||||||
|
bbb
|
||||||
|
bbbb
|
||||||
|
aaab
|
||||||
|
abc
|
||||||
|
abcc
|
||||||
|
abbc
|
||||||
|
abbcc
|
||||||
|
aabc
|
||||||
|
aabcc
|
||||||
|
aabbc
|
||||||
|
aabbcc
|
||||||
|
aaabbbccc
|
||||||
|
ac
|
||||||
|
acc
|
||||||
|
aac
|
||||||
|
aacc
|
||||||
|
aaaccc
|
||||||
|
bc
|
||||||
|
bcc
|
||||||
|
bbc
|
||||||
|
bbcc
|
||||||
|
bbbccc
|
||||||
|
cc
|
||||||
|
ccc
|
||||||
|
cccccc
|
||||||
|
abcc
|
|
@ -0,0 +1,8 @@
|
||||||
|
ba
|
||||||
|
aaabaaa
|
||||||
|
bbaaa
|
||||||
|
aaaaba
|
||||||
|
bbbbbaa
|
||||||
|
cba
|
||||||
|
cab
|
||||||
|
acb
|
|
@ -0,0 +1,3 @@
|
||||||
|
COMPOUNDMIN 1
|
||||||
|
COMPOUNDRULE 1
|
||||||
|
COMPOUNDRULE A?B?C?
|
|
@ -0,0 +1,5 @@
|
||||||
|
3
|
||||||
|
a/A
|
||||||
|
b/B
|
||||||
|
c/C
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
a
|
||||||
|
b
|
||||||
|
c
|
||||||
|
ab
|
||||||
|
abc
|
||||||
|
ac
|
||||||
|
bc
|
|
@ -0,0 +1,41 @@
|
||||||
|
aa
|
||||||
|
aaa
|
||||||
|
aaaa
|
||||||
|
aab
|
||||||
|
aaab
|
||||||
|
aaaab
|
||||||
|
abb
|
||||||
|
aabb
|
||||||
|
aaabbb
|
||||||
|
bb
|
||||||
|
bbb
|
||||||
|
bbbb
|
||||||
|
aaab
|
||||||
|
abcc
|
||||||
|
abbc
|
||||||
|
abbcc
|
||||||
|
aabc
|
||||||
|
aabcc
|
||||||
|
aabbc
|
||||||
|
aabbcc
|
||||||
|
aaabbbccc
|
||||||
|
acc
|
||||||
|
aac
|
||||||
|
aacc
|
||||||
|
aaaccc
|
||||||
|
bcc
|
||||||
|
bbc
|
||||||
|
bbcc
|
||||||
|
bbbccc
|
||||||
|
cc
|
||||||
|
ccc
|
||||||
|
cccccc
|
||||||
|
abcc
|
||||||
|
ba
|
||||||
|
aaabaaa
|
||||||
|
bbaaa
|
||||||
|
aaaaba
|
||||||
|
bbbbbaa
|
||||||
|
cba
|
||||||
|
cab
|
||||||
|
acb
|
|
@ -0,0 +1,7 @@
|
||||||
|
# English ordinal numbers
|
||||||
|
WORDCHARS 0123456789
|
||||||
|
COMPOUNDMIN 1
|
||||||
|
ONLYINCOMPOUND c
|
||||||
|
COMPOUNDRULE 2
|
||||||
|
COMPOUNDRULE n*1t
|
||||||
|
COMPOUNDRULE n*mp
|
|
@ -0,0 +1,24 @@
|
||||||
|
22
|
||||||
|
0/nm
|
||||||
|
1/n1
|
||||||
|
2/nm
|
||||||
|
3/nm
|
||||||
|
4/nm
|
||||||
|
5/nm
|
||||||
|
6/nm
|
||||||
|
7/nm
|
||||||
|
8/nm
|
||||||
|
9/nm
|
||||||
|
0th/pt
|
||||||
|
1st/p
|
||||||
|
1th/tc
|
||||||
|
2nd/p
|
||||||
|
2th/tc
|
||||||
|
3rd/p
|
||||||
|
3th/tc
|
||||||
|
4th/pt
|
||||||
|
5th/pt
|
||||||
|
6th/pt
|
||||||
|
7th/pt
|
||||||
|
8th/pt
|
||||||
|
9th/pt
|
|
@ -0,0 +1,31 @@
|
||||||
|
1st
|
||||||
|
2nd
|
||||||
|
3rd
|
||||||
|
4th
|
||||||
|
5th
|
||||||
|
6th
|
||||||
|
7th
|
||||||
|
8th
|
||||||
|
9th
|
||||||
|
10th
|
||||||
|
11th
|
||||||
|
12th
|
||||||
|
13th
|
||||||
|
14th
|
||||||
|
15th
|
||||||
|
16th
|
||||||
|
17th
|
||||||
|
18th
|
||||||
|
19th
|
||||||
|
20th
|
||||||
|
21st
|
||||||
|
22nd
|
||||||
|
23rd
|
||||||
|
24th
|
||||||
|
25th
|
||||||
|
100th
|
||||||
|
1000th
|
||||||
|
10001st
|
||||||
|
10011th
|
||||||
|
1ST
|
||||||
|
42ND
|
|
@ -0,0 +1,5 @@
|
||||||
|
1th
|
||||||
|
2th
|
||||||
|
3th
|
||||||
|
10001th
|
||||||
|
10011st
|
|
@ -0,0 +1,7 @@
|
||||||
|
# number + percent
|
||||||
|
SET UTF-8
|
||||||
|
COMPOUNDMIN 1
|
||||||
|
COMPOUNDRULE 2
|
||||||
|
COMPOUNDRULE N*%?
|
||||||
|
COMPOUNDRULE NN*.NN*%?
|
||||||
|
WORDCHARS 0123456789‰.
|
|
@ -0,0 +1,14 @@
|
||||||
|
13
|
||||||
|
0/N po:num
|
||||||
|
1/N po:num
|
||||||
|
2/N po:num
|
||||||
|
3/N po:num
|
||||||
|
4/N po:num
|
||||||
|
5/N po:num
|
||||||
|
6/N po:num
|
||||||
|
7/N po:num
|
||||||
|
8/N po:num
|
||||||
|
9/N po:num
|
||||||
|
./. po:sign_dot
|
||||||
|
%/% po:sign_percent
|
||||||
|
‰/% po:sign_per_mille
|
|
@ -0,0 +1,7 @@
|
||||||
|
10%
|
||||||
|
0.2%
|
||||||
|
0.20%
|
||||||
|
123.4561‰
|
||||||
|
10
|
||||||
|
0000
|
||||||
|
10.25
|
|
@ -0,0 +1 @@
|
||||||
|
.25
|
|
@ -0,0 +1,4 @@
|
||||||
|
COMPOUNDMIN 1
|
||||||
|
COMPOUNDRULE 2
|
||||||
|
COMPOUNDRULE A*A
|
||||||
|
COMPOUNDRULE A*AAB*BBBC*C
|
|
@ -0,0 +1,5 @@
|
||||||
|
3
|
||||||
|
a/A
|
||||||
|
b/B
|
||||||
|
c/C
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
aa
|
||||||
|
aaaaaa
|
||||||
|
aabbbc
|
||||||
|
aaaaabbbbbbcccccc
|
|
@ -0,0 +1,4 @@
|
||||||
|
abc
|
||||||
|
abbbbbccccccc
|
||||||
|
aabbccccccc
|
||||||
|
aabbbbbbb
|
|
@ -0,0 +1,8 @@
|
||||||
|
# English ordinal numbers (parenthesized long flags)
|
||||||
|
FLAG long
|
||||||
|
WORDCHARS 0123456789
|
||||||
|
COMPOUNDMIN 1
|
||||||
|
ONLYINCOMPOUND cc
|
||||||
|
COMPOUNDRULE 2
|
||||||
|
COMPOUNDRULE (nn)*(11)(tt)
|
||||||
|
COMPOUNDRULE (nn)*(mm)(pp)
|
|
@ -0,0 +1,24 @@
|
||||||
|
22
|
||||||
|
0/nnmm
|
||||||
|
1/nn11
|
||||||
|
2/nnmm
|
||||||
|
3/nnmm
|
||||||
|
4/nnmm
|
||||||
|
5/nnmm
|
||||||
|
6/nnmm
|
||||||
|
7/nnmm
|
||||||
|
8/nnmm
|
||||||
|
9/nnmm
|
||||||
|
0th/pptt
|
||||||
|
1st/pp
|
||||||
|
1th/ttcc
|
||||||
|
2nd/pp
|
||||||
|
2th/ttcc
|
||||||
|
3rd/pp
|
||||||
|
3th/ttcc
|
||||||
|
4th/pptt
|
||||||
|
5th/pptt
|
||||||
|
6th/pptt
|
||||||
|
7th/pptt
|
||||||
|
8th/pptt
|
||||||
|
9th/pptt
|
|
@ -0,0 +1,29 @@
|
||||||
|
1st
|
||||||
|
2nd
|
||||||
|
3rd
|
||||||
|
4th
|
||||||
|
5th
|
||||||
|
6th
|
||||||
|
7th
|
||||||
|
8th
|
||||||
|
9th
|
||||||
|
10th
|
||||||
|
11th
|
||||||
|
12th
|
||||||
|
13th
|
||||||
|
14th
|
||||||
|
15th
|
||||||
|
16th
|
||||||
|
17th
|
||||||
|
18th
|
||||||
|
19th
|
||||||
|
20th
|
||||||
|
21st
|
||||||
|
22nd
|
||||||
|
23rd
|
||||||
|
24th
|
||||||
|
25th
|
||||||
|
100th
|
||||||
|
1000th
|
||||||
|
10001st
|
||||||
|
10011th
|
|
@ -0,0 +1,5 @@
|
||||||
|
1th
|
||||||
|
2th
|
||||||
|
3th
|
||||||
|
10001th
|
||||||
|
10011st
|
|
@ -0,0 +1,8 @@
|
||||||
|
# English ordinal numbers (parenthesized numerical flags)
|
||||||
|
FLAG num
|
||||||
|
WORDCHARS 0123456789
|
||||||
|
COMPOUNDMIN 1
|
||||||
|
ONLYINCOMPOUND 1000
|
||||||
|
COMPOUNDRULE 2
|
||||||
|
COMPOUNDRULE (1001)*(1002)(2001)
|
||||||
|
COMPOUNDRULE (1001)*(2002)(2000)
|
|
@ -0,0 +1,24 @@
|
||||||
|
22
|
||||||
|
0/1001,2002
|
||||||
|
1/1001,1002
|
||||||
|
2/1001,2002
|
||||||
|
3/1001,2002
|
||||||
|
4/1001,2002
|
||||||
|
5/1001,2002
|
||||||
|
6/1001,2002
|
||||||
|
7/1001,2002
|
||||||
|
8/1001,2002
|
||||||
|
9/1001,2002
|
||||||
|
0th/2000,2001
|
||||||
|
1st/2000
|
||||||
|
1th/2001,1000
|
||||||
|
2nd/2000
|
||||||
|
2th/2001,1000
|
||||||
|
3rd/2000
|
||||||
|
3th/2001,1000
|
||||||
|
4th/2000,2001
|
||||||
|
5th/2000,2001
|
||||||
|
6th/2000,2001
|
||||||
|
7th/2000,2001
|
||||||
|
8th/2000,2001
|
||||||
|
9th/2000,2001
|
|
@ -0,0 +1,29 @@
|
||||||
|
1st
|
||||||
|
2nd
|
||||||
|
3rd
|
||||||
|
4th
|
||||||
|
5th
|
||||||
|
6th
|
||||||
|
7th
|
||||||
|
8th
|
||||||
|
9th
|
||||||
|
10th
|
||||||
|
11th
|
||||||
|
12th
|
||||||
|
13th
|
||||||
|
14th
|
||||||
|
15th
|
||||||
|
16th
|
||||||
|
17th
|
||||||
|
18th
|
||||||
|
19th
|
||||||
|
20th
|
||||||
|
21st
|
||||||
|
22nd
|
||||||
|
23rd
|
||||||
|
24th
|
||||||
|
25th
|
||||||
|
100th
|
||||||
|
1000th
|
||||||
|
10001st
|
||||||
|
10011th
|
|
@ -0,0 +1,5 @@
|
||||||
|
1th
|
||||||
|
2th
|
||||||
|
3th
|
||||||
|
10001th
|
||||||
|
10011st
|
Loading…
Reference in New Issue