mirror of https://github.com/apache/lucene.git
LUCENE-9684: Hunspell: support COMPOUNDRULE (#2228)
This commit is contained in:
parent
cf5db8d651
commit
d7968130c3
|
@ -86,8 +86,8 @@ API Changes
|
|||
|
||||
Improvements
|
||||
|
||||
* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and
|
||||
BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)
|
||||
* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
|
||||
BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov)
|
||||
|
||||
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
|
||||
(Dawid Weiss)
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.util.List;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
||||
class CompoundRule {
|
||||
private final char[] data;
|
||||
private final Dictionary dictionary;
|
||||
|
||||
CompoundRule(String rule, Dictionary dictionary) {
|
||||
this.dictionary = dictionary;
|
||||
StringBuilder parsedFlags = new StringBuilder();
|
||||
int pos = 0;
|
||||
while (pos < rule.length()) {
|
||||
int lParen = rule.indexOf("(", pos);
|
||||
if (lParen < 0) {
|
||||
parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos)));
|
||||
break;
|
||||
}
|
||||
|
||||
parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos, lParen)));
|
||||
int rParen = rule.indexOf(')', lParen + 1);
|
||||
if (rParen < 0) {
|
||||
throw new IllegalArgumentException("Unmatched parentheses: " + rule);
|
||||
}
|
||||
|
||||
parsedFlags.append(
|
||||
dictionary.flagParsingStrategy.parseFlags(rule.substring(lParen + 1, rParen)));
|
||||
pos = rParen + 1;
|
||||
if (pos < rule.length() && (rule.charAt(pos) == '?' || rule.charAt(pos) == '*')) {
|
||||
parsedFlags.append(rule.charAt(pos++));
|
||||
}
|
||||
}
|
||||
data = parsedFlags.toString().toCharArray();
|
||||
}
|
||||
|
||||
boolean mayMatch(List<IntsRef> words, BytesRef scratch) {
|
||||
return match(words, 0, 0, scratch, false);
|
||||
}
|
||||
|
||||
boolean fullyMatches(List<IntsRef> words, BytesRef scratch) {
|
||||
return match(words, 0, 0, scratch, true);
|
||||
}
|
||||
|
||||
private boolean match(
|
||||
List<IntsRef> words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) {
|
||||
if (patternIndex >= data.length) {
|
||||
return wordIndex >= words.size();
|
||||
}
|
||||
if (wordIndex >= words.size() && !fully) {
|
||||
return true;
|
||||
}
|
||||
|
||||
char flag = data[patternIndex];
|
||||
if (patternIndex < data.length - 1 && data[patternIndex + 1] == '*') {
|
||||
int startWI = wordIndex;
|
||||
while (wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch)) {
|
||||
wordIndex++;
|
||||
}
|
||||
|
||||
while (wordIndex >= startWI) {
|
||||
if (match(words, patternIndex + 2, wordIndex, scratch, fully)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
wordIndex--;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
boolean currentWordMatches =
|
||||
wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch);
|
||||
|
||||
if (patternIndex < data.length - 1 && data[patternIndex + 1] == '?') {
|
||||
if (currentWordMatches && match(words, patternIndex + 2, wordIndex + 1, scratch, fully)) {
|
||||
return true;
|
||||
}
|
||||
return match(words, patternIndex + 2, wordIndex, scratch, fully);
|
||||
}
|
||||
|
||||
return currentWordMatches && match(words, patternIndex + 1, wordIndex + 1, scratch, fully);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new String(data);
|
||||
}
|
||||
}
|
|
@ -92,6 +92,8 @@ public class Dictionary {
|
|||
private static final String LANG_KEY = "LANG";
|
||||
private static final String BREAK_KEY = "BREAK";
|
||||
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
|
||||
private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
|
||||
private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
|
||||
private static final String KEEPCASE_KEY = "KEEPCASE";
|
||||
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
|
||||
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
|
||||
|
@ -136,7 +138,7 @@ public class Dictionary {
|
|||
static final int AFFIX_APPEND = 3;
|
||||
|
||||
// Default flag parsing strategy
|
||||
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
|
||||
FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy();
|
||||
|
||||
// AF entries
|
||||
private String[] aliases;
|
||||
|
@ -163,6 +165,8 @@ public class Dictionary {
|
|||
int needaffix = -1; // needaffix flag, or -1 if one is not defined
|
||||
int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
|
||||
int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
|
||||
int compoundMin = 3;
|
||||
List<CompoundRule> compoundRules; // nullable
|
||||
|
||||
// ignored characters (dictionary, affix, inputs)
|
||||
private char[] ignore;
|
||||
|
@ -419,6 +423,18 @@ public class Dictionary {
|
|||
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
|
||||
}
|
||||
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(COMPOUNDMIN_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
|
||||
}
|
||||
compoundMin = Math.max(1, Integer.parseInt(parts[1]));
|
||||
} else if (line.startsWith(COMPOUNDRULE_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
|
||||
}
|
||||
this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -442,6 +458,21 @@ public class Dictionary {
|
|||
stripOffsets[currentIndex] = currentOffset;
|
||||
}
|
||||
|
||||
private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
|
||||
throws IOException, ParseException {
|
||||
String line;
|
||||
List<CompoundRule> compoundRules = new ArrayList<>();
|
||||
for (int i = 0; i < num; i++) {
|
||||
line = reader.readLine();
|
||||
String[] parts = line.split("\\s+");
|
||||
if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
|
||||
throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
|
||||
}
|
||||
compoundRules.add(new CompoundRule(parts[1], this));
|
||||
}
|
||||
return compoundRules;
|
||||
}
|
||||
|
||||
private Breaks parseBreaks(LineNumberReader reader, String line)
|
||||
throws IOException, ParseException {
|
||||
Set<String> starting = new LinkedHashSet<>();
|
||||
|
@ -910,7 +941,7 @@ public class Dictionary {
|
|||
reuse.append(caseFold(word.charAt(i)));
|
||||
}
|
||||
reuse.append(FLAG_SEPARATOR);
|
||||
reuse.append(HIDDEN_FLAG);
|
||||
flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
|
||||
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
||||
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
@ -1188,16 +1219,19 @@ public class Dictionary {
|
|||
return null;
|
||||
}
|
||||
|
||||
boolean isForbiddenWord(char[] word, BytesRef scratch) {
|
||||
boolean isForbiddenWord(char[] word, int length, BytesRef scratch) {
|
||||
if (forbiddenword != -1) {
|
||||
IntsRef forms = lookupWord(word, 0, word.length);
|
||||
if (forms != null) {
|
||||
int formStep = formStep();
|
||||
for (int i = 0; i < forms.length; i += formStep) {
|
||||
if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
IntsRef forms = lookupWord(word, 0, length);
|
||||
return forms != null && hasFlag(forms, (char) forbiddenword, scratch);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) {
|
||||
int formStep = formStep();
|
||||
for (int i = 0; i < forms.length; i += formStep) {
|
||||
if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
|
@ -1227,6 +1261,8 @@ public class Dictionary {
|
|||
* @return Parsed flags
|
||||
*/
|
||||
abstract char[] parseFlags(String rawFlags);
|
||||
|
||||
abstract void appendFlag(char flag, StringBuilder to);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1238,6 +1274,11 @@ public class Dictionary {
|
|||
public char[] parseFlags(String rawFlags) {
|
||||
return rawFlags.toCharArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
void appendFlag(char flag, StringBuilder to) {
|
||||
to.append(flag);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1266,6 +1307,14 @@ public class Dictionary {
|
|||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
@Override
|
||||
void appendFlag(char flag, StringBuilder to) {
|
||||
if (to.length() > 0) {
|
||||
to.append(",");
|
||||
}
|
||||
to.append((int) flag);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1300,6 +1349,16 @@ public class Dictionary {
|
|||
builder.getChars(0, builder.length(), flags, 0);
|
||||
return flags;
|
||||
}
|
||||
|
||||
@Override
|
||||
void appendFlag(char flag, StringBuilder to) {
|
||||
to.append((char) (flag >> 8));
|
||||
to.append((char) (flag & 0xff));
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasCompounding() {
|
||||
return compoundRules != null;
|
||||
}
|
||||
|
||||
boolean hasFlag(int entryId, char flag, BytesRef scratch) {
|
||||
|
|
|
@ -16,7 +16,10 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
||||
/**
|
||||
* A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
|
||||
|
@ -37,26 +40,100 @@ public class SpellChecker {
|
|||
public boolean spell(String word) {
|
||||
if (word.isEmpty()) return true;
|
||||
|
||||
char[] wordChars = word.toCharArray();
|
||||
if (dictionary.isForbiddenWord(wordChars, scratch)) {
|
||||
return false;
|
||||
if (dictionary.needsInputCleaning) {
|
||||
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||
}
|
||||
|
||||
if (isNumber(word)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
|
||||
char[] wordChars = word.toCharArray();
|
||||
if (checkWord(wordChars, wordChars.length, false)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
|
||||
WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
|
||||
if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (dictionary.breaks.isNotEmpty()
|
||||
&& !hasTooManyBreakOccurrences(word)
|
||||
&& !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) {
|
||||
return tryBreaks(word);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
|
||||
char[] caseVariant = wordChars;
|
||||
if (wordCase == WordCase.UPPER) {
|
||||
caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
|
||||
if (checkWord(caseVariant, wordChars.length, true)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
|
||||
}
|
||||
|
||||
private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
|
||||
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (dictionary.hasCompounding()) {
|
||||
return checkCompounds(wordChars, 0, length, new ArrayList<>());
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
|
||||
if (words.size() >= 100) return false;
|
||||
|
||||
int limit = length - dictionary.compoundMin + 1;
|
||||
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
|
||||
IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos);
|
||||
if (forms != null) {
|
||||
words.add(forms);
|
||||
|
||||
if (dictionary.compoundRules != null
|
||||
&& dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words, scratch))) {
|
||||
if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
words.remove(words.size() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkLastCompoundPart(
|
||||
char[] wordChars, int start, int length, List<IntsRef> words) {
|
||||
IntsRef forms = dictionary.lookupWord(wordChars, start, length);
|
||||
if (forms == null) return false;
|
||||
|
||||
words.add(forms);
|
||||
boolean result =
|
||||
dictionary.compoundRules != null
|
||||
&& dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
|
||||
words.remove(words.size() - 1);
|
||||
return result;
|
||||
}
|
||||
|
||||
private static boolean isNumber(String s) {
|
||||
int i = 0;
|
||||
while (i < s.length()) {
|
||||
|
|
|
@ -112,8 +112,8 @@ final class Stemmer {
|
|||
private char[] titleBuffer = new char[8];
|
||||
|
||||
/** returns EXACT_CASE, TITLE_CASE, or UPPER_CASE type for the word */
|
||||
private WordCase caseOf(char[] word, int length) {
|
||||
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
|
||||
WordCase caseOf(char[] word, int length) {
|
||||
if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
|
||||
return WordCase.MIXED;
|
||||
}
|
||||
|
||||
|
@ -121,22 +121,24 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
/** folds titlecase variant of word to titleBuffer */
|
||||
private void caseFoldTitle(char[] word, int length) {
|
||||
char[] caseFoldTitle(char[] word, int length) {
|
||||
titleBuffer = ArrayUtil.grow(titleBuffer, length);
|
||||
System.arraycopy(word, 0, titleBuffer, 0, length);
|
||||
for (int i = 1; i < length; i++) {
|
||||
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
|
||||
}
|
||||
return titleBuffer;
|
||||
}
|
||||
|
||||
/** folds lowercase variant of word (title cased) to lowerBuffer */
|
||||
private void caseFoldLower(char[] word, int length) {
|
||||
char[] caseFoldLower(char[] word, int length) {
|
||||
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
|
||||
System.arraycopy(word, 0, lowerBuffer, 0, length);
|
||||
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
|
||||
return lowerBuffer;
|
||||
}
|
||||
|
||||
private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
|
||||
List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
IntsRef forms = dictionary.lookupWord(word, 0, length);
|
||||
if (forms != null) {
|
||||
|
|
|
@ -23,7 +23,7 @@ enum WordCase {
|
|||
MIXED;
|
||||
|
||||
static WordCase caseOf(char[] word, int length) {
|
||||
boolean capitalized = Character.isUpperCase(word[0]);
|
||||
boolean startsWithLower = Character.isLowerCase(word[0]);
|
||||
|
||||
boolean seenUpper = false;
|
||||
boolean seenLower = false;
|
||||
|
@ -34,11 +34,11 @@ enum WordCase {
|
|||
if (seenUpper && seenLower) break;
|
||||
}
|
||||
|
||||
return get(capitalized, seenUpper, seenLower);
|
||||
return get(startsWithLower, seenUpper, seenLower);
|
||||
}
|
||||
|
||||
static WordCase caseOf(CharSequence word, int length) {
|
||||
boolean capitalized = Character.isUpperCase(word.charAt(0));
|
||||
boolean startsWithLower = Character.isLowerCase(word.charAt(0));
|
||||
|
||||
boolean seenUpper = false;
|
||||
boolean seenLower = false;
|
||||
|
@ -49,11 +49,11 @@ enum WordCase {
|
|||
if (seenUpper && seenLower) break;
|
||||
}
|
||||
|
||||
return get(capitalized, seenUpper, seenLower);
|
||||
return get(startsWithLower, seenUpper, seenLower);
|
||||
}
|
||||
|
||||
private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) {
|
||||
if (capitalized) {
|
||||
private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) {
|
||||
if (!startsWithLower) {
|
||||
return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED;
|
||||
}
|
||||
return seenUpper ? MIXED : LOWER;
|
||||
|
|
|
@ -43,6 +43,38 @@ public class SpellCheckerTest extends StemmerTestBase {
|
|||
doTest("breakoff");
|
||||
}
|
||||
|
||||
public void testCompoundrule() throws Exception {
|
||||
doTest("compoundrule");
|
||||
}
|
||||
|
||||
public void testCompoundrule2() throws Exception {
|
||||
doTest("compoundrule2");
|
||||
}
|
||||
|
||||
public void testCompoundrule3() throws Exception {
|
||||
doTest("compoundrule3");
|
||||
}
|
||||
|
||||
public void testCompoundrule4() throws Exception {
|
||||
doTest("compoundrule4");
|
||||
}
|
||||
|
||||
public void testCompoundrule5() throws Exception {
|
||||
doTest("compoundrule5");
|
||||
}
|
||||
|
||||
public void testCompoundrule6() throws Exception {
|
||||
doTest("compoundrule6");
|
||||
}
|
||||
|
||||
public void testCompoundrule7() throws Exception {
|
||||
doTest("compoundrule7");
|
||||
}
|
||||
|
||||
public void testCompoundrule8() throws Exception {
|
||||
doTest("compoundrule8");
|
||||
}
|
||||
|
||||
protected void doTest(String name) throws Exception {
|
||||
InputStream affixStream =
|
||||
Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
@ -33,6 +34,7 @@ import org.apache.lucene.util.fst.FST;
|
|||
import org.apache.lucene.util.fst.FSTCompiler;
|
||||
import org.apache.lucene.util.fst.Outputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestDictionary extends LuceneTestCase {
|
||||
|
||||
|
@ -268,6 +270,27 @@ public class TestDictionary extends LuceneTestCase {
|
|||
assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFlagSerialization() {
|
||||
Random r = random();
|
||||
char[] flags = new char[r.nextInt(10)];
|
||||
for (int i = 0; i < flags.length; i++) {
|
||||
flags[i] = (char) r.nextInt(Character.MAX_VALUE);
|
||||
}
|
||||
|
||||
String[] flagLines = {"FLAG long", "FLAG UTF-8", "FLAG num"};
|
||||
for (String flagLine : flagLines) {
|
||||
Dictionary.FlagParsingStrategy strategy = Dictionary.getFlagParsingStrategy(flagLine);
|
||||
StringBuilder serialized = new StringBuilder();
|
||||
for (char flag : flags) {
|
||||
strategy.appendFlag(flag, serialized);
|
||||
}
|
||||
|
||||
char[] deserialized = strategy.parseFlags(serialized.toString());
|
||||
assertEquals(new String(flags), new String(deserialized));
|
||||
}
|
||||
}
|
||||
|
||||
private Directory getDirectory() {
|
||||
return newDirectory();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
COMPOUNDMIN 1
|
||||
COMPOUNDRULE 1
|
||||
COMPOUNDRULE ABC
|
|
@ -0,0 +1,5 @@
|
|||
3
|
||||
a/A
|
||||
b/B
|
||||
c/BC
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
abc
|
||||
acc
|
|
@ -0,0 +1,39 @@
|
|||
ba
|
||||
aaabaaa
|
||||
bbaaa
|
||||
aaaaba
|
||||
bbbbbaa
|
||||
aa
|
||||
aaa
|
||||
aaaa
|
||||
ab
|
||||
aab
|
||||
aaab
|
||||
aaaab
|
||||
abb
|
||||
aabb
|
||||
aaabbb
|
||||
bb
|
||||
bbb
|
||||
bbbb
|
||||
aaab
|
||||
abcc
|
||||
abbc
|
||||
abbcc
|
||||
aabc
|
||||
aabcc
|
||||
aabbc
|
||||
aabbcc
|
||||
aaabbbccc
|
||||
ac
|
||||
aac
|
||||
aacc
|
||||
aaaccc
|
||||
bc
|
||||
bcc
|
||||
bbc
|
||||
bbcc
|
||||
bbbccc
|
||||
cc
|
||||
ccc
|
||||
cccccc
|
|
@ -0,0 +1,3 @@
|
|||
COMPOUNDMIN 1
|
||||
COMPOUNDRULE 1
|
||||
COMPOUNDRULE A*B*C*
|
|
@ -0,0 +1,5 @@
|
|||
3
|
||||
a/A
|
||||
b/B
|
||||
c/C
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
aa
|
||||
aaa
|
||||
aaaa
|
||||
ab
|
||||
aab
|
||||
aaab
|
||||
aaaab
|
||||
abb
|
||||
aabb
|
||||
aaabbb
|
||||
bb
|
||||
bbb
|
||||
bbbb
|
||||
aaab
|
||||
abc
|
||||
abcc
|
||||
abbc
|
||||
abbcc
|
||||
aabc
|
||||
aabcc
|
||||
aabbc
|
||||
aabbcc
|
||||
aaabbbccc
|
||||
ac
|
||||
acc
|
||||
aac
|
||||
aacc
|
||||
aaaccc
|
||||
bc
|
||||
bcc
|
||||
bbc
|
||||
bbcc
|
||||
bbbccc
|
||||
cc
|
||||
ccc
|
||||
cccccc
|
||||
abcc
|
|
@ -0,0 +1,8 @@
|
|||
ba
|
||||
aaabaaa
|
||||
bbaaa
|
||||
aaaaba
|
||||
bbbbbaa
|
||||
cba
|
||||
cab
|
||||
acb
|
|
@ -0,0 +1,3 @@
|
|||
COMPOUNDMIN 1
|
||||
COMPOUNDRULE 1
|
||||
COMPOUNDRULE A?B?C?
|
|
@ -0,0 +1,5 @@
|
|||
3
|
||||
a/A
|
||||
b/B
|
||||
c/C
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
a
|
||||
b
|
||||
c
|
||||
ab
|
||||
abc
|
||||
ac
|
||||
bc
|
|
@ -0,0 +1,41 @@
|
|||
aa
|
||||
aaa
|
||||
aaaa
|
||||
aab
|
||||
aaab
|
||||
aaaab
|
||||
abb
|
||||
aabb
|
||||
aaabbb
|
||||
bb
|
||||
bbb
|
||||
bbbb
|
||||
aaab
|
||||
abcc
|
||||
abbc
|
||||
abbcc
|
||||
aabc
|
||||
aabcc
|
||||
aabbc
|
||||
aabbcc
|
||||
aaabbbccc
|
||||
acc
|
||||
aac
|
||||
aacc
|
||||
aaaccc
|
||||
bcc
|
||||
bbc
|
||||
bbcc
|
||||
bbbccc
|
||||
cc
|
||||
ccc
|
||||
cccccc
|
||||
abcc
|
||||
ba
|
||||
aaabaaa
|
||||
bbaaa
|
||||
aaaaba
|
||||
bbbbbaa
|
||||
cba
|
||||
cab
|
||||
acb
|
|
@ -0,0 +1,7 @@
|
|||
# English ordinal numbers
|
||||
WORDCHARS 0123456789
|
||||
COMPOUNDMIN 1
|
||||
ONLYINCOMPOUND c
|
||||
COMPOUNDRULE 2
|
||||
COMPOUNDRULE n*1t
|
||||
COMPOUNDRULE n*mp
|
|
@ -0,0 +1,24 @@
|
|||
22
|
||||
0/nm
|
||||
1/n1
|
||||
2/nm
|
||||
3/nm
|
||||
4/nm
|
||||
5/nm
|
||||
6/nm
|
||||
7/nm
|
||||
8/nm
|
||||
9/nm
|
||||
0th/pt
|
||||
1st/p
|
||||
1th/tc
|
||||
2nd/p
|
||||
2th/tc
|
||||
3rd/p
|
||||
3th/tc
|
||||
4th/pt
|
||||
5th/pt
|
||||
6th/pt
|
||||
7th/pt
|
||||
8th/pt
|
||||
9th/pt
|
|
@ -0,0 +1,31 @@
|
|||
1st
|
||||
2nd
|
||||
3rd
|
||||
4th
|
||||
5th
|
||||
6th
|
||||
7th
|
||||
8th
|
||||
9th
|
||||
10th
|
||||
11th
|
||||
12th
|
||||
13th
|
||||
14th
|
||||
15th
|
||||
16th
|
||||
17th
|
||||
18th
|
||||
19th
|
||||
20th
|
||||
21st
|
||||
22nd
|
||||
23rd
|
||||
24th
|
||||
25th
|
||||
100th
|
||||
1000th
|
||||
10001st
|
||||
10011th
|
||||
1ST
|
||||
42ND
|
|
@ -0,0 +1,5 @@
|
|||
1th
|
||||
2th
|
||||
3th
|
||||
10001th
|
||||
10011st
|
|
@ -0,0 +1,7 @@
|
|||
# number + percent
|
||||
SET UTF-8
|
||||
COMPOUNDMIN 1
|
||||
COMPOUNDRULE 2
|
||||
COMPOUNDRULE N*%?
|
||||
COMPOUNDRULE NN*.NN*%?
|
||||
WORDCHARS 0123456789‰.
|
|
@ -0,0 +1,14 @@
|
|||
13
|
||||
0/N po:num
|
||||
1/N po:num
|
||||
2/N po:num
|
||||
3/N po:num
|
||||
4/N po:num
|
||||
5/N po:num
|
||||
6/N po:num
|
||||
7/N po:num
|
||||
8/N po:num
|
||||
9/N po:num
|
||||
./. po:sign_dot
|
||||
%/% po:sign_percent
|
||||
‰/% po:sign_per_mille
|
|
@ -0,0 +1,7 @@
|
|||
10%
|
||||
0.2%
|
||||
0.20%
|
||||
123.4561‰
|
||||
10
|
||||
0000
|
||||
10.25
|
|
@ -0,0 +1 @@
|
|||
.25
|
|
@ -0,0 +1,4 @@
|
|||
COMPOUNDMIN 1
|
||||
COMPOUNDRULE 2
|
||||
COMPOUNDRULE A*A
|
||||
COMPOUNDRULE A*AAB*BBBC*C
|
|
@ -0,0 +1,5 @@
|
|||
3
|
||||
a/A
|
||||
b/B
|
||||
c/C
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
aa
|
||||
aaaaaa
|
||||
aabbbc
|
||||
aaaaabbbbbbcccccc
|
|
@ -0,0 +1,4 @@
|
|||
abc
|
||||
abbbbbccccccc
|
||||
aabbccccccc
|
||||
aabbbbbbb
|
|
@ -0,0 +1,8 @@
|
|||
# English ordinal numbers (parenthesized long flags)
|
||||
FLAG long
|
||||
WORDCHARS 0123456789
|
||||
COMPOUNDMIN 1
|
||||
ONLYINCOMPOUND cc
|
||||
COMPOUNDRULE 2
|
||||
COMPOUNDRULE (nn)*(11)(tt)
|
||||
COMPOUNDRULE (nn)*(mm)(pp)
|
|
@ -0,0 +1,24 @@
|
|||
22
|
||||
0/nnmm
|
||||
1/nn11
|
||||
2/nnmm
|
||||
3/nnmm
|
||||
4/nnmm
|
||||
5/nnmm
|
||||
6/nnmm
|
||||
7/nnmm
|
||||
8/nnmm
|
||||
9/nnmm
|
||||
0th/pptt
|
||||
1st/pp
|
||||
1th/ttcc
|
||||
2nd/pp
|
||||
2th/ttcc
|
||||
3rd/pp
|
||||
3th/ttcc
|
||||
4th/pptt
|
||||
5th/pptt
|
||||
6th/pptt
|
||||
7th/pptt
|
||||
8th/pptt
|
||||
9th/pptt
|
|
@ -0,0 +1,29 @@
|
|||
1st
|
||||
2nd
|
||||
3rd
|
||||
4th
|
||||
5th
|
||||
6th
|
||||
7th
|
||||
8th
|
||||
9th
|
||||
10th
|
||||
11th
|
||||
12th
|
||||
13th
|
||||
14th
|
||||
15th
|
||||
16th
|
||||
17th
|
||||
18th
|
||||
19th
|
||||
20th
|
||||
21st
|
||||
22nd
|
||||
23rd
|
||||
24th
|
||||
25th
|
||||
100th
|
||||
1000th
|
||||
10001st
|
||||
10011th
|
|
@ -0,0 +1,5 @@
|
|||
1th
|
||||
2th
|
||||
3th
|
||||
10001th
|
||||
10011st
|
|
@ -0,0 +1,8 @@
|
|||
# English ordinal numbers (parenthesized numerical flags)
|
||||
FLAG num
|
||||
WORDCHARS 0123456789
|
||||
COMPOUNDMIN 1
|
||||
ONLYINCOMPOUND 1000
|
||||
COMPOUNDRULE 2
|
||||
COMPOUNDRULE (1001)*(1002)(2001)
|
||||
COMPOUNDRULE (1001)*(2002)(2000)
|
|
@ -0,0 +1,24 @@
|
|||
22
|
||||
0/1001,2002
|
||||
1/1001,1002
|
||||
2/1001,2002
|
||||
3/1001,2002
|
||||
4/1001,2002
|
||||
5/1001,2002
|
||||
6/1001,2002
|
||||
7/1001,2002
|
||||
8/1001,2002
|
||||
9/1001,2002
|
||||
0th/2000,2001
|
||||
1st/2000
|
||||
1th/2001,1000
|
||||
2nd/2000
|
||||
2th/2001,1000
|
||||
3rd/2000
|
||||
3th/2001,1000
|
||||
4th/2000,2001
|
||||
5th/2000,2001
|
||||
6th/2000,2001
|
||||
7th/2000,2001
|
||||
8th/2000,2001
|
||||
9th/2000,2001
|
|
@ -0,0 +1,29 @@
|
|||
1st
|
||||
2nd
|
||||
3rd
|
||||
4th
|
||||
5th
|
||||
6th
|
||||
7th
|
||||
8th
|
||||
9th
|
||||
10th
|
||||
11th
|
||||
12th
|
||||
13th
|
||||
14th
|
||||
15th
|
||||
16th
|
||||
17th
|
||||
18th
|
||||
19th
|
||||
20th
|
||||
21st
|
||||
22nd
|
||||
23rd
|
||||
24th
|
||||
25th
|
||||
100th
|
||||
1000th
|
||||
10001st
|
||||
10011th
|
|
@ -0,0 +1,5 @@
|
|||
1th
|
||||
2th
|
||||
3th
|
||||
10001th
|
||||
10011st
|
Loading…
Reference in New Issue