diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fce70e979ee..f99553b963f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -86,8 +86,8 @@ API Changes Improvements -* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and - BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov) +* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and + BREAK/FORBIDDENWORD/COMPOUNDRULE affix rules, improve stemming of all-caps words (Peter Gromov) * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). (Dawid Weiss) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java new file mode 100644 index 00000000000..0f89de80e24 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CompoundRule.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.util.List; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +class CompoundRule { + private final char[] data; + private final Dictionary dictionary; + + CompoundRule(String rule, Dictionary dictionary) { + this.dictionary = dictionary; + StringBuilder parsedFlags = new StringBuilder(); + int pos = 0; + while (pos < rule.length()) { + int lParen = rule.indexOf("(", pos); + if (lParen < 0) { + parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos))); + break; + } + + parsedFlags.append(dictionary.flagParsingStrategy.parseFlags(rule.substring(pos, lParen))); + int rParen = rule.indexOf(')', lParen + 1); + if (rParen < 0) { + throw new IllegalArgumentException("Unmatched parentheses: " + rule); + } + + parsedFlags.append( + dictionary.flagParsingStrategy.parseFlags(rule.substring(lParen + 1, rParen))); + pos = rParen + 1; + if (pos < rule.length() && (rule.charAt(pos) == '?' || rule.charAt(pos) == '*')) { + parsedFlags.append(rule.charAt(pos++)); + } + } + data = parsedFlags.toString().toCharArray(); + } + + boolean mayMatch(List words, BytesRef scratch) { + return match(words, 0, 0, scratch, false); + } + + boolean fullyMatches(List words, BytesRef scratch) { + return match(words, 0, 0, scratch, true); + } + + private boolean match( + List words, int patternIndex, int wordIndex, BytesRef scratch, boolean fully) { + if (patternIndex >= data.length) { + return wordIndex >= words.size(); + } + if (wordIndex >= words.size() && !fully) { + return true; + } + + char flag = data[patternIndex]; + if (patternIndex < data.length - 1 && data[patternIndex + 1] == '*') { + int startWI = wordIndex; + while (wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch)) { + wordIndex++; + } + + while (wordIndex >= startWI) { + if (match(words, patternIndex + 2, wordIndex, scratch, fully)) { + return true; + } + + wordIndex--; + } + return false; + } + + boolean currentWordMatches = + wordIndex < words.size() && dictionary.hasFlag(words.get(wordIndex), flag, scratch); + + if (patternIndex < data.length - 1 && data[patternIndex + 1] == '?') { + if (currentWordMatches && match(words, patternIndex + 2, wordIndex + 1, scratch, fully)) { + return true; + } + return match(words, patternIndex + 2, wordIndex, scratch, fully); + } + + return currentWordMatches && match(words, patternIndex + 1, wordIndex + 1, scratch, fully); + } + + @Override + public String toString() { + return new String(data); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 19cfaa36490..2c620a2f8bd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -92,6 +92,8 @@ public class Dictionary { private static final String LANG_KEY = "LANG"; private static final String BREAK_KEY = "BREAK"; private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD"; + private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN"; + private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE"; private static final String KEEPCASE_KEY = "KEEPCASE"; private static final String NEEDAFFIX_KEY = "NEEDAFFIX"; private static final String PSEUDOROOT_KEY = "PSEUDOROOT"; @@ -136,7 +138,7 @@ public class Dictionary { static final int AFFIX_APPEND = 3; // Default flag parsing strategy - private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); + FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // AF entries private String[] aliases; @@ -163,6 +165,8 @@ public class Dictionary { int needaffix = -1; // needaffix flag, or -1 if one is not defined int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined + int compoundMin = 3; + List compoundRules; // nullable // ignored characters (dictionary, affix, inputs) private char[] ignore; @@ -419,6 +423,18 @@ public class Dictionary { throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber()); } forbiddenword = flagParsingStrategy.parseFlag(parts[1]); + } else if (line.startsWith(COMPOUNDMIN_KEY)) { + String[] parts = line.split("\\s+"); + if (parts.length != 2) { + throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber()); + } + compoundMin = Math.max(1, Integer.parseInt(parts[1])); + } else if (line.startsWith(COMPOUNDRULE_KEY)) { + String[] parts = line.split("\\s+"); + if (parts.length != 2) { + throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber()); + } + this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1])); } } @@ -442,6 +458,21 @@ public class Dictionary { stripOffsets[currentIndex] = currentOffset; } + private List parseCompoundRules(LineNumberReader reader, int num) + throws IOException, ParseException { + String line; + List compoundRules = new ArrayList<>(); + for (int i = 0; i < num; i++) { + line = reader.readLine(); + String[] parts = line.split("\\s+"); + if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) { + throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber()); + } + compoundRules.add(new CompoundRule(parts[1], this)); + } + return compoundRules; + } + private Breaks parseBreaks(LineNumberReader reader, String line) throws IOException, ParseException { Set starting = new LinkedHashSet<>(); @@ -910,7 +941,7 @@ public class Dictionary { reuse.append(caseFold(word.charAt(i))); } reuse.append(FLAG_SEPARATOR); - reuse.append(HIDDEN_FLAG); + flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse); reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length()); writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8)); } @@ -1188,16 +1219,19 @@ public class Dictionary { return null; } - boolean isForbiddenWord(char[] word, BytesRef scratch) { + boolean isForbiddenWord(char[] word, int length, BytesRef scratch) { if (forbiddenword != -1) { - IntsRef forms = lookupWord(word, 0, word.length); - if (forms != null) { - int formStep = formStep(); - for (int i = 0; i < forms.length; i += formStep) { - if (hasFlag(forms.ints[forms.offset + i], (char) forbiddenword, scratch)) { - return true; - } - } + IntsRef forms = lookupWord(word, 0, length); + return forms != null && hasFlag(forms, (char) forbiddenword, scratch); + } + return false; + } + + boolean hasFlag(IntsRef forms, char flag, BytesRef scratch) { + int formStep = formStep(); + for (int i = 0; i < forms.length; i += formStep) { + if (hasFlag(forms.ints[forms.offset + i], flag, scratch)) { + return true; } } return false; @@ -1227,6 +1261,8 @@ public class Dictionary { * @return Parsed flags */ abstract char[] parseFlags(String rawFlags); + + abstract void appendFlag(char flag, StringBuilder to); } /** @@ -1238,6 +1274,11 @@ public class Dictionary { public char[] parseFlags(String rawFlags) { return rawFlags.toCharArray(); } + + @Override + void appendFlag(char flag, StringBuilder to) { + to.append(flag); + } } /** @@ -1266,6 +1307,14 @@ public class Dictionary { } return flags; } + + @Override + void appendFlag(char flag, StringBuilder to) { + if (to.length() > 0) { + to.append(","); + } + to.append((int) flag); + } } /** @@ -1300,6 +1349,16 @@ public class Dictionary { builder.getChars(0, builder.length(), flags, 0); return flags; } + + @Override + void appendFlag(char flag, StringBuilder to) { + to.append((char) (flag >> 8)); + to.append((char) (flag & 0xff)); + } + } + + boolean hasCompounding() { + return compoundRules != null; } boolean hasFlag(int entryId, char flag, BytesRef scratch) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java index a3e765b9477..66e21a16527 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java @@ -16,7 +16,10 @@ */ package org.apache.lucene.analysis.hunspell; +import java.util.ArrayList; +import java.util.List; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /** * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe @@ -37,26 +40,100 @@ public class SpellChecker { public boolean spell(String word) { if (word.isEmpty()) return true; - char[] wordChars = word.toCharArray(); - if (dictionary.isForbiddenWord(wordChars, scratch)) { - return false; + if (dictionary.needsInputCleaning) { + word = dictionary.cleanInput(word, new StringBuilder()).toString(); } if (isNumber(word)) { return true; } - if (!stemmer.stem(wordChars, word.length()).isEmpty()) { + char[] wordChars = word.toCharArray(); + if (checkWord(wordChars, wordChars.length, false)) { return true; } - if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) { + WordCase wc = stemmer.caseOf(wordChars, wordChars.length); + if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) { + return true; + } + + if (dictionary.breaks.isNotEmpty() + && !hasTooManyBreakOccurrences(word) + && !dictionary.isForbiddenWord(wordChars, word.length(), scratch)) { return tryBreaks(word); } return false; } + private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) { + char[] caseVariant = wordChars; + if (wordCase == WordCase.UPPER) { + caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length); + if (checkWord(caseVariant, wordChars.length, true)) { + return true; + } + } + return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true); + } + + private boolean checkWord(char[] wordChars, int length, boolean caseVariant) { + if (dictionary.isForbiddenWord(wordChars, length, scratch)) { + return false; + } + + if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) { + return true; + } + + if (dictionary.hasCompounding()) { + return checkCompounds(wordChars, 0, length, new ArrayList<>()); + } + + return false; + } + + private boolean checkCompounds(char[] wordChars, int offset, int length, List words) { + if (words.size() >= 100) return false; + + int limit = length - dictionary.compoundMin + 1; + for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) { + IntsRef forms = dictionary.lookupWord(wordChars, offset, breakPos); + if (forms != null) { + words.add(forms); + + if (dictionary.compoundRules != null + && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words, scratch))) { + if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) { + return true; + } + + if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) { + return true; + } + } + + words.remove(words.size() - 1); + } + } + + return false; + } + + private boolean checkLastCompoundPart( + char[] wordChars, int start, int length, List words) { + IntsRef forms = dictionary.lookupWord(wordChars, start, length); + if (forms == null) return false; + + words.add(forms); + boolean result = + dictionary.compoundRules != null + && dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch)); + words.remove(words.size() - 1); + return result; + } + private static boolean isNumber(String s) { int i = 0; while (i < s.length()) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 1355627ad76..3bb46a7a961 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -112,8 +112,8 @@ final class Stemmer { private char[] titleBuffer = new char[8]; /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */ - private WordCase caseOf(char[] word, int length) { - if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) { + WordCase caseOf(char[] word, int length) { + if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) { return WordCase.MIXED; } @@ -121,22 +121,24 @@ final class Stemmer { } /** folds titlecase variant of word to titleBuffer */ - private void caseFoldTitle(char[] word, int length) { + char[] caseFoldTitle(char[] word, int length) { titleBuffer = ArrayUtil.grow(titleBuffer, length); System.arraycopy(word, 0, titleBuffer, 0, length); for (int i = 1; i < length; i++) { titleBuffer[i] = dictionary.caseFold(titleBuffer[i]); } + return titleBuffer; } /** folds lowercase variant of word (title cased) to lowerBuffer */ - private void caseFoldLower(char[] word, int length) { + char[] caseFoldLower(char[] word, int length) { lowerBuffer = ArrayUtil.grow(lowerBuffer, length); System.arraycopy(word, 0, lowerBuffer, 0, length); lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]); + return lowerBuffer; } - private List doStem(char[] word, int length, boolean caseVariant) { + List doStem(char[] word, int length, boolean caseVariant) { List stems = new ArrayList<>(); IntsRef forms = dictionary.lookupWord(word, 0, length); if (forms != null) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java index 7d9e2e75873..04adf7a9ae9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java @@ -23,7 +23,7 @@ enum WordCase { MIXED; static WordCase caseOf(char[] word, int length) { - boolean capitalized = Character.isUpperCase(word[0]); + boolean startsWithLower = Character.isLowerCase(word[0]); boolean seenUpper = false; boolean seenLower = false; @@ -34,11 +34,11 @@ enum WordCase { if (seenUpper && seenLower) break; } - return get(capitalized, seenUpper, seenLower); + return get(startsWithLower, seenUpper, seenLower); } static WordCase caseOf(CharSequence word, int length) { - boolean capitalized = Character.isUpperCase(word.charAt(0)); + boolean startsWithLower = Character.isLowerCase(word.charAt(0)); boolean seenUpper = false; boolean seenLower = false; @@ -49,11 +49,11 @@ enum WordCase { if (seenUpper && seenLower) break; } - return get(capitalized, seenUpper, seenLower); + return get(startsWithLower, seenUpper, seenLower); } - private static WordCase get(boolean capitalized, boolean seenUpper, boolean seenLower) { - if (capitalized) { + private static WordCase get(boolean startsWithLower, boolean seenUpper, boolean seenLower) { + if (!startsWithLower) { return !seenLower ? UPPER : !seenUpper ? TITLE : MIXED; } return seenUpper ? MIXED : LOWER; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java index a478dda09bf..cfa1719f18a 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java @@ -43,6 +43,38 @@ public class SpellCheckerTest extends StemmerTestBase { doTest("breakoff"); } + public void testCompoundrule() throws Exception { + doTest("compoundrule"); + } + + public void testCompoundrule2() throws Exception { + doTest("compoundrule2"); + } + + public void testCompoundrule3() throws Exception { + doTest("compoundrule3"); + } + + public void testCompoundrule4() throws Exception { + doTest("compoundrule4"); + } + + public void testCompoundrule5() throws Exception { + doTest("compoundrule5"); + } + + public void testCompoundrule6() throws Exception { + doTest("compoundrule6"); + } + + public void testCompoundrule7() throws Exception { + doTest("compoundrule7"); + } + + public void testCompoundrule8() throws Exception { + doTest("compoundrule8"); + } + protected void doTest(String name) throws Exception { InputStream affixStream = Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index 5e8fdff5341..a0ece7809e3 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.text.ParseException; +import java.util.Random; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; @@ -33,6 +34,7 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Util; +import org.junit.Test; public class TestDictionary extends LuceneTestCase { @@ -268,6 +270,27 @@ public class TestDictionary extends LuceneTestCase { assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8")); } + @Test + public void testFlagSerialization() { + Random r = random(); + char[] flags = new char[r.nextInt(10)]; + for (int i = 0; i < flags.length; i++) { + flags[i] = (char) r.nextInt(Character.MAX_VALUE); + } + + String[] flagLines = {"FLAG long", "FLAG UTF-8", "FLAG num"}; + for (String flagLine : flagLines) { + Dictionary.FlagParsingStrategy strategy = Dictionary.getFlagParsingStrategy(flagLine); + StringBuilder serialized = new StringBuilder(); + for (char flag : flags) { + strategy.appendFlag(flag, serialized); + } + + char[] deserialized = strategy.parseFlags(serialized.toString()); + assertEquals(new String(flags), new String(deserialized)); + } + } + private Directory getDirectory() { return newDirectory(); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff new file mode 100644 index 00000000000..09309e0aabc --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.aff @@ -0,0 +1,3 @@ +COMPOUNDMIN 1 +COMPOUNDRULE 1 +COMPOUNDRULE ABC diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic new file mode 100644 index 00000000000..b11e8291e67 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.dic @@ -0,0 +1,5 @@ +3 +a/A +b/B +c/BC + diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good new file mode 100644 index 00000000000..c7a0763bb1d --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.good @@ -0,0 +1,2 @@ +abc +acc diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong new file mode 100644 index 00000000000..bc151ea0293 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule.wrong @@ -0,0 +1,39 @@ +ba +aaabaaa +bbaaa +aaaaba +bbbbbaa +aa +aaa +aaaa +ab +aab +aaab +aaaab +abb +aabb +aaabbb +bb +bbb +bbbb +aaab +abcc +abbc +abbcc +aabc +aabcc +aabbc +aabbcc +aaabbbccc +ac +aac +aacc +aaaccc +bc +bcc +bbc +bbcc +bbbccc +cc +ccc +cccccc diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff new file mode 100644 index 00000000000..e4b86a53b4e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.aff @@ -0,0 +1,3 @@ +COMPOUNDMIN 1 +COMPOUNDRULE 1 +COMPOUNDRULE A*B*C* diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic new file mode 100644 index 00000000000..7d07bbc89a8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.dic @@ -0,0 +1,5 @@ +3 +a/A +b/B +c/C + diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good new file mode 100644 index 00000000000..de743bb0679 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.good @@ -0,0 +1,37 @@ +aa +aaa +aaaa +ab +aab +aaab +aaaab +abb +aabb +aaabbb +bb +bbb +bbbb +aaab +abc +abcc +abbc +abbcc +aabc +aabcc +aabbc +aabbcc +aaabbbccc +ac +acc +aac +aacc +aaaccc +bc +bcc +bbc +bbcc +bbbccc +cc +ccc +cccccc +abcc diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong new file mode 100644 index 00000000000..9e5d38d3502 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule2.wrong @@ -0,0 +1,8 @@ +ba +aaabaaa +bbaaa +aaaaba +bbbbbaa +cba +cab +acb diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff new file mode 100644 index 00000000000..005314586c6 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.aff @@ -0,0 +1,3 @@ +COMPOUNDMIN 1 +COMPOUNDRULE 1 +COMPOUNDRULE A?B?C? diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic new file mode 100644 index 00000000000..7d07bbc89a8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.dic @@ -0,0 +1,5 @@ +3 +a/A +b/B +c/C + diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good new file mode 100644 index 00000000000..7f518893e94 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.good @@ -0,0 +1,7 @@ +a +b +c +ab +abc +ac +bc diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong new file mode 100644 index 00000000000..6bd1d8004ab --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule3.wrong @@ -0,0 +1,41 @@ +aa +aaa +aaaa +aab +aaab +aaaab +abb +aabb +aaabbb +bb +bbb +bbbb +aaab +abcc +abbc +abbcc +aabc +aabcc +aabbc +aabbcc +aaabbbccc +acc +aac +aacc +aaaccc +bcc +bbc +bbcc +bbbccc +cc +ccc +cccccc +abcc +ba +aaabaaa +bbaaa +aaaaba +bbbbbaa +cba +cab +acb diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff new file mode 100644 index 00000000000..8a9996cb3e2 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff @@ -0,0 +1,7 @@ +# English ordinal numbers +WORDCHARS 0123456789 +COMPOUNDMIN 1 +ONLYINCOMPOUND c +COMPOUNDRULE 2 +COMPOUNDRULE n*1t +COMPOUNDRULE n*mp diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic new file mode 100644 index 00000000000..ced0735ec1e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic @@ -0,0 +1,24 @@ +22 +0/nm +1/n1 +2/nm +3/nm +4/nm +5/nm +6/nm +7/nm +8/nm +9/nm +0th/pt +1st/p +1th/tc +2nd/p +2th/tc +3rd/p +3th/tc +4th/pt +5th/pt +6th/pt +7th/pt +8th/pt +9th/pt diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good new file mode 100644 index 00000000000..86949437d38 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good @@ -0,0 +1,31 @@ +1st +2nd +3rd +4th +5th +6th +7th +8th +9th +10th +11th +12th +13th +14th +15th +16th +17th +18th +19th +20th +21st +22nd +23rd +24th +25th +100th +1000th +10001st +10011th +1ST +42ND \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong new file mode 100644 index 00000000000..99f28e7cc3e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.wrong @@ -0,0 +1,5 @@ +1th +2th +3th +10001th +10011st diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff new file mode 100644 index 00000000000..46502460bc8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.aff @@ -0,0 +1,7 @@ +# number + percent +SET UTF-8 +COMPOUNDMIN 1 +COMPOUNDRULE 2 +COMPOUNDRULE N*%? +COMPOUNDRULE NN*.NN*%? +WORDCHARS 0123456789‰. diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic new file mode 100644 index 00000000000..eeeffdac503 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.dic @@ -0,0 +1,14 @@ +13 +0/N po:num +1/N po:num +2/N po:num +3/N po:num +4/N po:num +5/N po:num +6/N po:num +7/N po:num +8/N po:num +9/N po:num +./. po:sign_dot +%/% po:sign_percent +‰/% po:sign_per_mille diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good new file mode 100644 index 00000000000..691fca1fb9f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.good @@ -0,0 +1,7 @@ +10% +0.2% +0.20% +123.4561‰ +10 +0000 +10.25 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong new file mode 100644 index 00000000000..ba1fe3290f2 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule5.wrong @@ -0,0 +1 @@ +.25 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff new file mode 100644 index 00000000000..e8a088d5a7a --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.aff @@ -0,0 +1,4 @@ +COMPOUNDMIN 1 +COMPOUNDRULE 2 +COMPOUNDRULE A*A +COMPOUNDRULE A*AAB*BBBC*C diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic new file mode 100644 index 00000000000..7d07bbc89a8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.dic @@ -0,0 +1,5 @@ +3 +a/A +b/B +c/C + diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good new file mode 100644 index 00000000000..55a8f8bc5fa --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.good @@ -0,0 +1,4 @@ +aa +aaaaaa +aabbbc +aaaaabbbbbbcccccc diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong new file mode 100644 index 00000000000..48b376dac50 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule6.wrong @@ -0,0 +1,4 @@ +abc +abbbbbccccccc +aabbccccccc +aabbbbbbb diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff new file mode 100644 index 00000000000..3ae1fc78473 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.aff @@ -0,0 +1,8 @@ +# English ordinal numbers (parenthesized long flags) +FLAG long +WORDCHARS 0123456789 +COMPOUNDMIN 1 +ONLYINCOMPOUND cc +COMPOUNDRULE 2 +COMPOUNDRULE (nn)*(11)(tt) +COMPOUNDRULE (nn)*(mm)(pp) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic new file mode 100644 index 00000000000..ad4bb4d284a --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.dic @@ -0,0 +1,24 @@ +22 +0/nnmm +1/nn11 +2/nnmm +3/nnmm +4/nnmm +5/nnmm +6/nnmm +7/nnmm +8/nnmm +9/nnmm +0th/pptt +1st/pp +1th/ttcc +2nd/pp +2th/ttcc +3rd/pp +3th/ttcc +4th/pptt +5th/pptt +6th/pptt +7th/pptt +8th/pptt +9th/pptt diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good new file mode 100644 index 00000000000..fafe64a5ca5 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.good @@ -0,0 +1,29 @@ +1st +2nd +3rd +4th +5th +6th +7th +8th +9th +10th +11th +12th +13th +14th +15th +16th +17th +18th +19th +20th +21st +22nd +23rd +24th +25th +100th +1000th +10001st +10011th diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong new file mode 100644 index 00000000000..99f28e7cc3e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule7.wrong @@ -0,0 +1,5 @@ +1th +2th +3th +10001th +10011st diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff new file mode 100644 index 00000000000..03a423d486a --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.aff @@ -0,0 +1,8 @@ +# English ordinal numbers (parenthesized numerical flags) +FLAG num +WORDCHARS 0123456789 +COMPOUNDMIN 1 +ONLYINCOMPOUND 1000 +COMPOUNDRULE 2 +COMPOUNDRULE (1001)*(1002)(2001) +COMPOUNDRULE (1001)*(2002)(2000) diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic new file mode 100644 index 00000000000..e156e95fe0e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.dic @@ -0,0 +1,24 @@ +22 +0/1001,2002 +1/1001,1002 +2/1001,2002 +3/1001,2002 +4/1001,2002 +5/1001,2002 +6/1001,2002 +7/1001,2002 +8/1001,2002 +9/1001,2002 +0th/2000,2001 +1st/2000 +1th/2001,1000 +2nd/2000 +2th/2001,1000 +3rd/2000 +3th/2001,1000 +4th/2000,2001 +5th/2000,2001 +6th/2000,2001 +7th/2000,2001 +8th/2000,2001 +9th/2000,2001 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good new file mode 100644 index 00000000000..fafe64a5ca5 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.good @@ -0,0 +1,29 @@ +1st +2nd +3rd +4th +5th +6th +7th +8th +9th +10th +11th +12th +13th +14th +15th +16th +17th +18th +19th +20th +21st +22nd +23rd +24th +25th +100th +1000th +10001st +10011th diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong new file mode 100644 index 00000000000..99f28e7cc3e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule8.wrong @@ -0,0 +1,5 @@ +1th +2th +3th +10001th +10011st