From d0ae2bd2b9c99c9922f5bb011c21f65dc3cc5c45 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Wed, 3 Feb 2021 08:58:40 +0100 Subject: [PATCH] LUCENE-9717: Hunspell: support CHECKCOMPOUNDPATTERN (#2280) --- .../hunspell/CheckCompoundPattern.java | 141 ++++++++++++++++++ .../lucene/analysis/hunspell/Dictionary.java | 7 + .../analysis/hunspell/SpellChecker.java | 90 ++++++++--- .../analysis/hunspell/SpellCheckerTest.java | 12 ++ .../hunspell/checkcompoundpattern.aff | 5 + .../hunspell/checkcompoundpattern.dic | 5 + .../hunspell/checkcompoundpattern.good | 2 + .../hunspell/checkcompoundpattern.wrong | 4 + .../hunspell/checkcompoundpattern2.aff | 7 + .../hunspell/checkcompoundpattern2.dic | 3 + .../hunspell/checkcompoundpattern2.good | 3 + .../hunspell/checkcompoundpattern2.wrong | 1 + .../hunspell/checkcompoundpattern3.aff | 6 + .../hunspell/checkcompoundpattern3.dic | 5 + .../hunspell/checkcompoundpattern3.good | 9 ++ .../hunspell/checkcompoundpattern3.wrong | 8 + 16 files changed, 287 insertions(+), 21 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.good create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.wrong create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.good create mode 100644 
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.wrong
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.dic
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.good
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.wrong

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java
new file mode 100644
index 00000000000..44867bc5043
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.List;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
+
+/**
+ * A single {@code CHECKCOMPOUNDPATTERN} rule: an ending of the first compound part, a beginning
+ * of the next part, optional flags required on either side, and an optional replacement that may
+ * stand in for the boundary in the written word.
+ */
+class CheckCompoundPattern {
+  private final char[] endChars;
+  private final char[] beginChars;
+  private final char[] replacement; // null when the rule has no replacement part
+  private final char[] endFlags;
+  private final char[] beginFlags;
+  private final Dictionary dictionary;
+  private final BytesRef scratch = new BytesRef();
+
+  /**
+   * Parses one rule line of the form {@code CHECKCOMPOUNDPATTERN end[/flags] begin[/flags] [repl]}.
+   *
+   * @param unparsed the whole rule line, including the leading keyword
+   * @param strategy parser for the optional flags that follow {@code /}
+   * @param dictionary dictionary used later to look up stems and test their flags
+   * @throws IllegalArgumentException if the line has fewer than three whitespace-separated parts
+   */
+  CheckCompoundPattern(
+      String unparsed, Dictionary.FlagParsingStrategy strategy, Dictionary dictionary) {
+    this.dictionary = dictionary;
+    String[] parts = unparsed.split("\\s+");
+    if (parts.length < 3) {
+      throw new IllegalArgumentException("Invalid pattern: " + unparsed);
+    }
+
+    int flagSep = parts[1].indexOf("/");
+    endChars = (flagSep < 0 ? parts[1] : parts[1].substring(0, flagSep)).toCharArray();
+    endFlags = flagSep < 0 ? new char[0] : strategy.parseFlags(parts[1].substring(flagSep + 1));
+
+    flagSep = parts[2].indexOf("/");
+    beginChars = (flagSep < 0 ? parts[2] : parts[2].substring(0, flagSep)).toCharArray();
+    beginFlags = flagSep < 0 ? new char[0] : strategy.parseFlags(parts[2].substring(flagSep + 1));
+
+    replacement = parts.length == 3 ? null : parts[3].toCharArray();
+  }
+
+  @Override
+  public String toString() {
+    return new String(endChars)
+        + " "
+        + new String(beginChars)
+        + (replacement == null ? "" : " -> " + new String(replacement));
+  }
+
+  /**
+   * Tells whether this rule matches the boundary at {@code breakPos}: both character patterns
+   * (or, for a {@code 0} pattern, one of the given stems taken verbatim) and all required flags.
+   *
+   * @param word the word being split
+   * @param breakPos position in {@code word} where the next compound part starts
+   * @param stemsBefore stems found for the part ending at {@code breakPos}
+   * @param stemsAfter stems found for the part starting at {@code breakPos}
+   * @return true if every condition of this rule holds at the boundary
+   */
+  boolean prohibitsCompounding(
+      CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) {
+    if (isNonAffixedPattern(endChars)) {
+      if (stemsBefore.stream()
+          .noneMatch(stem -> stemMatches(word, breakPos - stem.length, stem))) {
+        return false;
+      }
+    } else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
+      return false;
+    }
+
+    if (isNonAffixedPattern(beginChars)) {
+      if (stemsAfter.stream().noneMatch(stem -> stemMatches(word, breakPos, stem))) {
+        return false;
+      }
+    } else if (!charsMatch(word, breakPos, beginChars)) {
+      return false;
+    }
+
+    if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) {
+      return false;
+    }
+    //noinspection RedundantIfStatement
+    if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /** A pattern consisting of the single character {@code 0} is compared against stems instead. */
+  private static boolean isNonAffixedPattern(char[] pattern) {
+    return pattern.length == 1 && pattern[0] == '0';
+  }
+
+  private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) {
+    for (CharsRef stem : stems) {
+      IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
+      if (forms != null && hasAllFlags(flags, forms)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private boolean hasAllFlags(char[] flags, IntsRef forms) {
+    for (char flag : flags) {
+      if (!dictionary.hasFlag(forms, flag, scratch)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * If the rule has a replacement and {@code word} contains it at {@code breakPos}, returns a copy
+   * of the word with the replacement swapped for the end and begin characters; otherwise null.
+   */
+  CharsRef expandReplacement(CharsRef word, int breakPos) {
+    if (replacement != null && charsMatch(word, breakPos, replacement)) {
+      return new CharsRef(
+          word.subSequence(0, breakPos)
+              + new String(endChars)
+              + new String(beginChars)
+              + word.subSequence(breakPos + replacement.length, word.length));
+    }
+    return null;
+  }
+
+  /** Returns the number of characters in the end pattern of this rule. */
+  int endLength() {
+    return endChars.length;
+  }
+
+  /** Compares the stem's own character range, not its whole backing array, with the word. */
+  private static boolean stemMatches(CharsRef word, int offset, CharsRef stem) {
+    return regionMatches(word, offset, stem.chars, stem.offset, stem.length);
+  }
+
+  private static boolean charsMatch(CharsRef word, int offset, char[] pattern) {
+    return regionMatches(word, offset, pattern, 0, pattern.length);
+  }
+
+  private static boolean regionMatches(
+      CharsRef word, int offset, char[] pattern, int patternOffset, int len) {
+    if (word.length - offset < len || offset < 0 || offset > word.length) {
+      return false;
+    }
+
+    for (int i = 0; i < len; i++) {
+      if (word.chars[word.offset + offset + i] != pattern[patternOffset + i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 10340b8acdb..47c57a3bd4c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -146,6 +146,7 @@ public class Dictionary {
   boolean checkCompoundTriple, simplifiedTriple;
   int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
   List<CompoundRule> compoundRules; // nullable
+  List<CheckCompoundPattern> checkCompoundPatterns = new ArrayList<>();
 
   // ignored characters (dictionary, affix, inputs)
   private char[] ignore;
@@ -412,6 +413,12 @@ public class Dictionary {
         checkCompoundTriple = true;
       } else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
         simplifiedTriple = true;
+      } else if ("CHECKCOMPOUNDPATTERN".equals(firstWord)) {
+        int count = Integer.parseInt(singleArgument(reader, line));
+        for (int i = 0; i < count; i++) {
+          checkCompoundPatterns.add(
+              new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this));
+        }
       }
     }
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
index aa0aeac53ec..747b209fa32 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -26,6 +26,7 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
+import java.util.function.Predicate;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@
-149,47 +150,103 @@ public class SpellChecker {
     }
 
     if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
-      return checkCompounds(wordChars, 0, length, originalCase, 0);
+      return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, 0, __ -> true);
     }
 
     return false;
   }
 
   private boolean checkCompounds(
-      char[] chars, int offset, int length, WordCase originalCase, int depth) {
+      CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) {
     if (depth > dictionary.compoundMax - 2) return false;
 
-    int limit = length - dictionary.compoundMin + 1;
+    int limit = word.length - dictionary.compoundMin + 1;
     for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
       WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
-      int breakOffset = offset + breakPos;
-      if (mayBreakIntoCompounds(chars, offset, length, breakOffset)) {
-        List<CharsRef> stems = stemmer.doStem(chars, offset, breakPos, originalCase, context);
+      int breakOffset = word.offset + breakPos;
+      if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
+        List<CharsRef> stems =
+            stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context);
         if (stems.isEmpty()
             && dictionary.simplifiedTriple
-            && chars[breakOffset - 1] == chars[breakOffset]) {
-          stems = stemmer.doStem(chars, offset, breakPos + 1, originalCase, context);
+            && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
+          stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context);
         }
-        if (stems.isEmpty()) continue;
+        if (!stems.isEmpty() && checkPatterns.test(stems)) {
+          Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems);
+          if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) {
+            return true;
+          }
+        }
+      }
 
-        int remainingLength = length - breakPos;
-        List<CharsRef> lastStems =
-            stemmer.doStem(chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
-        if (!lastStems.isEmpty()
-            && !(dictionary.checkCompoundDup && intersectIgnoreCase(stems, lastStems))
-            && !hasForceUCaseProblem(chars, breakOffset, remainingLength, originalCase)) {
-          return true;
-        }
-
-        if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
-          return true;
-        }
+      if (checkCompoundPatternReplacements(word, breakPos, originalCase, depth)) {
+        return true;
       }
     }
 
     return false;
   }
 
+  /**
+   * Tries every CHECKCOMPOUNDPATTERN rule whose replacement occurs at {@code pos}: the word is
+   * rewritten with the rule's end and begin characters and split right after the end characters.
+   */
+  private boolean checkCompoundPatternReplacements(
+      CharsRef word, int pos, WordCase originalCase, int depth) {
+    for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) {
+      CharsRef expanded = pattern.expandReplacement(word, pos);
+      if (expanded != null) {
+        WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
+        int breakPos = pos + pattern.endLength();
+        List<CharsRef> stems =
+            stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
+        if (!stems.isEmpty()) {
+          Predicate<List<CharsRef>> nextCheck =
+              next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next);
+          if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  /** Accepts the next stems only when no CHECKCOMPOUNDPATTERN rule matches at the break. */
+  private Predicate<List<CharsRef>> checkNextPatterns(
+      CharsRef word, int breakPos, List<CharsRef> stems) {
+    return nextStems ->
+        dictionary.checkCompoundPatterns.stream()
+            .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems));
+  }
+
+  /**
+   * Checks the rest of the word after {@code breakPos}: either it is a valid last compound part
+   * accepted by {@code checkPatterns}, or it can itself be split into further compound parts.
+   */
+  private boolean checkCompoundsAfter(
+      CharsRef word,
+      int breakPos,
+      WordCase originalCase,
+      int depth,
+      List<CharsRef> prevStems,
+      Predicate<List<CharsRef>> checkPatterns) {
+    int remainingLength = word.length - breakPos;
+    int breakOffset = word.offset + breakPos;
+    List<CharsRef> tailStems =
+        stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
+    if (!tailStems.isEmpty()
+        && !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems))
+        && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
+        && checkPatterns.test(tailStems)) {
+      return true;
+    }
+
+    CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength);
+    return checkCompounds(tail, originalCase, depth + 1, checkPatterns);
+  }
+
   private boolean hasForceUCaseProblem(
       char[] chars, int offset, int length, WordCase originalCase) {
     if (dictionary.forceUCase == FLAG_UNSET) return false;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
index cb568164a5d..49514ae6d8b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@@ -64,6 +64,18 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("i53643");
   }
 
+  public void testCheckCompoundPattern() throws Exception {
+    doTest("checkcompoundpattern");
+  }
+
+  public void testCheckCompoundPattern2() throws Exception {
+    doTest("checkcompoundpattern2");
+  }
+
+  public void testCheckCompoundPattern3() throws Exception {
+    doTest("checkcompoundpattern3");
+  }
+
   public void testDotless_i() throws Exception {
     doTest("dotless_i");
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.aff
new file mode 100644
index 00000000000..dfda51af271
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.aff
@@ -0,0 +1,5 @@
+# forbid compounds with spec.
pattern at word bounds +COMPOUNDFLAG A +CHECKCOMPOUNDPATTERN 2 +CHECKCOMPOUNDPATTERN nny ny +CHECKCOMPOUNDPATTERN ssz sz diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.dic new file mode 100644 index 00000000000..09300f0bcd3 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.dic @@ -0,0 +1,5 @@ +4 +könny/A +nyelés/A +hossz/A +számítás/A diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.good new file mode 100644 index 00000000000..f40229e8fdf --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.good @@ -0,0 +1,2 @@ +könnyszámítás +hossznyelés diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.wrong new file mode 100644 index 00000000000..378faf940fe --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern.wrong @@ -0,0 +1,4 @@ +könnynyelés +hosszszámítás +hosszkönnynyelés +könnynyeléshossz diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.aff new file mode 100644 index 00000000000..fdf6560b4f9 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.aff @@ -0,0 +1,7 @@ +# forbid compounds with spec. 
pattern at word bound and allow modificated form +# (for German and Indian languages) +COMPOUNDFLAG A +CHECKCOMPOUNDPATTERN 2 +CHECKCOMPOUNDPATTERN o b z +CHECKCOMPOUNDPATTERN oo ba u +COMPOUNDMIN 1 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.dic new file mode 100644 index 00000000000..8ac75f4fc5d --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.dic @@ -0,0 +1,3 @@ +2 +foo/A +bar/A diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.good new file mode 100644 index 00000000000..eaad4f902b9 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.good @@ -0,0 +1,3 @@ +barfoo +fozar +fur diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.wrong new file mode 100644 index 00000000000..323fae03f46 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern2.wrong @@ -0,0 +1 @@ +foobar diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.aff new file mode 100644 index 00000000000..4feb3db3829 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.aff @@ -0,0 +1,6 @@ +# forbid compounds with spec. 
pattern at word bound and allow modified form +# (for Indian languages) +COMPOUNDFLAG A +CHECKCOMPOUNDPATTERN 1 +CHECKCOMPOUNDPATTERN o/X b/Y z +COMPOUNDMIN 1 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.dic new file mode 100644 index 00000000000..6bd1b7fc9e5 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.dic @@ -0,0 +1,5 @@ +4 +foo/A +boo/AX +bar/A +ban/AY diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.good new file mode 100644 index 00000000000..6070eff5c52 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.good @@ -0,0 +1,9 @@ +bozan +barfoo +banfoo +banbar +foobar +fooban +foobanbar +boobar +boobarfoo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.wrong new file mode 100644 index 00000000000..41d8d374710 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checkcompoundpattern3.wrong @@ -0,0 +1,8 @@ +booban +boobanfoo +fozar +fozarfoo +fozan +fozanfoo +bozar +bozarfoo