mirror of https://github.com/apache/lucene.git
LUCENE-9717: Hunspell: support CHECKCOMPOUNDPATTERN (#2280)
This commit is contained in:
parent
6509a3003c
commit
d0ae2bd2b9
|
@ -0,0 +1,141 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
|
||||||
|
class CheckCompoundPattern {
|
||||||
|
private final char[] endChars;
|
||||||
|
private final char[] beginChars;
|
||||||
|
private final char[] replacement;
|
||||||
|
private final char[] endFlags;
|
||||||
|
private final char[] beginFlags;
|
||||||
|
private final Dictionary dictionary;
|
||||||
|
private final BytesRef scratch = new BytesRef();
|
||||||
|
|
||||||
|
CheckCompoundPattern(
|
||||||
|
String unparsed, Dictionary.FlagParsingStrategy strategy, Dictionary dictionary) {
|
||||||
|
this.dictionary = dictionary;
|
||||||
|
String[] parts = unparsed.split("\\s+");
|
||||||
|
if (parts.length < 3) {
|
||||||
|
throw new IllegalArgumentException("Invalid pattern: " + unparsed);
|
||||||
|
}
|
||||||
|
|
||||||
|
int flagSep = parts[1].indexOf("/");
|
||||||
|
endChars = (flagSep < 0 ? parts[1] : parts[1].substring(0, flagSep)).toCharArray();
|
||||||
|
endFlags = flagSep < 0 ? new char[0] : strategy.parseFlags(parts[1].substring(flagSep + 1));
|
||||||
|
|
||||||
|
flagSep = parts[2].indexOf("/");
|
||||||
|
beginChars = (flagSep < 0 ? parts[2] : parts[2].substring(0, flagSep)).toCharArray();
|
||||||
|
beginFlags = flagSep < 0 ? new char[0] : strategy.parseFlags(parts[2].substring(flagSep + 1));
|
||||||
|
|
||||||
|
replacement = parts.length == 3 ? null : parts[3].toCharArray();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return new String(endChars)
|
||||||
|
+ " "
|
||||||
|
+ new String(beginChars)
|
||||||
|
+ (replacement == null ? "" : " -> " + new String(replacement));
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean prohibitsCompounding(
|
||||||
|
CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) {
|
||||||
|
if (isNonAffixedPattern(endChars)) {
|
||||||
|
if (stemsBefore.stream()
|
||||||
|
.noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isNonAffixedPattern(beginChars)) {
|
||||||
|
if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else if (!charsMatch(word, breakPos, beginChars)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
//noinspection RedundantIfStatement
|
||||||
|
if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isNonAffixedPattern(char[] pattern) {
|
||||||
|
return pattern.length == 1 && pattern[0] == '0';
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) {
|
||||||
|
for (CharsRef stem : stems) {
|
||||||
|
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
|
||||||
|
if (forms != null && hasAllFlags(flags, forms)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean hasAllFlags(char[] flags, IntsRef forms) {
|
||||||
|
for (char flag : flags) {
|
||||||
|
if (!dictionary.hasFlag(forms, flag, scratch)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
CharsRef expandReplacement(CharsRef word, int breakPos) {
|
||||||
|
if (replacement != null && charsMatch(word, breakPos, replacement)) {
|
||||||
|
return new CharsRef(
|
||||||
|
word.subSequence(0, breakPos)
|
||||||
|
+ new String(endChars)
|
||||||
|
+ new String(beginChars)
|
||||||
|
+ word.subSequence(breakPos + replacement.length, word.length));
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
int endLength() {
|
||||||
|
return endChars.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean charsMatch(CharsRef word, int offset, char[] pattern) {
|
||||||
|
int len = pattern.length;
|
||||||
|
if (word.length - offset < len || offset < 0 || offset > word.length) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
if (word.chars[word.offset + offset + i] != pattern[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -146,6 +146,7 @@ public class Dictionary {
|
||||||
boolean checkCompoundTriple, simplifiedTriple;
|
boolean checkCompoundTriple, simplifiedTriple;
|
||||||
int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
|
int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
|
||||||
List<CompoundRule> compoundRules; // nullable
|
List<CompoundRule> compoundRules; // nullable
|
||||||
|
List<CheckCompoundPattern> checkCompoundPatterns = new ArrayList<>();
|
||||||
|
|
||||||
// ignored characters (dictionary, affix, inputs)
|
// ignored characters (dictionary, affix, inputs)
|
||||||
private char[] ignore;
|
private char[] ignore;
|
||||||
|
@ -412,6 +413,12 @@ public class Dictionary {
|
||||||
checkCompoundTriple = true;
|
checkCompoundTriple = true;
|
||||||
} else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
|
} else if ("SIMPLIFIEDTRIPLE".equals(firstWord)) {
|
||||||
simplifiedTriple = true;
|
simplifiedTriple = true;
|
||||||
|
} else if ("CHECKCOMPOUNDPATTERN".equals(firstWord)) {
|
||||||
|
int count = Integer.parseInt(singleArgument(reader, line));
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
checkCompoundPatterns.add(
|
||||||
|
new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -26,6 +26,7 @@ import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.function.Predicate;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
@ -149,47 +150,94 @@ public class SpellChecker {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
|
if (dictionary.compoundBegin != FLAG_UNSET || dictionary.compoundFlag != FLAG_UNSET) {
|
||||||
return checkCompounds(wordChars, 0, length, originalCase, 0);
|
return checkCompounds(new CharsRef(wordChars, 0, length), originalCase, 0, __ -> true);
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkCompounds(
|
private boolean checkCompounds(
|
||||||
char[] chars, int offset, int length, WordCase originalCase, int depth) {
|
CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) {
|
||||||
if (depth > dictionary.compoundMax - 2) return false;
|
if (depth > dictionary.compoundMax - 2) return false;
|
||||||
|
|
||||||
int limit = length - dictionary.compoundMin + 1;
|
int limit = word.length - dictionary.compoundMin + 1;
|
||||||
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
|
for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
|
||||||
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
||||||
int breakOffset = offset + breakPos;
|
int breakOffset = word.offset + breakPos;
|
||||||
if (mayBreakIntoCompounds(chars, offset, length, breakOffset)) {
|
if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
|
||||||
List<CharsRef> stems = stemmer.doStem(chars, offset, breakPos, originalCase, context);
|
List<CharsRef> stems =
|
||||||
|
stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context);
|
||||||
if (stems.isEmpty()
|
if (stems.isEmpty()
|
||||||
&& dictionary.simplifiedTriple
|
&& dictionary.simplifiedTriple
|
||||||
&& chars[breakOffset - 1] == chars[breakOffset]) {
|
&& word.chars[breakOffset - 1] == word.chars[breakOffset]) {
|
||||||
stems = stemmer.doStem(chars, offset, breakPos + 1, originalCase, context);
|
stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context);
|
||||||
}
|
}
|
||||||
if (stems.isEmpty()) continue;
|
if (!stems.isEmpty() && checkPatterns.test(stems)) {
|
||||||
|
Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems);
|
||||||
|
if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int remainingLength = length - breakPos;
|
if (checkCompoundPatternReplacements(word, breakPos, originalCase, depth)) {
|
||||||
List<CharsRef> lastStems =
|
return true;
|
||||||
stemmer.doStem(chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
|
||||||
if (!lastStems.isEmpty()
|
|
||||||
&& !(dictionary.checkCompoundDup && intersectIgnoreCase(stems, lastStems))
|
|
||||||
&& !hasForceUCaseProblem(chars, breakOffset, remainingLength, originalCase)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean checkCompoundPatternReplacements(
|
||||||
|
CharsRef word, int pos, WordCase originalCase, int depth) {
|
||||||
|
for (CheckCompoundPattern pattern : dictionary.checkCompoundPatterns) {
|
||||||
|
CharsRef expanded = pattern.expandReplacement(word, pos);
|
||||||
|
if (expanded != null) {
|
||||||
|
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
||||||
|
int breakPos = pos + pattern.endLength();
|
||||||
|
List<CharsRef> stems =
|
||||||
|
stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
|
||||||
|
if (!stems.isEmpty()) {
|
||||||
|
Predicate<List<CharsRef>> nextCheck =
|
||||||
|
next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next);
|
||||||
|
if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Predicate<List<CharsRef>> checkNextPatterns(
|
||||||
|
CharsRef word, int breakPos, List<CharsRef> stems) {
|
||||||
|
return nextStems ->
|
||||||
|
dictionary.checkCompoundPatterns.stream()
|
||||||
|
.noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems));
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean checkCompoundsAfter(
|
||||||
|
CharsRef word,
|
||||||
|
int breakPos,
|
||||||
|
WordCase originalCase,
|
||||||
|
int depth,
|
||||||
|
List<CharsRef> prevStems,
|
||||||
|
Predicate<List<CharsRef>> checkPatterns) {
|
||||||
|
int remainingLength = word.length - breakPos;
|
||||||
|
int breakOffset = word.offset + breakPos;
|
||||||
|
List<CharsRef> tailStems =
|
||||||
|
stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
||||||
|
if (!tailStems.isEmpty()
|
||||||
|
&& !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems))
|
||||||
|
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
|
||||||
|
&& checkPatterns.test(tailStems)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
CharsRef tail = new CharsRef(word.chars, breakOffset, remainingLength);
|
||||||
|
return checkCompounds(tail, originalCase, depth + 1, checkPatterns);
|
||||||
|
}
|
||||||
|
|
||||||
private boolean hasForceUCaseProblem(
|
private boolean hasForceUCaseProblem(
|
||||||
char[] chars, int offset, int length, WordCase originalCase) {
|
char[] chars, int offset, int length, WordCase originalCase) {
|
||||||
if (dictionary.forceUCase == FLAG_UNSET) return false;
|
if (dictionary.forceUCase == FLAG_UNSET) return false;
|
||||||
|
|
|
@ -64,6 +64,18 @@ public class SpellCheckerTest extends StemmerTestBase {
|
||||||
doTest("i53643");
|
doTest("i53643");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Runs the shared {@code doTest} helper for the "checkcompoundpattern" fixture (plain
 * CHECKCOMPOUNDPATTERN rules without replacements or flags). Presumably the helper loads the
 * same-named affix, dictionary, good and wrong word lists; confirm against {@code doTest}.
 */
public void testCheckCompoundPattern() throws Exception {
|
||||||
|
doTest("checkcompoundpattern");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Runs the shared {@code doTest} helper for the "checkcompoundpattern2" fixture, whose rules
 * carry a replacement part. Presumably the helper loads the same-named fixture files; confirm
 * against {@code doTest}.
 */
public void testCheckCompoundPattern2() throws Exception {
|
||||||
|
doTest("checkcompoundpattern2");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Runs the shared {@code doTest} helper for the "checkcompoundpattern3" fixture, whose rule
 * combines flag constraints with a replacement part. Presumably the helper loads the same-named
 * fixture files; confirm against {@code doTest}.
 */
public void testCheckCompoundPattern3() throws Exception {
|
||||||
|
doTest("checkcompoundpattern3");
|
||||||
|
}
|
||||||
|
|
||||||
public void testDotless_i() throws Exception {
|
public void testDotless_i() throws Exception {
|
||||||
doTest("dotless_i");
|
doTest("dotless_i");
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
# forbid compounds with spec. pattern at word bounds
|
||||||
|
COMPOUNDFLAG A
|
||||||
|
CHECKCOMPOUNDPATTERN 2
|
||||||
|
CHECKCOMPOUNDPATTERN nny ny
|
||||||
|
CHECKCOMPOUNDPATTERN ssz sz
|
|
@ -0,0 +1,5 @@
|
||||||
|
4
|
||||||
|
könny/A
|
||||||
|
nyelés/A
|
||||||
|
hossz/A
|
||||||
|
számítás/A
|
|
@ -0,0 +1,2 @@
|
||||||
|
könnyszámítás
|
||||||
|
hossznyelés
|
|
@ -0,0 +1,4 @@
|
||||||
|
könnynyelés
|
||||||
|
hosszszámítás
|
||||||
|
hosszkönnynyelés
|
||||||
|
könnynyeléshossz
|
|
@ -0,0 +1,7 @@
|
||||||
|
# forbid compounds with spec. pattern at word bound and allow modified form
|
||||||
|
# (for German and Indian languages)
|
||||||
|
COMPOUNDFLAG A
|
||||||
|
CHECKCOMPOUNDPATTERN 2
|
||||||
|
CHECKCOMPOUNDPATTERN o b z
|
||||||
|
CHECKCOMPOUNDPATTERN oo ba u
|
||||||
|
COMPOUNDMIN 1
|
|
@ -0,0 +1,3 @@
|
||||||
|
2
|
||||||
|
foo/A
|
||||||
|
bar/A
|
|
@ -0,0 +1,3 @@
|
||||||
|
barfoo
|
||||||
|
fozar
|
||||||
|
fur
|
|
@ -0,0 +1 @@
|
||||||
|
foobar
|
|
@ -0,0 +1,6 @@
|
||||||
|
# forbid compounds with spec. pattern at word bound and allow modified form
|
||||||
|
# (for Indian languages)
|
||||||
|
COMPOUNDFLAG A
|
||||||
|
CHECKCOMPOUNDPATTERN 1
|
||||||
|
CHECKCOMPOUNDPATTERN o/X b/Y z
|
||||||
|
COMPOUNDMIN 1
|
|
@ -0,0 +1,5 @@
|
||||||
|
4
|
||||||
|
foo/A
|
||||||
|
boo/AX
|
||||||
|
bar/A
|
||||||
|
ban/AY
|
|
@ -0,0 +1,9 @@
|
||||||
|
bozan
|
||||||
|
barfoo
|
||||||
|
banfoo
|
||||||
|
banbar
|
||||||
|
foobar
|
||||||
|
fooban
|
||||||
|
foobanbar
|
||||||
|
boobar
|
||||||
|
boobarfoo
|
|
@ -0,0 +1,8 @@
|
||||||
|
booban
|
||||||
|
boobanfoo
|
||||||
|
fozar
|
||||||
|
fozarfoo
|
||||||
|
fozan
|
||||||
|
fozanfoo
|
||||||
|
bozar
|
||||||
|
bozarfoo
|
Loading…
Reference in New Issue