LUCENE-9667: Hunspell: add spellchecker API, support BREAK and FORBIDDENWORD affix rules (#2207)
commit 939699f550 (parent a233ed2fd1)

@@ -54,6 +54,8 @@ configure(project(":lucene:analysis:common")) {
    srcExcludes += [
      "**/*.aff",
      "**/*.dic",
      "**/*.wrong",
      "**/*.good",
      "**/charfilter/*.htm*",
      "**/*LuceneResourcesWikiPage.html"
    ]

CHANGES.txt:
@@ -86,8 +86,8 @@ API Changes

Improvements

-* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
-  (Peter Gromov)
+* LUCENE-9665 LUCENE-9676 LUCENE-9667: Hunspell improvements: add SpellChecker API, support default encoding and
+  BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov)

* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
  (Dawid Weiss)

Dictionary.java:
@@ -34,13 +34,16 @@ import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@@ -87,6 +90,8 @@ public class Dictionary {
  private static final String OCONV_KEY = "OCONV";
  private static final String FULLSTRIP_KEY = "FULLSTRIP";
  private static final String LANG_KEY = "LANG";
  private static final String BREAK_KEY = "BREAK";
  private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
  private static final String KEEPCASE_KEY = "KEEPCASE";
  private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
  private static final String PSEUDOROOT_KEY = "PSEUDOROOT";

@@ -103,6 +108,7 @@ public class Dictionary {

  FST<IntsRef> prefixes;
  FST<IntsRef> suffixes;
  Breaks breaks = Breaks.DEFAULT;

  // all condition checks used by prefixes and suffixes. these are typically re-used across
  // many affix stripping rules. so these are deduplicated, to save RAM.

@@ -155,6 +161,7 @@ public class Dictionary {
  int circumfix = -1; // circumfix flag, or -1 if one is not defined
  int keepcase = -1; // keepcase flag, or -1 if one is not defined
  int needaffix = -1; // needaffix flag, or -1 if one is not defined
  int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined
  int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined

  // ignored characters (dictionary, affix, inputs)

@@ -256,6 +263,10 @@ public class Dictionary {
    }
  }

  int formStep() {
    return hasStemExceptions ? 2 : 1;
  }

  /** Looks up Hunspell word forms from the dictionary */
  IntsRef lookupWord(char[] word, int offset, int length) {
    return lookup(words, word, offset, length);
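
A note on the new formStep() helper (a reading of the code, not a statement from the commit): lookupWord returns all forms of a word as a single IntsRef, and when the dictionary has stem exceptions each form occupies formStep() == 2 ints rather than one, so callers advance by formStep() when scanning it. A minimal sketch, with "dictionary" and "word" assumed to be in scope:

    IntsRef forms = dictionary.lookupWord(word, 0, word.length);
    if (forms != null) {
      for (int i = 0; i < forms.length; i += dictionary.formStep()) {
        int ord = forms.ints[forms.offset + i]; // ordinal into the dictionary's flagLookup,
                                                // decoded e.g. in isForbiddenWord() below
      }
    }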

@@ -400,6 +411,14 @@ public class Dictionary {
      } else if (line.startsWith(LANG_KEY)) {
        language = line.substring(LANG_KEY.length()).trim();
        alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
      } else if (line.startsWith(BREAK_KEY)) {
        breaks = parseBreaks(reader, line);
      } else if (line.startsWith(FORBIDDENWORD_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
        }
        forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
      }
    }
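
For reference, the directives consumed by the new BREAK and FORBIDDENWORD branches above look like this in an .aff file; the fragment below mirrors the break.aff test fixture added later in this commit. The count on the first BREAK line tells parseBreaks how many BREAK entries follow, and FORBIDDENWORD names the flag that marks dictionary entries as forbidden:

    BREAK 2
    BREAK -
    BREAK –

    FORBIDDENWORD !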

@@ -423,6 +442,30 @@ public class Dictionary {
      stripOffsets[currentIndex] = currentOffset;
  }

  private Breaks parseBreaks(LineNumberReader reader, String line)
      throws IOException, ParseException {
    Set<String> starting = new LinkedHashSet<>();
    Set<String> ending = new LinkedHashSet<>();
    Set<String> middle = new LinkedHashSet<>();
    int num = Integer.parseInt(line.substring(BREAK_KEY.length()).trim());
    for (int i = 0; i < num; i++) {
      line = reader.readLine();
      String[] parts = line.split("\\s+");
      if (!line.startsWith(BREAK_KEY) || parts.length != 2) {
        throw new ParseException("BREAK chars expected", reader.getLineNumber());
      }
      String breakStr = parts[1];
      if (breakStr.startsWith("^")) {
        starting.add(breakStr.substring(1));
      } else if (breakStr.endsWith("$")) {
        ending.add(breakStr.substring(0, breakStr.length() - 1));
      } else {
        middle.add(breakStr);
      }
    }
    return new Breaks(starting, ending, middle);
  }

  private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
    FSTCompiler<IntsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);

@@ -1143,6 +1186,22 @@ public class Dictionary {
    return null;
  }

  boolean isForbiddenWord(char[] word, BytesRef scratch) {
    if (forbiddenword != -1) {
      IntsRef forms = lookupWord(word, 0, word.length);
      if (forms != null) {
        int formStep = formStep();
        for (int i = 0; i < forms.length; i += formStep) {
          flagLookup.get(forms.ints[forms.offset + i], scratch);
          if (hasFlag(Dictionary.decodeFlags(scratch), (char) forbiddenword)) {
            return true;
          }
        }
      }
    }
    return false;
  }

  /** Abstraction of the process of parsing flags taken from the affix and dic files */
  abstract static class FlagParsingStrategy {
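
To illustrate the effect of isForbiddenWord (a minimal sketch, not part of the commit), assume "dictionary" was loaded from the break.aff/break.dic fixtures below, where the .aff file declares "FORBIDDENWORD !" and the .dic file lists "foo-baz/!":

    SpellChecker speller = new SpellChecker(dictionary);
    boolean forbidden = speller.spell("foo-baz"); // false: listed in break.dic, but flagged "!"
    boolean broken = speller.spell("foo-bar");    // true: not an entry, accepted via BREAK "-"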

@@ -1371,4 +1430,21 @@ public class Dictionary {

    return DEFAULT_TEMP_DIR;
  }

  /** Possible word breaks according to BREAK directives */
  static class Breaks {
    private static final Set<String> MINUS = Collections.singleton("-");
    static final Breaks DEFAULT = new Breaks(MINUS, MINUS, MINUS);
    final String[] starting, ending, middle;

    Breaks(Collection<String> starting, Collection<String> ending, Collection<String> middle) {
      this.starting = starting.toArray(new String[0]);
      this.ending = ending.toArray(new String[0]);
      this.middle = middle.toArray(new String[0]);
    }

    boolean isNotEmpty() {
      return middle.length > 0 || starting.length > 0 || ending.length > 0;
    }
  }
}
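
One consequence of Breaks worth spelling out (an inference from the code, not stated in the commit): Breaks.DEFAULT allows "-" as a break everywhere, while an explicit "BREAK 0" directive makes parseBreaks return empty sets, so isNotEmpty() is false and the spell checker never attempts word breaking. The breakoff.* fixtures below exercise exactly that case: "foo-bar" is expected to be misspelled even though "foo" and "bar" are both dictionary words. A minimal sketch:

    // the empty Breaks produced by "BREAK 0"
    Dictionary.Breaks none =
        new Dictionary.Breaks(
            Collections.emptySet(), Collections.emptySet(), Collections.emptySet());
    boolean willTryBreaking = none.isNotEmpty(); // false: SpellChecker.spell() skips tryBreaks()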

SpellChecker.java (new file):
@@ -0,0 +1,104 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import org.apache.lucene.util.BytesRef;

/**
 * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe
 * (but a single underlying Dictionary can be shared by multiple spell-checkers in different
 * threads). Not all Hunspell features are supported yet.
 */
public class SpellChecker {
  private final Dictionary dictionary;
  private final BytesRef scratch = new BytesRef();
  private final Stemmer stemmer;

  public SpellChecker(Dictionary dictionary) {
    this.dictionary = dictionary;
    stemmer = new Stemmer(dictionary);
  }

  /** @return whether the given word's spelling is considered correct according to Hunspell rules */
  public boolean spell(String word) {
    char[] wordChars = word.toCharArray();
    if (dictionary.isForbiddenWord(wordChars, scratch)) {
      return false;
    }

    if (!stemmer.stem(wordChars, word.length()).isEmpty()) {
      return true;
    }

    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
      return tryBreaks(word);
    }

    return false;
  }

  private boolean tryBreaks(String word) {
    for (String br : dictionary.breaks.starting) {
      if (word.length() > br.length() && word.startsWith(br)) {
        if (spell(word.substring(br.length()))) {
          return true;
        }
      }
    }

    for (String br : dictionary.breaks.ending) {
      if (word.length() > br.length() && word.endsWith(br)) {
        if (spell(word.substring(0, word.length() - br.length()))) {
          return true;
        }
      }
    }

    for (String br : dictionary.breaks.middle) {
      int pos = word.indexOf(br);
      if (canBeBrokenAt(word, br, pos)) {
        return true;
      }

      // try to break at the second occurrence
      // to recognize dictionary words with a word break
      if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) {
        return true;
      }
    }
    return false;
  }

  private boolean hasTooManyBreakOccurrences(String word) {
    int occurrences = 0;
    for (String br : dictionary.breaks.middle) {
      int pos = 0;
      while ((pos = word.indexOf(br, pos)) >= 0) {
        if (++occurrences >= 10) return true;
        pos += br.length();
      }
    }
    return false;
  }

  private boolean canBeBrokenAt(String word, String breakStr, int breakPos) {
    return breakPos > 0
        && breakPos < word.length() - breakStr.length()
        && spell(word.substring(0, breakPos))
        && spell(word.substring(breakPos + breakStr.length()));
  }
}
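
A short usage sketch of the new SpellChecker API, modeled on SpellCheckerTest below. The SpellCheckerUsage class is hypothetical and placed in the hunspell package only to keep the example self-contained; the resource names and the ByteBuffersDirectory temp directory mirror the test and are illustrative rather than required:

    package org.apache.lucene.analysis.hunspell;

    import java.io.InputStream;
    import org.apache.lucene.store.ByteBuffersDirectory;

    public class SpellCheckerUsage {
      public static void main(String[] args) throws Exception {
        // load an .aff/.dic pair from classpath resources (names mirror the test fixtures)
        try (InputStream affix = SpellCheckerUsage.class.getResourceAsStream("break.aff");
            InputStream dic = SpellCheckerUsage.class.getResourceAsStream("break.dic")) {
          Dictionary dictionary =
              new Dictionary(new ByteBuffersDirectory(), "dictionary", affix, dic);
          SpellChecker speller = new SpellChecker(dictionary);
          System.out.println(speller.spell("foo-bar")); // true: breaks into "foo" + "bar"
          System.out.println(speller.spell("foo-baz")); // false: forbidden via the "!" flag
        }
      }
    }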

Stemmer.java:
@@ -64,7 +64,7 @@ final class Stemmer {
        suffixReaders[level] = dictionary.suffixes.getBytesReader();
      }
    }
-    formStep = dictionary.hasStemExceptions ? 2 : 1;
+    formStep = dictionary.formStep();
  }

  /**

SpellCheckerTest.java (new file):
@@ -0,0 +1,71 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.IOUtils;

public class SpellCheckerTest extends StemmerTestBase {

  public void testBreak() throws Exception {
    doTest("break");
  }

  public void testBreakDefault() throws Exception {
    doTest("breakdefault");
  }

  public void testBreakOff() throws Exception {
    doTest("breakoff");
  }

  protected void doTest(String name) throws Exception {
    InputStream affixStream =
        Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
    InputStream dictStream =
        Objects.requireNonNull(getClass().getResourceAsStream(name + ".dic"), name);

    SpellChecker speller;
    try {
      Dictionary dictionary =
          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
      speller = new SpellChecker(dictionary);
    } finally {
      IOUtils.closeWhileHandlingException(affixStream);
      IOUtils.closeWhileHandlingException(dictStream);
    }

    URL good = StemmerTestBase.class.getResource(name + ".good");
    if (good != null) {
      for (String word : Files.readAllLines(Path.of(good.toURI()))) {
        assertTrue("Unexpectedly considered misspelled: " + word, speller.spell(word));
      }
    }

    URL wrong = StemmerTestBase.class.getResource(name + ".wrong");
    if (wrong != null) {
      for (String word : Files.readAllLines(Path.of(wrong.toURI()))) {
        assertFalse("Unexpectedly considered correct: " + word, speller.spell(word));
      }
    }
  }
}

break.aff (new test resource):
@@ -0,0 +1,10 @@
# word break points test, recursive break at dash and n-dash
SET UTF-8

BREAK 2
BREAK -
BREAK –

WORDCHARS -–

FORBIDDENWORD !

break.dic (new test resource):
@@ -0,0 +1,7 @@
6
foo
bar
baz
fox-bax
foo-baz/!
e-mail

break.good (new test resource):
@@ -0,0 +1,12 @@
foo
bar
fox-bax
foo-bar
foo–bar
foo-bar-foo-bar
foo-bar–foo-bar
bar-baz
baz-foo
foo-bar-foo-bar-foo-bar-foo-bar-foo-bar
e-mail
e-mail-foo

break.wrong (new test resource):
@@ -0,0 +1,13 @@
fox
bax
-foo
bar-
fox-bar
foo-bax
foo–bax
fox–bar
foo-bar-fox-bar
foo-bax-foo-bar
foo-bar–fox-bar
foo-bax–foo-bar
foo-baz

breakdefault.aff (new test resource):
@@ -0,0 +1,6 @@
# default word break at hyphens and n-dashes

SET UTF-8
MAXNGRAMSUGS 0
WORDCHARS -
TRY ot

breakdefault.dic (new test resource):
@@ -0,0 +1,6 @@
3
foo
bar
free
scott
scot-free

breakdefault.good (new test resource):
@@ -0,0 +1,7 @@
foo
bar
foo-
-foo
scot-free
foo-bar
foo-bar-foo-bar

breakdefault.wrong (new test resource):
@@ -0,0 +1,6 @@
scot
sco-free
fo-bar
foo-fo-bar
foo-foo-fo
-

breakoff.aff (new test resource):
@@ -0,0 +1,7 @@
# switch off default word break at hyphens and n-dashes by BREAK 0
SET UTF-8
MAXNGRAMSUGS 0
WORDCHARS -
TRY ot

BREAK 0

breakoff.dic (new test resource):
@@ -0,0 +1,6 @@
3
foo
bar
free
scott
scot-free

breakoff.good (new test resource):
@@ -0,0 +1,3 @@
foo
bar
scot-free

breakoff.wrong (new test resource):
@@ -0,0 +1,5 @@
foo-
-foo
foo-bar
foo-bar-foo-bar
scot