LUCENE-9701: Hunspell: implement simple REP-based suggestion algorithm (#2251)

Peter Gromov 2021-02-01 10:23:54 +01:00 committed by GitHub
parent 9d45dfe776
commit 8a34cc7afd
11 changed files with 276 additions and 3 deletions


@@ -56,6 +56,7 @@ configure(project(":lucene:analysis:common")) {
"**/*.dic",
"**/*.wrong",
"**/*.good",
"**/*.sug",
"**/charfilter/*.htm*",
"**/*LuceneResourcesWikiPage.html"
]


@@ -152,6 +152,9 @@ public class Dictionary {
// ignored characters (dictionary, affix, inputs)
private char[] ignore;
String tryChars = "";
List<RepEntry> repTable = new ArrayList<>();
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
FST<CharsRef> iconv;
FST<CharsRef> oconv;
@@ -383,6 +386,14 @@ public class Dictionary {
alternateCasing = langCode.equals("tr") || langCode.equals("az");
} else if ("BREAK".equals(firstWord)) {
breaks = parseBreaks(reader, line);
} else if ("TRY".equals(firstWord)) {
tryChars = singleArgument(reader, line);
} else if ("REP".equals(firstWord)) {
int count = Integer.parseInt(singleArgument(reader, line));
for (int i = 0; i < count; i++) {
String[] parts = splitBySpace(reader, reader.readLine(), 3);
repTable.add(new RepEntry(parts[1], parts[2]));
}
} else if ("FORBIDDENWORD".equals(firstWord)) {
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDMIN".equals(firstWord)) {


@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.Arrays;
import java.util.LinkedHashSet;
class ModifyingSuggester {
private final LinkedHashSet<String> result = new LinkedHashSet<>();
private final char[] tryChars;
private final SpellChecker speller;
ModifyingSuggester(SpellChecker speller) {
this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray();
}
LinkedHashSet<String> suggest(String word) {
tryRep(word);
tryAddingChar(word);
return result;
}
private void tryRep(String word) {
for (RepEntry entry : speller.dictionary.repTable) {
for (String candidate : entry.substitute(word)) {
if (trySuggestion(candidate)) {
continue;
}
if (candidate.contains(" ")
&& Arrays.stream(candidate.split(" ")).allMatch(speller::checkWord)) {
result.add(candidate);
}
}
}
}
private void tryAddingChar(String word) {
for (int i = 0; i <= word.length(); i++) {
String prefix = word.substring(0, i);
String suffix = word.substring(i);
for (char toInsert : tryChars) {
trySuggestion(prefix + toInsert + suffix);
}
}
}
private boolean trySuggestion(String candidate) {
if (speller.checkWord(candidate)) {
result.add(candidate);
return true;
}
return false;
}
}
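
A minimal standalone sketch of the candidate enumeration in tryAddingChar, using a hypothetical misspelling and TRY set (in the real class only candidates that pass checkWord survive):

// With TRY "e" and the misspelling "hllo", one insertion is tried per position:
String word = "hllo";
for (int i = 0; i <= word.length(); i++) {
  for (char toInsert : "e".toCharArray()) {
    System.out.println(word.substring(0, i) + toInsert + word.substring(i));
  }
}
// prints ehllo, hello, hlelo, hlleo, hlloe; only "hello" would be kept if the dictionary contains it

tryRep is analogous: each substitution produced by a RepEntry is either accepted directly, or, when it contains a space (e.g. "a_lot" becoming "a lot"), accepted if every space-separated part spells correctly.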


@@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
class RepEntry {
private final String pattern;
private final String replacement;
private final boolean mustStart;
private final boolean mustEnd;
private final int patternLen;
RepEntry(String rawPattern, String rawReplacement) {
mustStart = rawPattern.startsWith("^");
mustEnd = rawPattern.endsWith("$");
pattern = rawPattern.substring(mustStart ? 1 : 0, rawPattern.length() - (mustEnd ? 1 : 0));
replacement = rawReplacement.replace('_', ' ');
patternLen = pattern.length();
}
List<String> substitute(String word) {
if (mustStart) {
boolean matches = mustEnd ? word.equals(pattern) : word.startsWith(pattern);
return matches
? Collections.singletonList(replacement + word.substring(patternLen))
: Collections.emptyList();
}
if (mustEnd) {
return word.endsWith(pattern)
? Collections.singletonList(word.substring(0, word.length() - patternLen) + replacement)
: Collections.emptyList();
}
int pos = word.indexOf(pattern);
if (pos < 0) return Collections.emptyList();
List<String> result = new ArrayList<>();
while (pos >= 0) {
result.add(word.substring(0, pos) + replacement + word.substring(pos + patternLen));
pos = word.indexOf(pattern, pos + 1);
}
return result;
}
@Override
public String toString() {
return (mustStart ? "^" : "") + pattern + (mustEnd ? "$" : "") + "->" + replacement;
}
}
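
A few illustrative calls, traced from substitute() above (the first two mirror REP entries from the test files added below; "phantomgraph" is just a made-up input showing that every occurrence is replaced, one at a time):

new RepEntry("shun$", "tion").substitute("vacashun");  // [vacation]   (pattern anchored at the end)
new RepEntry("^alot$", "a_lot").substitute("alot");    // [a lot]      ('_' in the replacement becomes a space)
new RepEntry("ph", "f").substitute("phantomgraph");    // [fantomgraph, phantomgraf]
new RepEntry("ph", "f").substitute("form");            // []           (no occurrence, no candidates)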


@@ -23,7 +23,9 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
@@ -34,9 +36,9 @@ import org.apache.lucene.util.IntsRef;
* threads). Not all Hunspell features are supported yet.
*/
public class SpellChecker {
- private final Dictionary dictionary;
+ final Dictionary dictionary;
+ final Stemmer stemmer;
private final BytesRef scratch = new BytesRef();
- private final Stemmer stemmer;
public SpellChecker(Dictionary dictionary) {
this.dictionary = dictionary;
@@ -128,6 +130,10 @@ public class SpellChecker {
return false;
}
boolean checkWord(String word) {
return checkWord(word.toCharArray(), word.length(), null);
}
private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
return false;
@@ -329,4 +335,44 @@ public class SpellChecker {
&& spell(word.substring(0, breakPos))
&& spell(word.substring(breakPos + breakStr.length()));
}
public List<String> suggest(String word) {
if (word.length() >= 100) return Collections.emptyList();
if (dictionary.needsInputCleaning) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
ModifyingSuggester modifier = new ModifyingSuggester(this);
Set<String> result = modifier.suggest(word);
if (word.contains("-") && result.stream().noneMatch(s -> s.contains("-"))) {
result.addAll(modifyChunksBetweenDashes(word));
}
return new ArrayList<>(result);
}
private List<String> modifyChunksBetweenDashes(String word) {
List<String> result = new ArrayList<>();
int chunkStart = 0;
while (chunkStart < word.length()) {
int chunkEnd = word.indexOf('-', chunkStart);
if (chunkEnd < 0) {
chunkEnd = word.length();
}
if (chunkEnd > chunkStart) {
String chunk = word.substring(chunkStart, chunkEnd);
if (!spell(chunk)) {
for (String chunkSug : suggest(chunk)) {
result.add(word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd));
}
}
}
chunkStart = chunkEnd + 1;
}
return result;
}
}
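
A minimal end-to-end usage sketch (assuming the existing Dictionary constructor that takes a temporary Directory plus affix and dictionary streams; imports and exception handling elided). The expected suggestions match the rep test data added below:

try (InputStream aff = Files.newInputStream(Path.of("rep.aff"));
     InputStream dic = Files.newInputStream(Path.of("rep.dic"))) {
  Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "suggest", aff, dic);
  SpellChecker speller = new SpellChecker(dictionary);
  speller.suggest("phorm");    // [form]       via "REP ph f"
  speller.suggest("vacashun"); // [vacation]   via "REP shun$ tion"
}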


@@ -20,7 +20,9 @@ import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.IOUtils;
import org.junit.Test;
@@ -46,6 +48,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("allcaps");
}
public void rep() throws Exception {
doTest("rep");
}
@Test
public void forceUCase() throws Exception {
doTest("forceucase");
@@ -178,10 +184,22 @@ public class SpellCheckerTest extends StemmerTestBase {
}
URL wrong = StemmerTestBase.class.getResource(name + ".wrong");
URL sug = StemmerTestBase.class.getResource(name + ".sug");
if (wrong != null) {
- for (String word : Files.readAllLines(Path.of(wrong.toURI()))) {
+ List<String> wrongWords = Files.readAllLines(Path.of(wrong.toURI()));
+ for (String word : wrongWords) {
assertFalse("Unexpectedly considered correct: " + word, speller.spell(word));
}
if (sug != null) {
String suggestions =
wrongWords.stream()
.map(s -> String.join(", ", speller.suggest(s)))
.filter(s -> !s.isEmpty())
.collect(Collectors.joining("\n"));
assertEquals(Files.readString(Path.of(sug.toURI())).trim(), suggestions);
}
} else {
assertNull(".sug file without .wrong file!", sug);
}
}
}


@@ -0,0 +1,5 @@
scott
scot-free
foo-bar
foo-foo-bar
foo-foo-foo


@@ -0,0 +1,21 @@
# With REP suggestions, we can fix typical language-specific misspellings.
# switch off ngram suggestion for testing
MAXNGRAMSUGS 0
REP 8
REP f ph
REP ph f
REP shun$ tion
REP ^alot$ a_lot # give the "a lot" suggestion the highest priority for "alot"
REP ^foo$ bar
REP ' _ # "un'alunno" -> "un alunno"
REP ^vinteún$ vinte_e_un
REP s 's
SFX A Y 1
SFX A 0 's .
WORDCHARS '


@@ -0,0 +1,15 @@
10
form
phantom
vacation
vacations
a
lot
un
alunno
bar
barbars
vinte
e
un
auto/A


@@ -0,0 +1,8 @@
form
phantom
vacation
a lot, lot
un alunno
bar
vinte e un
auto's, auto


@@ -0,0 +1,11 @@
phorm
fantom
vacashun
vacashuns
alot
un'alunno
foo
foobars
barfoos
vinteún
autos