mirror of https://github.com/apache/lucene.git
LUCENE-9701: Hunspell: implement simple REP-based suggestion algorithm (#2251)
parent 9d45dfe776
commit 8a34cc7afd
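
For context, a minimal usage sketch of the suggestion entry point this commit adds. Only SpellChecker.spell and the new SpellChecker.suggest come from the diff below; the Dictionary constructor signature and the en_US.aff/en_US.dic file names are assumptions made for illustration.

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import org.apache.lucene.analysis.hunspell.Dictionary;
    import org.apache.lucene.analysis.hunspell.SpellChecker;
    import org.apache.lucene.store.ByteBuffersDirectory;

    public class SuggestDemo {
      public static void main(String[] args) throws Exception {
        // Hypothetical .aff/.dic pair; the Dictionary(Directory, String, InputStream, InputStream)
        // constructor is assumed from the existing Lucene Hunspell API.
        try (InputStream aff = Files.newInputStream(Path.of("en_US.aff"));
            InputStream dic = Files.newInputStream(Path.of("en_US.dic"))) {
          Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "hunspell", aff, dic);
          SpellChecker speller = new SpellChecker(dictionary);
          String word = "vacashun";
          if (!speller.spell(word)) {
            // New in this commit: suggestions from the REP table plus TRY character insertions.
            System.out.println(speller.suggest(word));
          }
        }
      }
    }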
@@ -56,6 +56,7 @@ configure(project(":lucene:analysis:common")) {
       "**/*.dic",
       "**/*.wrong",
       "**/*.good",
+      "**/*.sug",
       "**/charfilter/*.htm*",
       "**/*LuceneResourcesWikiPage.html"
     ]
@@ -152,6 +152,9 @@ public class Dictionary {
   // ignored characters (dictionary, affix, inputs)
   private char[] ignore;
 
+  String tryChars = "";
+  List<RepEntry> repTable = new ArrayList<>();
+
   // FSTs used for ICONV/OCONV, output ord pointing to replacement text
   FST<CharsRef> iconv;
   FST<CharsRef> oconv;
@@ -383,6 +386,14 @@ public class Dictionary {
         alternateCasing = langCode.equals("tr") || langCode.equals("az");
       } else if ("BREAK".equals(firstWord)) {
         breaks = parseBreaks(reader, line);
+      } else if ("TRY".equals(firstWord)) {
+        tryChars = singleArgument(reader, line);
+      } else if ("REP".equals(firstWord)) {
+        int count = Integer.parseInt(singleArgument(reader, line));
+        for (int i = 0; i < count; i++) {
+          String[] parts = splitBySpace(reader, reader.readLine(), 3);
+          repTable.add(new RepEntry(parts[1], parts[2]));
+        }
       } else if ("FORBIDDENWORD".equals(firstWord)) {
         forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
       } else if ("COMPOUNDMIN".equals(firstWord)) {
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.Arrays;
+import java.util.LinkedHashSet;
+
+class ModifyingSuggester {
+  private final LinkedHashSet<String> result = new LinkedHashSet<>();
+  private final char[] tryChars;
+  private final SpellChecker speller;
+
+  ModifyingSuggester(SpellChecker speller) {
+    this.speller = speller;
+    tryChars = speller.dictionary.tryChars.toCharArray();
+  }
+
+  LinkedHashSet<String> suggest(String word) {
+    tryRep(word);
+    tryAddingChar(word);
+    return result;
+  }
+
+  private void tryRep(String word) {
+    for (RepEntry entry : speller.dictionary.repTable) {
+      for (String candidate : entry.substitute(word)) {
+        if (trySuggestion(candidate)) {
+          continue;
+        }
+
+        if (candidate.contains(" ")
+            && Arrays.stream(candidate.split(" ")).allMatch(speller::checkWord)) {
+          result.add(candidate);
+        }
+      }
+    }
+  }
+
+  private void tryAddingChar(String word) {
+    for (int i = 0; i <= word.length(); i++) {
+      String prefix = word.substring(0, i);
+      String suffix = word.substring(i);
+      for (char toInsert : tryChars) {
+        trySuggestion(prefix + toInsert + suffix);
+      }
+    }
+  }
+
+  private boolean trySuggestion(String candidate) {
+    if (speller.checkWord(candidate)) {
+      result.add(candidate);
+      return true;
+    }
+    return false;
+  }
+}
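A tiny standalone sketch (plain Java, not the Lucene API) of the candidate enumeration behind tryAddingChar above: one character from the TRY alphabet is inserted at every position of the misspelled word; the real suggester filters each candidate through SpellChecker.checkWord immediately instead of collecting them all.

    import java.util.ArrayList;
    import java.util.List;

    class TryCharsSketch {
      // Every single-character insertion built from the TRY alphabet.
      static List<String> additions(String word, String tryChars) {
        List<String> candidates = new ArrayList<>();
        for (int i = 0; i <= word.length(); i++) {
          for (char c : tryChars.toCharArray()) {
            candidates.add(word.substring(0, i) + c + word.substring(i));
          }
        }
        return candidates;
      }

      public static void main(String[] args) {
        // "vacatin" with TRY "no" yields "vacation" among the candidates
        System.out.println(additions("vacatin", "no").contains("vacation")); // true
      }
    }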
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+class RepEntry {
+  private final String pattern;
+  private final String replacement;
+  private final boolean mustStart;
+  private final boolean mustEnd;
+  private final int patternLen;
+
+  RepEntry(String rawPattern, String rawReplacement) {
+    mustStart = rawPattern.startsWith("^");
+    mustEnd = rawPattern.endsWith("$");
+    pattern = rawPattern.substring(mustStart ? 1 : 0, rawPattern.length() - (mustEnd ? 1 : 0));
+    replacement = rawReplacement.replace('_', ' ');
+    patternLen = pattern.length();
+  }
+
+  List<String> substitute(String word) {
+    if (mustStart) {
+      boolean matches = mustEnd ? word.equals(pattern) : word.startsWith(pattern);
+      return matches
+          ? Collections.singletonList(replacement + word.substring(patternLen))
+          : Collections.emptyList();
+    }
+
+    if (mustEnd) {
+      return word.endsWith(pattern)
+          ? Collections.singletonList(word.substring(0, word.length() - patternLen) + replacement)
+          : Collections.emptyList();
+    }
+
+    int pos = word.indexOf(pattern);
+    if (pos < 0) return Collections.emptyList();
+
+    List<String> result = new ArrayList<>();
+    while (pos >= 0) {
+      result.add(word.substring(0, pos) + replacement + word.substring(pos + patternLen));
+      pos = word.indexOf(pattern, pos + 1);
+    }
+    return result;
+  }
+
+  @Override
+  public String toString() {
+    return (mustStart ? "^" : "") + pattern + (mustEnd ? "$" : "") + "->" + replacement;
+  }
+}
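A scratch illustration of RepEntry.substitute above. RepEntry is package-private, so a snippet like this would have to live in the org.apache.lucene.analysis.hunspell package (for example under the test tree); the expected outputs in the comments are traced by hand from the code.

    package org.apache.lucene.analysis.hunspell;

    public class RepEntryDemo {
      public static void main(String[] args) {
        // "$" anchors the pattern to the end of the word
        System.out.println(new RepEntry("shun$", "tion").substitute("vacashun")); // [vacation]

        // "^...$" requires the whole word to match; "_" in the replacement becomes a space
        System.out.println(new RepEntry("^alot$", "a_lot").substitute("alot"));   // [a lot]

        // an unanchored pattern yields one candidate per occurrence
        System.out.println(new RepEntry("f", "ph").substitute("affix"));          // [aphfix, afphix]
      }
    }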
@@ -23,7 +23,9 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.Set;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -34,9 +36,9 @@ import org.apache.lucene.util.IntsRef;
  * threads). Not all Hunspell features are supported yet.
  */
 public class SpellChecker {
-  private final Dictionary dictionary;
+  final Dictionary dictionary;
+  final Stemmer stemmer;
   private final BytesRef scratch = new BytesRef();
-  private final Stemmer stemmer;
 
   public SpellChecker(Dictionary dictionary) {
     this.dictionary = dictionary;
@@ -128,6 +130,10 @@ public class SpellChecker {
     return false;
   }
 
+  boolean checkWord(String word) {
+    return checkWord(word.toCharArray(), word.length(), null);
+  }
+
   private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
     if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
       return false;
@@ -329,4 +335,44 @@ public class SpellChecker {
         && spell(word.substring(0, breakPos))
         && spell(word.substring(breakPos + breakStr.length()));
   }
+
+  public List<String> suggest(String word) {
+    if (word.length() >= 100) return Collections.emptyList();
+
+    if (dictionary.needsInputCleaning) {
+      word = dictionary.cleanInput(word, new StringBuilder()).toString();
+    }
+
+    ModifyingSuggester modifier = new ModifyingSuggester(this);
+    Set<String> result = modifier.suggest(word);
+
+    if (word.contains("-") && result.stream().noneMatch(s -> s.contains("-"))) {
+      result.addAll(modifyChunksBetweenDashes(word));
+    }
+
+    return new ArrayList<>(result);
+  }
+
+  private List<String> modifyChunksBetweenDashes(String word) {
+    List<String> result = new ArrayList<>();
+    int chunkStart = 0;
+    while (chunkStart < word.length()) {
+      int chunkEnd = word.indexOf('-', chunkStart);
+      if (chunkEnd < 0) {
+        chunkEnd = word.length();
+      }
+
+      if (chunkEnd > chunkStart) {
+        String chunk = word.substring(chunkStart, chunkEnd);
+        if (!spell(chunk)) {
+          for (String chunkSug : suggest(chunk)) {
+            result.add(word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd));
+          }
+        }
+      }
+
+      chunkStart = chunkEnd + 1;
+    }
+    return result;
+  }
 }
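To see the dash handling in suggest() above end to end, a hedged walk-through against the rep.aff/rep.dic fixture added later in this diff: the input "fantom-phorm" is invented, the resource loading and Dictionary constructor are assumptions, and the expected output is traced by hand from the code. No whole-word REP candidate passes checkWord, so each chunk between dashes is corrected separately.

    package org.apache.lucene.analysis.hunspell;

    import java.io.InputStream;
    import org.apache.lucene.store.ByteBuffersDirectory;

    public class DashSuggestDemo {
      public static void main(String[] args) throws Exception {
        // Assumes the rep.aff/rep.dic test resources from this commit are on the classpath.
        try (InputStream aff = DashSuggestDemo.class.getResourceAsStream("rep.aff");
            InputStream dic = DashSuggestDemo.class.getResourceAsStream("rep.dic")) {
          SpellChecker speller =
              new SpellChecker(new Dictionary(new ByteBuffersDirectory(), "tmp", aff, dic));
          // "fantom-phorm" is a made-up input: REP f->ph fixes the first chunk,
          // REP ph->f fixes the second one.
          System.out.println(speller.suggest("fantom-phorm"));
          // expected, per the code above: [phantom-phorm, fantom-form]
        }
      }
    }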
@@ -20,7 +20,9 @@ import java.io.InputStream;
 import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.List;
 import java.util.Objects;
+import java.util.stream.Collectors;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.util.IOUtils;
 import org.junit.Test;
@@ -46,6 +48,10 @@ public class SpellCheckerTest extends StemmerTestBase {
     doTest("allcaps");
   }
 
+  public void rep() throws Exception {
+    doTest("rep");
+  }
+
   @Test
   public void forceUCase() throws Exception {
     doTest("forceucase");
@@ -178,10 +184,22 @@ public class SpellCheckerTest extends StemmerTestBase {
     }
 
     URL wrong = StemmerTestBase.class.getResource(name + ".wrong");
+    URL sug = StemmerTestBase.class.getResource(name + ".sug");
     if (wrong != null) {
-      for (String word : Files.readAllLines(Path.of(wrong.toURI()))) {
+      List<String> wrongWords = Files.readAllLines(Path.of(wrong.toURI()));
+      for (String word : wrongWords) {
         assertFalse("Unexpectedly considered correct: " + word, speller.spell(word));
       }
+      if (sug != null) {
+        String suggestions =
+            wrongWords.stream()
+                .map(s -> String.join(", ", speller.suggest(s)))
+                .filter(s -> !s.isEmpty())
+                .collect(Collectors.joining("\n"));
+        assertEquals(Files.readString(Path.of(sug.toURI())).trim(), suggestions);
+      }
+    } else {
+      assertNull(".sug file without .wrong file!", sug);
     }
   }
 }
@@ -0,0 +1,5 @@
+scott
+scot-free
+foo-bar
+foo-foo-bar
+foo-foo-foo
@@ -0,0 +1,21 @@
+# With REP suggestions, we can fix typical language specific misspellings.
+
+# switch off ngram suggestion for testing
+MAXNGRAMSUGS 0
+
+REP 8
+REP f ph
+REP ph f
+REP shun$ tion
+REP ^alot$ a_lot # add the highest priority for "a lot" suggestion to "alot"
+REP ^foo$ bar
+REP ' _ # "un'alunno" -> "un alunno"
+REP ^vinteún$ vinte_e_un
+REP s 's
+
+
+SFX A Y 1
+SFX A 0 's .
+
+
+WORDCHARS '
@@ -0,0 +1,15 @@
+10
+form
+phantom
+vacation
+vacations
+a
+lot
+un
+alunno
+bar
+barbars
+vinte
+e
+un
+auto/A
@@ -0,0 +1,8 @@
+form
+phantom
+vacation
+a lot, lot
+un alunno
+bar
+vinte e un
+auto's, auto
@@ -0,0 +1,11 @@
+phorm
+fantom
+vacashun
+vacashuns
+alot
+un'alunno
+foo
+foobars
+barfoos
+vinteún
+autos