From 939699f5509673c5d7d3e650ebda3d240b259044 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Wed, 20 Jan 2021 10:57:27 +0100 Subject: [PATCH] LUCENE-9667: Hunspell: add spellchecker API, support BREAK and FORBIDDENWORD affix rules (#2207) --- gradle/validation/rat-sources.gradle | 2 + lucene/CHANGES.txt | 4 +- .../lucene/analysis/hunspell/Dictionary.java | 76 +++++++++++++ .../analysis/hunspell/SpellChecker.java | 104 ++++++++++++++++++ .../lucene/analysis/hunspell/Stemmer.java | 2 +- .../analysis/hunspell/SpellCheckerTest.java | 71 ++++++++++++ .../apache/lucene/analysis/hunspell/break.aff | 10 ++ .../apache/lucene/analysis/hunspell/break.dic | 7 ++ .../lucene/analysis/hunspell/break.good | 12 ++ .../lucene/analysis/hunspell/break.wrong | 13 +++ .../lucene/analysis/hunspell/breakdefault.aff | 6 + .../lucene/analysis/hunspell/breakdefault.dic | 6 + .../analysis/hunspell/breakdefault.good | 7 ++ .../analysis/hunspell/breakdefault.wrong | 6 + .../lucene/analysis/hunspell/breakoff.aff | 7 ++ .../lucene/analysis/hunspell/breakoff.dic | 6 + .../lucene/analysis/hunspell/breakoff.good | 3 + .../lucene/analysis/hunspell/breakoff.wrong | 5 + 18 files changed, 344 insertions(+), 3 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong diff --git a/gradle/validation/rat-sources.gradle b/gradle/validation/rat-sources.gradle index 3298b956d82..9454f0f6f75 100644 --- a/gradle/validation/rat-sources.gradle +++ b/gradle/validation/rat-sources.gradle @@ -54,6 +54,8 @@ configure(project(":lucene:analysis:common")) { srcExcludes += [ "**/*.aff", "**/*.dic", + "**/*.wrong", + "**/*.good", "**/charfilter/*.htm*", "**/*LuceneResourcesWikiPage.html" ] diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 92c0c3211bb..fa1d09cffa6 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -86,8 +86,8 @@ API Changes Improvements -* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words - (Peter Gromov) +* LUCENE-9665 LUCENE-9676 LUCENE-9667 : Hunspell improvements: add SpellChecker API, support default encoding and + BREAK/FORBIDDENWORD affix rules, improve stemming of all-caps words (Peter Gromov) * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). (Dawid Weiss) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 34edb73c1a1..bf4b1d06eb3 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -34,13 +34,16 @@ import java.nio.file.Paths; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -87,6 +90,8 @@ public class Dictionary { private static final String OCONV_KEY = "OCONV"; private static final String FULLSTRIP_KEY = "FULLSTRIP"; private static final String LANG_KEY = "LANG"; + private static final String BREAK_KEY = "BREAK"; + private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD"; private static final String KEEPCASE_KEY = "KEEPCASE"; private static final String NEEDAFFIX_KEY = "NEEDAFFIX"; private static final String PSEUDOROOT_KEY = "PSEUDOROOT"; @@ -103,6 +108,7 @@ public class Dictionary { FST prefixes; FST suffixes; + Breaks breaks = Breaks.DEFAULT; // all condition checks used by prefixes and suffixes. these are typically re-used across // many affix stripping rules. so these are deduplicated, to save RAM. @@ -155,6 +161,7 @@ public class Dictionary { int circumfix = -1; // circumfix flag, or -1 if one is not defined int keepcase = -1; // keepcase flag, or -1 if one is not defined int needaffix = -1; // needaffix flag, or -1 if one is not defined + int forbiddenword = -1; // forbiddenword flag, or -1 if one is not defined int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined // ignored characters (dictionary, affix, inputs) @@ -256,6 +263,10 @@ public class Dictionary { } } + int formStep() { + return hasStemExceptions ? 2 : 1; + } + /** Looks up Hunspell word forms from the dictionary */ IntsRef lookupWord(char[] word, int offset, int length) { return lookup(words, word, offset, length); @@ -400,6 +411,14 @@ public class Dictionary { } else if (line.startsWith(LANG_KEY)) { language = line.substring(LANG_KEY.length()).trim(); alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language); + } else if (line.startsWith(BREAK_KEY)) { + breaks = parseBreaks(reader, line); + } else if (line.startsWith(FORBIDDENWORD_KEY)) { + String[] parts = line.split("\\s+"); + if (parts.length != 2) { + throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber()); + } + forbiddenword = flagParsingStrategy.parseFlag(parts[1]); } } @@ -423,6 +442,30 @@ public class Dictionary { stripOffsets[currentIndex] = currentOffset; } + private Breaks parseBreaks(LineNumberReader reader, String line) + throws IOException, ParseException { + Set starting = new LinkedHashSet<>(); + Set ending = new LinkedHashSet<>(); + Set middle = new LinkedHashSet<>(); + int num = Integer.parseInt(line.substring(BREAK_KEY.length()).trim()); + for (int i = 0; i < num; i++) { + line = reader.readLine(); + String[] parts = line.split("\\s+"); + if (!line.startsWith(BREAK_KEY) || parts.length != 2) { + throw new ParseException("BREAK chars expected", reader.getLineNumber()); + } + String breakStr = parts[1]; + if (breakStr.startsWith("^")) { + starting.add(breakStr.substring(1)); + } else if (breakStr.endsWith("$")) { + ending.add(breakStr.substring(0, breakStr.length() - 1)); + } else { + middle.add(breakStr); + } + } + return new Breaks(starting, ending, middle); + } + private FST affixFST(TreeMap> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs); @@ -1143,6 +1186,22 @@ public class Dictionary { return null; } + boolean isForbiddenWord(char[] word, BytesRef scratch) { + if (forbiddenword != -1) { + IntsRef forms = lookupWord(word, 0, word.length); + if (forms != null) { + int formStep = formStep(); + for (int i = 0; i < forms.length; i += formStep) { + flagLookup.get(forms.ints[forms.offset + i], scratch); + if (hasFlag(Dictionary.decodeFlags(scratch), (char) forbiddenword)) { + return true; + } + } + } + } + return false; + } + /** Abstraction of the process of parsing flags taken from the affix and dic files */ abstract static class FlagParsingStrategy { @@ -1371,4 +1430,21 @@ public class Dictionary { return DEFAULT_TEMP_DIR; } + + /** Possible word breaks according to BREAK directives */ + static class Breaks { + private static final Set MINUS = Collections.singleton("-"); + static final Breaks DEFAULT = new Breaks(MINUS, MINUS, MINUS); + final String[] starting, ending, middle; + + Breaks(Collection starting, Collection ending, Collection middle) { + this.starting = starting.toArray(new String[0]); + this.ending = ending.toArray(new String[0]); + this.middle = middle.toArray(new String[0]); + } + + boolean isNotEmpty() { + return middle.length > 0 || starting.length > 0 || ending.length > 0; + } + } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java new file mode 100644 index 00000000000..741fdc4acae --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import org.apache.lucene.util.BytesRef; + +/** + * A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe + * (but a single underlying Dictionary can be shared by multiple spell-checkers in different + * threads). Not all Hunspell features are supported yet. + */ +public class SpellChecker { + private final Dictionary dictionary; + private final BytesRef scratch = new BytesRef(); + private final Stemmer stemmer; + + public SpellChecker(Dictionary dictionary) { + this.dictionary = dictionary; + stemmer = new Stemmer(dictionary); + } + + /** @return whether the given word's spelling is considered correct according to Hunspell rules */ + public boolean spell(String word) { + char[] wordChars = word.toCharArray(); + if (dictionary.isForbiddenWord(wordChars, scratch)) { + return false; + } + + if (!stemmer.stem(wordChars, word.length()).isEmpty()) { + return true; + } + + if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) { + return tryBreaks(word); + } + + return false; + } + + private boolean tryBreaks(String word) { + for (String br : dictionary.breaks.starting) { + if (word.length() > br.length() && word.startsWith(br)) { + if (spell(word.substring(br.length()))) { + return true; + } + } + } + + for (String br : dictionary.breaks.ending) { + if (word.length() > br.length() && word.endsWith(br)) { + if (spell(word.substring(0, word.length() - br.length()))) { + return true; + } + } + } + + for (String br : dictionary.breaks.middle) { + int pos = word.indexOf(br); + if (canBeBrokenAt(word, br, pos)) { + return true; + } + + // try to break at the second occurrence + // to recognize dictionary words with a word break + if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) { + return true; + } + } + return false; + } + + private boolean hasTooManyBreakOccurrences(String word) { + int occurrences = 0; + for (String br : dictionary.breaks.middle) { + int pos = 0; + while ((pos = word.indexOf(br, pos)) >= 0) { + if (++occurrences >= 10) return true; + pos += br.length(); + } + } + return false; + } + + private boolean canBeBrokenAt(String word, String breakStr, int breakPos) { + return breakPos > 0 + && breakPos < word.length() - breakStr.length() + && spell(word.substring(0, breakPos)) + && spell(word.substring(breakPos + breakStr.length())); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 0e06d3df1ad..10ae99202ef 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -64,7 +64,7 @@ final class Stemmer { suffixReaders[level] = dictionary.suffixes.getBytesReader(); } } - formStep = dictionary.hasStemExceptions ? 2 : 1; + formStep = dictionary.formStep(); } /** diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java new file mode 100644 index 00000000000..7be4eafa8d8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import java.io.InputStream; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Objects; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.util.IOUtils; + +public class SpellCheckerTest extends StemmerTestBase { + + public void testBreak() throws Exception { + doTest("break"); + } + + public void testBreakDefault() throws Exception { + doTest("breakdefault"); + } + + public void testBreakOff() throws Exception { + doTest("breakoff"); + } + + protected void doTest(String name) throws Exception { + InputStream affixStream = + Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name); + InputStream dictStream = + Objects.requireNonNull(getClass().getResourceAsStream(name + ".dic"), name); + + SpellChecker speller; + try { + Dictionary dictionary = + new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream); + speller = new SpellChecker(dictionary); + } finally { + IOUtils.closeWhileHandlingException(affixStream); + IOUtils.closeWhileHandlingException(dictStream); + } + + URL good = StemmerTestBase.class.getResource(name + ".good"); + if (good != null) { + for (String word : Files.readAllLines(Path.of(good.toURI()))) { + assertTrue("Unexpectedly considered misspelled: " + word, speller.spell(word)); + } + } + + URL wrong = StemmerTestBase.class.getResource(name + ".wrong"); + if (wrong != null) { + for (String word : Files.readAllLines(Path.of(wrong.toURI()))) { + assertFalse("Unexpectedly considered correct: " + word, speller.spell(word)); + } + } + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff new file mode 100644 index 00000000000..55d0609a3b3 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff @@ -0,0 +1,10 @@ +# word break points test, recursive break at dash and n-dash +SET UTF-8 + +BREAK 2 +BREAK - +BREAK – + +WORDCHARS -– + +FORBIDDENWORD ! diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic new file mode 100644 index 00000000000..b2c5741b483 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic @@ -0,0 +1,7 @@ +6 +foo +bar +baz +fox-bax +foo-baz/! +e-mail diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good new file mode 100644 index 00000000000..d651a63a50e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good @@ -0,0 +1,12 @@ +foo +bar +fox-bax +foo-bar +foo–bar +foo-bar-foo-bar +foo-bar–foo-bar +bar-baz +baz-foo +foo-bar-foo-bar-foo-bar-foo-bar-foo-bar +e-mail +e-mail-foo diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong new file mode 100644 index 00000000000..d03b4023469 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong @@ -0,0 +1,13 @@ +fox +bax +-foo +bar- +fox-bar +foo-bax +foo–bax +fox–bar +foo-bar-fox-bar +foo-bax-foo-bar +foo-bar–fox-bar +foo-bax–foo-bar +foo-baz diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff new file mode 100644 index 00000000000..a13f464a60f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.aff @@ -0,0 +1,6 @@ +# default word break at hyphens and n-dashes + +SET UTF-8 +MAXNGRAMSUGS 0 +WORDCHARS - +TRY ot diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic new file mode 100644 index 00000000000..bf29960357e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.dic @@ -0,0 +1,6 @@ +3 +foo +bar +free +scott +scot-free diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good new file mode 100644 index 00000000000..8d812545713 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.good @@ -0,0 +1,7 @@ +foo +bar +foo- +-foo +scot-free +foo-bar +foo-bar-foo-bar diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong new file mode 100644 index 00000000000..e070c5c07e8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakdefault.wrong @@ -0,0 +1,6 @@ +scot +sco-free +fo-bar +foo-fo-bar +foo-foo-fo +- \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff new file mode 100644 index 00000000000..2e83d380231 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.aff @@ -0,0 +1,7 @@ +# switch off default word break at hyphens and n-dashes by BREAK 0 +SET UTF-8 +MAXNGRAMSUGS 0 +WORDCHARS - +TRY ot + +BREAK 0 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic new file mode 100644 index 00000000000..bf29960357e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.dic @@ -0,0 +1,6 @@ +3 +foo +bar +free +scott +scot-free diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good new file mode 100644 index 00000000000..854b39efad2 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.good @@ -0,0 +1,3 @@ +foo +bar +scot-free diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong new file mode 100644 index 00000000000..a6fcf7f1e21 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/breakoff.wrong @@ -0,0 +1,5 @@ +foo- +-foo +foo-bar +foo-bar-foo-bar +scot