LUCENE-9749: Hunspell: apply output conversion (OCONV) to the suggestions (#2329)

This commit is contained in:
Peter Gromov 2021-02-10 09:17:44 +01:00 committed by GitHub
parent f2b7cdc491
commit 5fd18881e9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 51 additions and 1 deletion

View File

@ -22,12 +22,14 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
@ -438,7 +440,7 @@ public class SpellChecker {
result.add(candidate);
}
}
return new ArrayList<>(result);
return result.stream().map(this::cleanOutput).collect(Collectors.toList());
}
private String adjustSuggestionCase(String candidate, WordCase original) {
@ -480,4 +482,16 @@ public class SpellChecker {
}
return result;
}
/**
 * Applies the dictionary's output conversion (OCONV) mappings to a single suggestion.
 *
 * @param s the suggestion to convert
 * @return {@code s} itself when the dictionary reports that no output cleaning is needed;
 *     otherwise the text produced by running {@code dictionary.oconv} over a copy of {@code s}
 */
private String cleanOutput(String s) {
  if (dictionary.needsOutputCleaning) {
    StringBuilder converted = new StringBuilder(s);
    try {
      // The mappings are applied in place on the builder.
      Dictionary.applyMappings(dictionary.oconv, converted);
    } catch (IOException bogus) {
      // Only declared by applyMappings' signature; rethrown unchecked so callers
      // (including the stream pipeline that maps over suggestions) need no throws clause.
      throw new RuntimeException(bogus);
    }
    return converted.toString();
  }
  return s;
}
}

View File

@ -172,6 +172,10 @@ public class SpellCheckerTest extends StemmerTestBase {
doTest("germancompounding");
}
// LUCENE-9749: checks that output conversion (OCONV) is applied to suggestions
// by running the shared doTest helper on the "oconv" fixture.
// NOTE(review): presumably doTest loads the oconv.* resource files — confirm in StemmerTestBase/doTest.
public void testApplyOconvToSuggestions() throws Exception {
doTest("oconv");
}
// Runs the shared doTest helper on the "sug" fixture.
// NOTE(review): judging by the method name this covers suggestions produced by modifying
// the misspelled word; the exact expectations live in the fixture files — confirm there.
public void testModifyingSuggestions() throws Exception {
doTest("sug");
}

View File

@ -0,0 +1,20 @@
# output conversion
SET UTF-8
# Testing also whitespace and comments.
OCONV 7 # space, space
OCONV a A # tab, space, space
OCONV á Á # tab, tab, space
OCONV b B # tab, tab, tab
OCONV c C # 2xspace, 2xspace, 2xtab
OCONV d D # tab+space, space+tab, space
OCONV e E #
OCONV é É
# Only comment. Note that line above ends with space+tab.
# space
# 2xspace
# tab
# 2xtab
# space+tab
# tab+space

View File

@ -0,0 +1,4 @@
3
bébé
dádá
aábcdeé

View File

@ -0,0 +1,2 @@
bébé
dádá

View File

@ -0,0 +1,3 @@
BÉBÉ
DÁDÁ
AÁBCDEÉ

View File

@ -0,0 +1,3 @@
béb
dád
aábcde