Some cleanups in NoisyChannelSpellChecker (#40949)
One of the two #getCorrections methods is only used in tests, so we can move it, along with the helper methods it requires, to that test. Also reduce the visibility of several methods to package-private, since the class isn't used outside the package.
commit badb7a22e0
parent 711d2545aa
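The calling convention this leaves behind: only the TokenStream-based getCorrections survives in the production class, so callers that previously handed the checker an Analyzer plus a BytesRef now build the token stream themselves, exactly as PhraseSuggester and the relocated test helper do in the diffs below. A minimal sketch of that pattern, with the checker, generator and scorer assumed to be set up elsewhere and the numeric arguments chosen only for illustration:

package org.elasticsearch.search.suggest.phrase; // same package, since the class is now package-private

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

import java.io.CharArrayReader;
import java.io.IOException;

class NoisyChannelCallerSketch {
    // Illustrative helper mirroring the one this commit moves into NoisyChannelSpellCheckerTests.
    static NoisyChannelSpellChecker.Result correct(NoisyChannelSpellChecker checker, Analyzer analyzer, BytesRef query,
            String field, CandidateGenerator generator, WordScorer scorer) throws IOException {
        // Turn the BytesRef query into the TokenStream that the remaining getCorrections expects.
        CharsRefBuilder spare = new CharsRefBuilder();
        spare.copyUTF8Bytes(query);
        TokenStream stream = analyzer.tokenStream(field, new CharArrayReader(spare.chars(), 0, spare.length()));
        // maxErrors=1, numCorrections=1, confidence=1, gramSize=2 are placeholder values.
        return checker.getCorrections(stream, generator, 1f, 1, scorer, 1f, 2);
    }
}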
NoisyChannelSpellChecker.java
@@ -18,48 +18,34 @@
  */
 package org.elasticsearch.search.suggest.phrase;

-import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.codecs.TermStats;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.CharsRefBuilder;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;

-import java.io.CharArrayReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

-//TODO public for tests
-public final class NoisyChannelSpellChecker {
+final class NoisyChannelSpellChecker {
     public static final double REAL_WORD_LIKELIHOOD = 0.95d;
     public static final int DEFAULT_TOKEN_LIMIT = 10;
     private final double realWordLikelihood;
     private final boolean requireUnigram;
     private final int tokenLimit;

-    public NoisyChannelSpellChecker() {
-        this(REAL_WORD_LIKELIHOOD);
-    }
-
-    public NoisyChannelSpellChecker(double nonErrorLikelihood) {
-        this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT);
-    }
-
-    public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
+    NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
         this.realWordLikelihood = nonErrorLikelihood;
         this.requireUnigram = requireUnigram;
         this.tokenLimit = tokenLimit;

     }

-    public Result getCorrections(TokenStream stream, final CandidateGenerator generator,
+    Result getCorrections(TokenStream stream, final CandidateGenerator generator,
             float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {

         final List<CandidateSet> candidateSetsList = new ArrayList<>();
@@ -131,26 +117,12 @@ public final class NoisyChannelSpellChecker {
         return new Result(bestCandidates, cutoffScore);
     }

-    public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
-            float maxErrors, int numCorrections, IndexReader reader, String analysisField,
-            WordScorer scorer, float confidence, int gramSize) throws IOException {
-
-        return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors,
-            numCorrections, scorer, confidence, gramSize);
-
-    }
-
-    public TokenStream tokenStream(Analyzer analyzer, BytesRef query, CharsRefBuilder spare, String field) throws IOException {
-        spare.copyUTF8Bytes(query);
-        return analyzer.tokenStream(field, new CharArrayReader(spare.chars(), 0, spare.length()));
-    }
-
-    public static class Result {
+    static class Result {
         public static final Result EMPTY = new Result(Correction.EMPTY, Double.MIN_VALUE);
         public final Correction[] corrections;
         public final double cutoffScore;

-        public Result(Correction[] corrections, double cutoffScore) {
+        private Result(Correction[] corrections, double cutoffScore) {
             this.corrections = corrections;
             this.cutoffScore = cutoffScore;
         }
PhraseSuggester.java
@@ -19,6 +19,7 @@
 package org.elasticsearch.search.suggest.phrase;

+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiTerms;
@@ -45,6 +46,7 @@ import org.elasticsearch.search.suggest.Suggester;
 import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
 import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result;

+import java.io.CharArrayReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -93,11 +95,12 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
             WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestTerms, suggestField, realWordErrorLikelihood,
                 separator);
             Result checkerResult;
-            try (TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField())) {
+            try (TokenStream stream = tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare,
+                    suggestion.getField())) {
                 checkerResult = checker.getCorrections(stream,
                     new MultiCandidateGeneratorWrapper(suggestion.getShardSize(), gens.toArray(new CandidateGenerator[gens.size()])),
                     suggestion.maxErrors(), suggestion.getShardSize(), wordScorer, suggestion.confidence(), suggestion.gramSize());
             }
         }

         PhraseSuggestion.Entry resultEntry = buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
         response.addTerm(resultEntry);
@@ -144,6 +147,11 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
         return response;
     }

+    private static TokenStream tokenStream(Analyzer analyzer, BytesRef query, CharsRefBuilder spare, String field) throws IOException {
+        spare.copyUTF8Bytes(query);
+        return analyzer.tokenStream(field, new CharArrayReader(spare.chars(), 0, spare.length()));
+    }
+
     private static PhraseSuggestion.Entry buildResultEntry(SuggestionContext suggestion, CharsRefBuilder spare, double cutoffScore) {
         spare.copyUTF8Bytes(suggestion.getText());
         return new PhraseSuggestion.Entry(new Text(spare.toString()), 0, spare.length(), cutoffScore);
NoisyChannelSpellCheckerTests.java
@@ -21,6 +21,7 @@ package org.elasticsearch.search.suggest.phrase;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
@@ -34,6 +35,7 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.MultiTerms;
@@ -42,14 +44,18 @@ import org.apache.lucene.search.spell.SuggestMode;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
 import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result;
 import org.elasticsearch.test.ESTestCase;

+import java.io.CharArrayReader;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;

+import static org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.DEFAULT_TOKEN_LIMIT;
+import static org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.REAL_WORD_LIKELIHOOD;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;

@@ -113,12 +119,12 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         WordScorer wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
             new BytesRef(" "), 0.5f);

-        NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
+        NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT);
         DirectSpellChecker spellchecker = new DirectSpellChecker();
         spellchecker.setMinQueryLength(1);
         DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR,
             ir, 0.95, 5);
-        Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
+        Result result = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1,
             ir, "body", wordScorer, 1, 2);
         Correction[] corrections = result.corrections;
         assertThat(corrections.length, equalTo(1));
@@ -126,7 +132,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>"));
         assertThat(result.cutoffScore, greaterThan(0d));

-        result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
+        result = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1,
             ir, "body", wordScorer, 0, 1);
         corrections = result.corrections;
         assertThat(corrections.length, equalTo(1));
@@ -134,10 +140,10 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame"));
         assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));

-        suggester = new NoisyChannelSpellChecker(0.85);
+        suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT);
         wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
             new BytesRef(" "), 0.5f);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
             ir, "body", wordScorer, 0, 2).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
@@ -149,7 +155,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn</em> the <em>god</em> jewel"));
         assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the got jewel"));

-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f,
             4, ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
@@ -158,10 +164,10 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));

         // Test some of the highlighting corner cases
-        suggester = new NoisyChannelSpellChecker(0.85);
+        suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT);
         wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
             new BytesRef(" "), 0.5f);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4,
             ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
@@ -195,17 +201,17 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         spellchecker.setAccuracy(0.0f);
         spellchecker.setMinPrefix(1);
         spellchecker.setMinQueryLength(1);
-        suggester = new NoisyChannelSpellChecker(0.85);
+        suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT);
         wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
             new BytesRef(" "), 0.5f);
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4,
+        corrections = getCorrections(suggester, analyzer, new BytesRef("captian usa"), generator, 2, 4,
             ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));

         generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
             10, null, analyzer, MultiTerms.getTerms(ir, "body"));
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4,
+        corrections = getCorrections(suggester, analyzer, new BytesRef("captian usw"), generator, 2, 4,
             ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
@@ -213,7 +219,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
         generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
             10, null, analyzer, MultiTerms.getTerms(ir, "body"));
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir,
+        corrections = getCorrections(suggester, analyzer, new BytesRef("captain usw"), generator, 2, 4, ir,
             "body", wordScorer, 1, 2).corrections;
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>"));
@@ -282,7 +288,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         DirectoryReader ir = DirectoryReader.open(writer);
         LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
             new BytesRef(" "), 0.5f);
-        NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
+        NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT);
         DirectSpellChecker spellchecker = new DirectSpellChecker();
         spellchecker.setMinQueryLength(1);
         DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir,
@@ -291,27 +297,27 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
             0.95, 10, wrapper, wrapper, MultiTerms.getTerms(ir, "body_reverse"));
         CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);

-        Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1,
+        Correction[] corrections = getCorrections(suggester, wrapper, new BytesRef("american cae"), generator, 1, 1,
             ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

         generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir,
             "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

-        corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("american cae"), forward, 1, 1, ir,
             "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix

-        corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("america cae"), generator, 2, 1, ir,
             "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

-        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir,
             "body", wordScorer, 0, 2).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
@@ -319,18 +325,18 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));


-        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir,
             "body", wordScorer, 1.5f, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));

-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir,
             "body", wordScorer, 1.5f, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));

         // Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer.
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir,
             "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii"));
@@ -391,24 +397,24 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
             new BytesRef(" "), 0.5, 0.4, 0.1);

-        NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
+        NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT);
         DirectSpellChecker spellchecker = new DirectSpellChecker();
         spellchecker.setMinQueryLength(1);
         DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir,
             0.95, 5);
-        Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
+        Correction[] corrections = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1,
             ir, "body", wordScorer, 1, 3).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));

-        corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1,
             ir, "body", wordScorer, 1, 1).corrections;
         assertThat(corrections.length, equalTo(0));
         // assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));

         wordScorer = new LinearInterpolatingScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
             new BytesRef(" "), 0.5, 0.4, 0.1);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
             ir, "body", wordScorer, 0, 3).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
@@ -419,7 +425,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {



-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
             ir, "body", wordScorer, 1, 3).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
@@ -428,7 +434,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));


-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1,
             ir, "body", wordScorer, 100, 3).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
@@ -456,23 +462,23 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         spellchecker.setAccuracy(0.0f);
         spellchecker.setMinPrefix(1);
         spellchecker.setMinQueryLength(1);
-        suggester = new NoisyChannelSpellChecker(0.95);
+        suggester = new NoisyChannelSpellChecker(0.95, true, DEFAULT_TOKEN_LIMIT);
         wordScorer = new LinearInterpolatingScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
             new BytesRef(" "), 0.5, 0.4, 0.1);
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4,
+        corrections = getCorrections(suggester, analyzer, new BytesRef("captian usa"), generator, 2, 4,
             ir, "body", wordScorer, 1, 3).corrections;
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));

         generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95,
             10, null, analyzer, MultiTerms.getTerms(ir, "body"));
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4,
+        corrections = getCorrections(suggester, analyzer, new BytesRef("captian usw"), generator, 2, 4,
             ir, "body", wordScorer, 1, 3).corrections;
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));


         wordScorer = new StupidBackoffScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
             new BytesRef(" "), 0.4);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2,
+        corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2,
             ir, "body", wordScorer, 0, 3).corrections;
         assertThat(corrections.length, equalTo(2));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
@@ -494,11 +500,11 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         try (DirectoryReader ir = DirectoryReader.open(dir)) {
             WordScorer wordScorer = new StupidBackoffScorer(ir, MultiTerms.getTerms(ir, "field"), "field", 0.95d,
                 new BytesRef(" "), 0.4f);
-            NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
+            NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT);
             DirectSpellChecker spellchecker = new DirectSpellChecker();
             DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field",
                 SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
-            Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1,
+            Result result = getCorrections(suggester, new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1,
                 ir, "field", wordScorer, 1, 2);
             assertThat(result.corrections.length, equalTo(1));
             assertThat(result.corrections[0].join(space).utf8ToString(), equalTo("value"));
@@ -506,4 +512,13 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         }
     }

+    private Result getCorrections(NoisyChannelSpellChecker checker, Analyzer analyzer, BytesRef query, CandidateGenerator generator,
+            float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence,
+            int gramSize) throws IOException {
+        CharsRefBuilder spare = new CharsRefBuilder();
+        spare.copyUTF8Bytes(query);
+        TokenStream tokenStream = analyzer.tokenStream(analysisField, new CharArrayReader(spare.chars(), 0, spare.length()));
+        return checker.getCorrections(tokenStream, generator, maxErrors, numCorrections, scorer, confidence, gramSize);
+    }
+
 }