diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java index 7f225f1c3ea..9612d29f4f5 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java @@ -18,48 +18,34 @@ */ package org.elasticsearch.search.suggest.phrase; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.codecs.TermStats; -import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.CharsRefBuilder; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet; -import java.io.CharArrayReader; import java.io.IOException; import java.util.ArrayList; import java.util.List; -//TODO public for tests -public final class NoisyChannelSpellChecker { +final class NoisyChannelSpellChecker { public static final double REAL_WORD_LIKELIHOOD = 0.95d; public static final int DEFAULT_TOKEN_LIMIT = 10; private final double realWordLikelihood; private final boolean requireUnigram; private final int tokenLimit; - public NoisyChannelSpellChecker() { - this(REAL_WORD_LIKELIHOOD); - } - - public NoisyChannelSpellChecker(double nonErrorLikelihood) { - this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT); - } - - public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) { + NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) { this.realWordLikelihood = nonErrorLikelihood; this.requireUnigram = requireUnigram; this.tokenLimit = tokenLimit; - } - public Result getCorrections(TokenStream stream, final CandidateGenerator generator, + Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException { final List candidateSetsList = new ArrayList<>(); @@ -131,26 +117,12 @@ public final class NoisyChannelSpellChecker { return new Result(bestCandidates, cutoffScore); } - public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator, - float maxErrors, int numCorrections, IndexReader reader, String analysisField, - WordScorer scorer, float confidence, int gramSize) throws IOException { - - return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, - numCorrections, scorer, confidence, gramSize); - - } - - public TokenStream tokenStream(Analyzer analyzer, BytesRef query, CharsRefBuilder spare, String field) throws IOException { - spare.copyUTF8Bytes(query); - return analyzer.tokenStream(field, new CharArrayReader(spare.chars(), 0, spare.length())); - } - - public static class Result { + static class Result { public static final Result EMPTY = new Result(Correction.EMPTY, Double.MIN_VALUE); public final Correction[] corrections; public final double cutoffScore; - public Result(Correction[] corrections, double cutoffScore) { + private Result(Correction[] corrections, double cutoffScore) { this.corrections = corrections; this.cutoffScore = cutoffScore; } diff --git a/server/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java b/server/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java index 413afd155d4..d80fd68dacb 100644 --- a/server/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java +++ b/server/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java @@ -19,6 +19,7 @@ package org.elasticsearch.search.suggest.phrase; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiTerms; @@ -45,6 +46,7 @@ import org.elasticsearch.search.suggest.Suggester; import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext; import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result; +import java.io.CharArrayReader; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -93,11 +95,12 @@ public final class PhraseSuggester extends Suggester { WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestTerms, suggestField, realWordErrorLikelihood, separator); Result checkerResult; - try (TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField())) { + try (TokenStream stream = tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, + suggestion.getField())) { checkerResult = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(), gens.toArray(new CandidateGenerator[gens.size()])), suggestion.maxErrors(), suggestion.getShardSize(), wordScorer, suggestion.confidence(), suggestion.gramSize()); - } + } PhraseSuggestion.Entry resultEntry = buildResultEntry(suggestion, spare, checkerResult.cutoffScore); response.addTerm(resultEntry); @@ -144,6 +147,11 @@ public final class PhraseSuggester extends Suggester { return response; } + private static TokenStream tokenStream(Analyzer analyzer, BytesRef query, CharsRefBuilder spare, String field) throws IOException { + spare.copyUTF8Bytes(query); + return analyzer.tokenStream(field, new CharArrayReader(spare.chars(), 0, spare.length())); + } + private static PhraseSuggestion.Entry buildResultEntry(SuggestionContext suggestion, CharsRefBuilder spare, double cutoffScore) { spare.copyUTF8Bytes(suggestion.getText()); return new PhraseSuggestion.Entry(new Text(spare.toString()), 0, spare.length(), cutoffScore); diff --git a/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java b/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java index d819d880c86..94c5bf329eb 100644 --- a/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java +++ b/server/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java @@ -21,6 +21,7 @@ package org.elasticsearch.search.suggest.phrase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; @@ -34,6 +35,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.MultiTerms; @@ -42,14 +44,18 @@ import org.apache.lucene.search.spell.SuggestMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRefBuilder; import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result; import org.elasticsearch.test.ESTestCase; +import java.io.CharArrayReader; import java.io.IOException; import java.io.StringReader; import java.util.HashMap; import java.util.Map; +import static org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.DEFAULT_TOKEN_LIMIT; +import static org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.REAL_WORD_LIKELIHOOD; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; @@ -113,12 +119,12 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { WordScorer wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); - NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); + NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + Result result = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); Correction[] corrections = result.corrections; assertThat(corrections.length, equalTo(1)); @@ -126,7 +132,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ace")); assertThat(result.cutoffScore, greaterThan(0d)); - result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + result = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1); corrections = result.corrections; assertThat(corrections.length, equalTo(1)); @@ -134,10 +140,10 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame")); assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE)); - suggester = new NoisyChannelSpellChecker(0.85); + suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT); wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); @@ -149,7 +155,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("xorr the got jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, + corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); @@ -158,10 +164,10 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel")); // Test some of the highlighting corner cases - suggester = new NoisyChannelSpellChecker(0.85); + suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT); wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, + corrections = getCorrections(suggester, wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); @@ -195,17 +201,17 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); - suggester = new NoisyChannelSpellChecker(0.85); + suggester = new NoisyChannelSpellChecker(0.85, true, DEFAULT_TOKEN_LIMIT); wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, + corrections = getCorrections(suggester, analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiTerms.getTerms(ir, "body")); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, + corrections = getCorrections(suggester, analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); @@ -213,7 +219,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiTerms.getTerms(ir, "body")); - corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, + corrections = getCorrections(suggester, analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain america")); @@ -282,7 +288,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { DirectoryReader ir = DirectoryReader.open(writer); LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); - NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); + NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, @@ -291,27 +297,27 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { 0.95, 10, wrapper, wrapper, MultiTerms.getTerms(ir, "body_reverse")); CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse); - Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, + Correction[] corrections = getCorrections(suggester, wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); generator = new MultiCandidateGeneratorWrapper(5, forward, reverse); - corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, + corrections = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, + corrections = getCorrections(suggester, wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix - corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, + corrections = getCorrections(suggester, wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, + corrections = getCorrections(suggester, wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); @@ -319,18 +325,18 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, + corrections = getCorrections(suggester, wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, + corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); // Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer. - corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, + corrections = getCorrections(suggester, wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii")); @@ -391,24 +397,24 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); - NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); + NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + Correction[] corrections = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); - corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, + corrections = getCorrections(suggester, wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections; assertThat(corrections.length, equalTo(0)); // assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape")); wordScorer = new LinearInterpolatingScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); @@ -419,7 +425,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, + corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); @@ -428,7 +434,7 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, + corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); @@ -456,23 +462,23 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); - suggester = new NoisyChannelSpellChecker(0.95); + suggester = new NoisyChannelSpellChecker(0.95, true, DEFAULT_TOKEN_LIMIT); wordScorer = new LinearInterpolatingScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, + corrections = getCorrections(suggester, analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiTerms.getTerms(ir, "body")); - corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, + corrections = getCorrections(suggester, analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); wordScorer = new StupidBackoffScorer(ir, MultiTerms.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4); - corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, + corrections = getCorrections(suggester, wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(2)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); @@ -494,11 +500,11 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { try (DirectoryReader ir = DirectoryReader.open(dir)) { WordScorer wordScorer = new StupidBackoffScorer(ir, MultiTerms.getTerms(ir, "field"), "field", 0.95d, new BytesRef(" "), 0.4f); - NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); + NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(REAL_WORD_LIKELIHOOD, true, DEFAULT_TOKEN_LIMIT); DirectSpellChecker spellchecker = new DirectSpellChecker(); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); - Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1, + Result result = getCorrections(suggester, new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1, ir, "field", wordScorer, 1, 2); assertThat(result.corrections.length, equalTo(1)); assertThat(result.corrections[0].join(space).utf8ToString(), equalTo("value")); @@ -506,4 +512,13 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase { } } + private Result getCorrections(NoisyChannelSpellChecker checker, Analyzer analyzer, BytesRef query, CandidateGenerator generator, + float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, + int gramSize) throws IOException { + CharsRefBuilder spare = new CharsRefBuilder(); + spare.copyUTF8Bytes(query); + TokenStream tokenStream = analyzer.tokenStream(analysisField, new CharArrayReader(spare.chars(), 0, spare.length())); + return checker.getCorrections(tokenStream, generator, maxErrors, numCorrections, scorer, confidence, gramSize); + } + }