Recheck cutoffScore during phrase_suggest merge.

The goal is to throw out suggestions that only meet the cutoff in some
shards.  This will happen if your input phrase is only contained in a
few shards.  If your shards are unbalanced, this rechecking can throw out
good suggestions.

Closes #3547.
This commit is contained in:
Nik Everett 2013-08-21 11:05:53 -04:00 committed by Simon Willnauer
parent 76939b82d3
commit 10e55bd3ef
6 changed files with 279 additions and 57 deletions

View File

@ -31,6 +31,7 @@ import org.elasticsearch.common.xcontent.XContentBuilderString;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
import org.elasticsearch.search.suggest.completion.CompletionSuggestion;
import org.elasticsearch.search.suggest.phrase.PhraseSuggestion;
import org.elasticsearch.search.suggest.term.TermSuggestion;
import java.io.IOException;
@ -119,6 +120,9 @@ public class Suggest implements Iterable<Suggest.Suggestion<? extends Entry<? ex
case CompletionSuggestion.TYPE:
suggestion = new CompletionSuggestion();
break;
case PhraseSuggestion.TYPE:
suggestion = new PhraseSuggestion();
break;
default:
suggestion = new Suggestion<Entry<Option>>();
break;
@ -357,7 +361,7 @@ public class Suggest implements Iterable<Suggest.Suggestion<? extends Entry<? ex
CollectionUtil.timSort(options, comparator);
}
protected Entry<O> reduce(List<Entry<O>> toReduce) {
protected Entry<O> reduce(List<? extends Entry<O>> toReduce) {
if (toReduce.size() == 1) {
return toReduce.get(0);
}
@ -367,20 +371,29 @@ public class Suggest implements Iterable<Suggest.Suggestion<? extends Entry<? ex
assert leader.text.equals(entry.text);
assert leader.offset == entry.offset;
assert leader.length == entry.length;
leader.merge(entry);
for (O option : entry) {
O merger = entries.get(option);
if (merger == null) {
entries.put(option, option);
entries.put(option, option);
} else {
merger.mergeInto(option);
}
}
}
}
leader.options.clear();
leader.options.addAll(entries.keySet());
for (O option: entries.keySet()) {
leader.addOption(option);
}
return leader;
}
/**
 * Merge any extra fields for this subtype.
 * <p>
 * Called by {@code reduce} on the leading entry once for every entry being reduced, before
 * that entry's options are folded in. The base implementation intentionally does nothing;
 * subclasses that carry additional per-entry state override it to combine that state.
 *
 * @param other the entry whose extra fields should be merged into this one
 */
protected void merge(Entry<O> other) {
}
/**
* @return the text (analyzed by suggest analyzer) originating from the suggest text. Usually this is a
* single term.

View File

@ -18,10 +18,6 @@
*/
package org.elasticsearch.search.suggest.phrase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
@ -36,6 +32,10 @@ import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
//TODO public for tests
public final class NoisyChannelSpellChecker {
public static final double REAL_WORD_LIKELYHOOD = 0.95d;
@ -59,7 +59,7 @@ public final class NoisyChannelSpellChecker {
}
public Correction[] getCorrections(TokenStream stream, final CandidateGenerator generator,
public Result getCorrections(TokenStream stream, final CandidateGenerator generator,
float maxErrors, int numCorrections, IndexReader reader, WordScorer wordScorer, BytesRef separator, float confidence, int gramSize) throws IOException {
final List<CandidateSet> candidateSetsList = new ArrayList<DirectCandidateGenerator.CandidateSet>();
@ -109,7 +109,7 @@ public final class NoisyChannelSpellChecker {
});
if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
return Correction.EMPTY;
return Result.EMPTY;
}
for (CandidateSet candidateSet : candidateSetsList) {
@ -123,14 +123,15 @@ public final class NoisyChannelSpellChecker {
for (int i = 0; i < candidates.length; i++) {
candidates[i] = candidateSets[i].originalTerm;
}
cutoffScore = scorer.score(candidates, candidateSets);
double inputPhraseScore = scorer.score(candidates, candidateSets);
cutoffScore = inputPhraseScore * confidence;
}
Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore * confidence);
Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
return findBestCandiates;
return new Result(findBestCandiates, cutoffScore);
}
public Correction[] getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException {
return getCorrections(tokenStream(analyzer, query, new CharsRef(), analysisField), generator, maxErrors, numCorrections, reader, scorer, new BytesRef(" "), confidence, gramSize);
@ -141,6 +142,15 @@ public final class NoisyChannelSpellChecker {
UnicodeUtil.UTF8toUTF16(query, spare);
return analyzer.tokenStream(field, new FastCharArrayReader(spare.chars, spare.offset, spare.length));
}
/**
 * Value holder returned by {@code getCorrections}: the corrections that were found together
 * with the cutoff score that was handed to the candidate search.
 */
public static class Result {
    /** Corrections found by the spell checker. */
    public final Correction[] corrections;

    /** Cutoff score passed to the candidate search when these corrections were produced. */
    public final double cutoffScore;

    public Result(Correction[] corrections, double cutoffScore) {
        this.cutoffScore = cutoffScore;
        this.corrections = corrections;
    }

    /** Shared result used when there is nothing to correct: no corrections, minimal positive cutoff. */
    public static final Result EMPTY = new Result(Correction.EMPTY, Double.MIN_VALUE);
}
}

View File

@ -33,14 +33,12 @@ import org.elasticsearch.search.suggest.Suggest.Suggestion;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
import org.elasticsearch.search.suggest.*;
import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.io.IOException;
import java.util.List;
public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
private final BytesRef SEPARATOR = new BytesRef(" ");
@ -56,11 +54,8 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
public Suggestion<? extends Entry<? extends Option>> innerExecute(String name, PhraseSuggestionContext suggestion,
IndexReader indexReader, CharsRef spare) throws IOException {
double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood();
UnicodeUtil.UTF8toUTF16(suggestion.getText(), spare);
Suggestion.Entry<Option> resultEntry = new Suggestion.Entry<Option>(new StringText(spare.toString()), 0, spare.length);
final Suggestion<Entry<Option>> response = new Suggestion<Entry<Option>>(name, suggestion.getSize());
response.addTerm(resultEntry);
final PhraseSuggestion response = new PhraseSuggestion(name, suggestion.getSize());
List<PhraseSuggestionContext.DirectCandidateGenerator> generators = suggestion.generators();
final int numGenerators = generators.size();
final List<CandidateGenerator> gens = new ArrayList<CandidateGenerator>(generators.size());
@ -81,12 +76,15 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());
WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestTerms, suggestField, realWordErrorLikelihood, separator);
Correction[] corrections = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(),
Result checkerResult = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(),
gens.toArray(new CandidateGenerator[gens.size()])), suggestion.maxErrors(),
suggestion.getShardSize(), indexReader,wordScorer , separator, suggestion.confidence(), suggestion.gramSize());
PhraseSuggestion.Entry resultEntry = buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
response.addTerm(resultEntry);
BytesRef byteSpare = new BytesRef();
for (Correction correction : corrections) {
for (Correction correction : checkerResult.corrections) {
UnicodeUtil.UTF8toUTF16(correction.join(SEPARATOR, byteSpare, null, null), spare);
Text phrase = new StringText(spare.toString());
Text highlighted = null;
@ -96,9 +94,16 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
}
resultEntry.addOption(new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score)));
}
} else {
response.addTerm(buildResultEntry(suggestion, spare, Double.MIN_VALUE));
}
return response;
}
/**
 * Builds the single entry of a phrase suggestion response for the request's input text.
 *
 * @param suggestion  context supplying the UTF-8 input text
 * @param spare       scratch buffer; overwritten with the UTF-16 form of the input text
 * @param cutoffScore cutoff score to record on the entry
 * @return a new entry covering the whole input text (offset 0, length of the converted text)
 */
private PhraseSuggestion.Entry buildResultEntry(PhraseSuggestionContext suggestion, CharsRef spare, double cutoffScore) {
    // Convert first: both the entry text and its length are read from the refilled buffer.
    UnicodeUtil.UTF8toUTF16(suggestion.getText(), spare);
    final Text inputText = new StringText(spare.toString());
    return new PhraseSuggestion.Entry(inputText, 0, spare.length, cutoffScore);
}
@Override
public String[] names() {

View File

@ -0,0 +1,120 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.phrase;
import org.elasticsearch.Version;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.xcontent.XContentBuilderString;
import org.elasticsearch.search.suggest.Suggest;
import org.elasticsearch.search.suggest.Suggest.Suggestion;
import java.io.IOException;
/**
 * Suggestion returned from the {@link PhraseSuggester}. Its entries remember the cutoff score
 * they were built with so that options can be checked against it again when entries are merged.
 */
public class PhraseSuggestion extends Suggest.Suggestion<PhraseSuggestion.Entry> {

    /** Type id reported by {@link #getType()}. */
    public static final int TYPE = 3;

    /** Creates an empty suggestion. */
    public PhraseSuggestion() {
    }

    public PhraseSuggestion(String name, int size) {
        super(name, size);
    }

    @Override
    public int getType() {
        return TYPE;
    }

    @Override
    protected Entry newEntry() {
        return new Entry();
    }

    /**
     * Entry of a {@link PhraseSuggestion}: a regular suggestion entry plus a cutoff score.
     */
    public static class Entry extends Suggestion.Entry<Suggestion.Entry.Option> {

        static class Fields {
            static final XContentBuilderString CUTOFF_SCORE = new XContentBuilderString("cutoff_score");
        }

        /** Options must score strictly above this value to be kept by {@link #addOption}. */
        protected double cutoffScore = Double.MIN_VALUE;

        /** Creates an entry with the default cutoff score ({@link Double#MIN_VALUE}). */
        public Entry() {
        }

        public Entry(Text text, int offset, int length, double cutoffScore) {
            super(text, offset, length);
            this.cutoffScore = cutoffScore;
        }

        /**
         * @return cutoff score for suggestions: input phrase score * confidence when computed by the
         *         phrase suggester, {@link Double#MIN_VALUE} when no cutoff was supplied
         */
        public double getCutoffScore() {
            return cutoffScore;
        }

        /**
         * Keeps the larger of the two cutoff scores.
         */
        @Override
        protected void merge(Suggestion.Entry<Suggestion.Entry.Option> other) {
            super.merge(other);
            // A cluster mixing pre and post 0.90.4 nodes can hand us plain Suggestion.Entry objects.
            // Those are treated as having a low cutoff score rather than a high one, which is the
            // more common scenario and the simplest to implement, so they leave our cutoff as is.
            if (other instanceof PhraseSuggestion.Entry) {
                final double otherCutoff = ((PhraseSuggestion.Entry) other).cutoffScore;
                cutoffScore = Math.max(cutoffScore, otherCutoff);
            }
        }

        /**
         * Adds the option only if its score is strictly greater than the cutoff score;
         * otherwise the option is silently dropped.
         */
        @Override
        public void addOption(Suggestion.Entry.Option option) {
            final boolean aboveCutoff = option.getScore() > cutoffScore;
            if (aboveCutoff) {
                options.add(option);
            }
        }

        @Override
        public void readFrom(StreamInput in) throws IOException {
            super.readFrom(in);
            // A sender older than 0.90.4 shouldn't be sending suggestions of this type, but just in
            // case it does we treat them as regular suggestions: there is no cutoff score to read.
            if (!in.getVersion().before(Version.V_0_90_4)) {
                cutoffScore = in.readDouble();
            }
        }

        @Override
        public void writeTo(StreamOutput out) throws IOException {
            super.writeTo(out);
            // A receiver older than 0.90.4 will interpret this as a regular suggestion, so for it
            // we stop after the superclass data and only newer receivers get the cutoff score.
            if (!out.getVersion().before(Version.V_0_90_4)) {
                out.writeDouble(cutoffScore);
            }
        }
    }
}

View File

@ -40,9 +40,7 @@ import org.junit.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
import java.util.concurrent.ExecutionException;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
@ -165,7 +163,7 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
public void testUnmappedField() throws IOException, InterruptedException, ExecutionException {
int numShards = between(1,5);
Builder builder = ImmutableSettings.builder();
builder.put("index.number_of_shards", numShards).put("index.number_of_replicas", between(0, 2));
builder.put("index.number_of_shards", numShards).put("index.number_of_replicas", between(0, numberOfNodes() - 1));
builder.put("index.analysis.analyzer.biword.tokenizer", "standard");
builder.putArray("index.analysis.analyzer.biword.filter", "shingler", "lowercase");
builder.put("index.analysis.filter.shingler.type", "shingle");
@ -1149,7 +1147,7 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
@Test // see #3469
public void testShardFailures() throws IOException, InterruptedException {
Builder builder = ImmutableSettings.builder();
builder.put("index.number_of_shards", between(1, 5)).put("index.number_of_replicas", between(0, 2));
builder.put("index.number_of_shards", between(1, 5)).put("index.number_of_replicas", between(0, numberOfNodes() - 1));
builder.put("index.analysis.analyzer.suggest.tokenizer", "standard");
builder.putArray("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler");
builder.put("index.analysis.filter.shingler.type", "shingle");
@ -1187,7 +1185,8 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
client().prepareIndex("test", "type1", "1")
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "Just testing the suggestions api").endObject()).execute().actionGet();
client().prepareIndex("test", "type1", "2")
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "An other title").endObject()).execute().actionGet();
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "An other title about equal length").endObject()).execute().actionGet();
// Note that the last document has to have about the same length as the other or cutoff rechecking will remove the useful suggestion.
client().admin().indices().prepareRefresh().execute().actionGet();
// When searching on a shard with a non existing mapping, we should fail
@ -1240,7 +1239,8 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
client().prepareIndex("test", "type1", "1")
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "Just testing the suggestions api").endObject()).execute().actionGet();
client().prepareIndex("test", "type1", "2")
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "An other title").endObject()).execute().actionGet();
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "An other title about equal length").endObject()).execute().actionGet();
// Note that the last document has to have about the same length as the other or cutoff rechecking will remove the useful suggestion.
client().admin().indices().prepareRefresh().execute().actionGet();
SearchRequestBuilder suggestBuilder = client().prepareSearch().setSearchType(SearchType.COUNT);
@ -1251,4 +1251,72 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
ElasticsearchAssertions.assertNoFailures(searchResponse);
ElasticsearchAssertions.assertSuggestion(searchResponse.getSuggest(), 0, 0, "did_you_mean", "testing suggestions");
}
/**
 * Searching for a rare phrase shouldn't provide any suggestions if confidence > 1. This was possible before we rechecked the cutoff
 * score during the reduce phase. Failures don't occur every time - maybe two out of five tries - but we don't repeat the test to save time.
 */
@Test
public void testSearchForRarePhrase() throws ElasticSearchException, IOException {
// If there isn't enough chaff per shard then shards can become unbalanced, making the cutoff recheck this is testing do more harm than good.
int chafPerShard = 100;
Builder builder = ImmutableSettings.builder();
// At least two shards are requested so that there is something to merge across.
int numberOfShards = between(2, 5);
builder.put("index.number_of_shards", numberOfShards).put("index.number_of_replicas", between(0, numberOfNodes() - 1));
// "body" analyzer: standard tokenizer, lowercased, emitting unigrams plus 2-word shingles.
builder.put("index.analysis.analyzer.body.tokenizer", "standard");
builder.putArray("index.analysis.analyzer.body.filter", "lowercase", "my_shingle");
builder.put("index.analysis.filter.my_shingle.type", "shingle");
builder.put("index.analysis.filter.my_shingle.output_unigrams", true);
builder.put("index.analysis.filter.my_shingle.min_shingle_size", 2);
builder.put("index.analysis.filter.my_shingle.max_shingle_size", 2);
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
.startObject("_all").field("store", "yes").field("termVector", "with_positions_offsets").endObject()
.startObject("properties")
.startObject("body").field("type", "string").field("analyzer", "body").endObject()
.endObject()
.endObject().endObject();
client().admin().indices().prepareCreate("test").setSettings(builder.build()).addMapping("type1", mapping).execute().actionGet();
ensureGreen();
// Corpus: the rare phrase "nobel prize" (indexed once), a few near misses, eight "noble somethingelseN"
// documents, and numberOfShards * chafPerShard unrelated "chaffN" documents as filler.
List<String> phrases = new ArrayList<String>();
Collections.addAll(phrases, "nobel prize", "noble gases", "somethingelse prize", "pride and joy", "notes are fun");
for (int i = 0; i < 8; i++) {
phrases.add("noble somethingelse" + i);
}
for (int i = 0; i < numberOfShards * chafPerShard; i++) {
phrases.add("chaff" + i);
}
// One document per phrase; no id is supplied to prepareIndex.
for (String phrase: phrases) {
client().prepareIndex("test", "type1")
.setSource(XContentFactory.jsonBuilder()
.startObject()
.field("body", phrase)
.endObject()
)
.execute().actionGet();
}
refresh();
// The correctly spelled rare phrase with confidence 2: no suggestion is expected to be returned.
Suggest searchSuggest = searchSuggest(client(), "nobel prize", phraseSuggestion("simple_phrase")
.field("body")
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always").maxTermFreq(.99f))
.confidence(2f)
.maxErrors(5f)
.size(1));
ElasticsearchAssertions.assertSuggestionSize(searchSuggest, 0, 0, "simple_phrase");
// The misspelled variant with the same settings: "nobel prize" is expected as the suggestion.
searchSuggest = searchSuggest(client(), "noble prize", phraseSuggestion("simple_phrase")
.field("body")
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always").maxTermFreq(.99f))
.confidence(2f)
.maxErrors(5f)
.size(1));
ElasticsearchAssertions.assertSuggestion(searchSuggest, 0, 0, "simple_phrase", "nobel prize");
}
/**
 * Overrides the inherited value with a fixed 3. Tests in this class derive their replica count
 * from it via {@code between(0, numberOfNodes() - 1)}.
 * NOTE(review): presumably the base test class uses this to size the shared cluster - confirm.
 */
@Override
protected int numberOfNodes() {
return 3;
}
}

View File

@ -42,6 +42,7 @@ import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.elasticsearch.search.suggest.phrase.*;
import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result;
import org.elasticsearch.test.integration.ElasticsearchTestCase;
import org.junit.Test;
@ -50,6 +51,7 @@ import java.util.HashMap;
import java.util.Map;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
private final BytesRef space = new BytesRef(" ");
private final BytesRef preTag = new BytesRef("<em>");
@ -100,19 +102,23 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
DirectSpellChecker spellchecker = new DirectSpellChecker();
spellchecker.setMinQueryLength(1);
DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
Correction[] corrections = result.corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>"));
assertThat(result.cutoffScore, greaterThan(0d));
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1);
result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1);
corrections = result.corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame"));
assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));
suggester = new NoisyChannelSpellChecker(0.85);
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
@ -123,7 +129,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn</em> the <em>god</em> jewel"));
assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the got jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
@ -133,7 +139,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
// Test some of the highlighting corner cases
suggester = new NoisyChannelSpellChecker(0.85);
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
@ -168,18 +174,18 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
spellchecker.setMinQueryLength(1);
suggester = new NoisyChannelSpellChecker(0.85);
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2);
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2);
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
// Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2);
corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>"));
}
@ -241,23 +247,23 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix
corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
@ -266,11 +272,11 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
@ -323,16 +329,16 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
DirectSpellChecker spellchecker = new DirectSpellChecker();
spellchecker.setMinQueryLength(1);
DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1);
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections;
assertThat(corrections.length, equalTo(0));
// assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));
wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
@ -342,7 +348,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
@ -350,7 +356,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
@ -379,16 +385,16 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
spellchecker.setMinQueryLength(1);
suggester = new NoisyChannelSpellChecker(0.95);
wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1);
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3);
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3);
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3);
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections;
assertThat(corrections.length, equalTo(2));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));