Recheck cutoffScore during phrase_suggest merge.
The goal is to throw out suggestions that only meet the cutoff in some shards. This will happen if your input phrase is only contained in a few shards. If your shards are unbalanced, this rechecking can throw out good suggestions. Closes #3547.
This commit is contained in:
parent
76939b82d3
commit
10e55bd3ef
|
@ -31,6 +31,7 @@ import org.elasticsearch.common.xcontent.XContentBuilderString;
|
|||
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
|
||||
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
|
||||
import org.elasticsearch.search.suggest.completion.CompletionSuggestion;
|
||||
import org.elasticsearch.search.suggest.phrase.PhraseSuggestion;
|
||||
import org.elasticsearch.search.suggest.term.TermSuggestion;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -119,6 +120,9 @@ public class Suggest implements Iterable<Suggest.Suggestion<? extends Entry<? ex
|
|||
case CompletionSuggestion.TYPE:
|
||||
suggestion = new CompletionSuggestion();
|
||||
break;
|
||||
case PhraseSuggestion.TYPE:
|
||||
suggestion = new PhraseSuggestion();
|
||||
break;
|
||||
default:
|
||||
suggestion = new Suggestion<Entry<Option>>();
|
||||
break;
|
||||
|
@ -357,7 +361,7 @@ public class Suggest implements Iterable<Suggest.Suggestion<? extends Entry<? ex
|
|||
CollectionUtil.timSort(options, comparator);
|
||||
}
|
||||
|
||||
protected Entry<O> reduce(List<Entry<O>> toReduce) {
|
||||
protected Entry<O> reduce(List<? extends Entry<O>> toReduce) {
|
||||
if (toReduce.size() == 1) {
|
||||
return toReduce.get(0);
|
||||
}
|
||||
|
@ -367,20 +371,29 @@ public class Suggest implements Iterable<Suggest.Suggestion<? extends Entry<? ex
|
|||
assert leader.text.equals(entry.text);
|
||||
assert leader.offset == entry.offset;
|
||||
assert leader.length == entry.length;
|
||||
leader.merge(entry);
|
||||
for (O option : entry) {
|
||||
O merger = entries.get(option);
|
||||
if (merger == null) {
|
||||
entries.put(option, option);
|
||||
entries.put(option, option);
|
||||
} else {
|
||||
merger.mergeInto(option);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
leader.options.clear();
|
||||
leader.options.addAll(entries.keySet());
|
||||
for (O option: entries.keySet()) {
|
||||
leader.addOption(option);
|
||||
}
|
||||
return leader;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Merge any extra fields for this subtype.
|
||||
*/
|
||||
protected void merge(Entry<O> other) {
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the text (analyzed by suggest analyzer) originating from the suggest text. Usually this is a
|
||||
* single term.
|
||||
|
|
|
@ -18,10 +18,6 @@
|
|||
*/
|
||||
package org.elasticsearch.search.suggest.phrase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
|
@ -36,6 +32,10 @@ import org.elasticsearch.search.suggest.SuggestUtils;
|
|||
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
|
||||
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
//TODO public for tests
|
||||
public final class NoisyChannelSpellChecker {
|
||||
public static final double REAL_WORD_LIKELYHOOD = 0.95d;
|
||||
|
@ -59,7 +59,7 @@ public final class NoisyChannelSpellChecker {
|
|||
|
||||
}
|
||||
|
||||
public Correction[] getCorrections(TokenStream stream, final CandidateGenerator generator,
|
||||
public Result getCorrections(TokenStream stream, final CandidateGenerator generator,
|
||||
float maxErrors, int numCorrections, IndexReader reader, WordScorer wordScorer, BytesRef separator, float confidence, int gramSize) throws IOException {
|
||||
|
||||
final List<CandidateSet> candidateSetsList = new ArrayList<DirectCandidateGenerator.CandidateSet>();
|
||||
|
@ -109,7 +109,7 @@ public final class NoisyChannelSpellChecker {
|
|||
});
|
||||
|
||||
if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
|
||||
return Correction.EMPTY;
|
||||
return Result.EMPTY;
|
||||
}
|
||||
|
||||
for (CandidateSet candidateSet : candidateSetsList) {
|
||||
|
@ -123,14 +123,15 @@ public final class NoisyChannelSpellChecker {
|
|||
for (int i = 0; i < candidates.length; i++) {
|
||||
candidates[i] = candidateSets[i].originalTerm;
|
||||
}
|
||||
cutoffScore = scorer.score(candidates, candidateSets);
|
||||
double inputPhraseScore = scorer.score(candidates, candidateSets);
|
||||
cutoffScore = inputPhraseScore * confidence;
|
||||
}
|
||||
Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore * confidence);
|
||||
Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
|
||||
|
||||
return findBestCandiates;
|
||||
return new Result(findBestCandiates, cutoffScore);
|
||||
}
|
||||
|
||||
public Correction[] getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
|
||||
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
|
||||
float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException {
|
||||
|
||||
return getCorrections(tokenStream(analyzer, query, new CharsRef(), analysisField), generator, maxErrors, numCorrections, reader, scorer, new BytesRef(" "), confidence, gramSize);
|
||||
|
@ -141,6 +142,15 @@ public final class NoisyChannelSpellChecker {
|
|||
UnicodeUtil.UTF8toUTF16(query, spare);
|
||||
return analyzer.tokenStream(field, new FastCharArrayReader(spare.chars, spare.offset, spare.length));
|
||||
}
|
||||
|
||||
|
||||
public static class Result {
|
||||
public static final Result EMPTY = new Result(Correction.EMPTY, Double.MIN_VALUE);
|
||||
public final Correction[] corrections;
|
||||
public final double cutoffScore;
|
||||
|
||||
public Result(Correction[] corrections, double cutoffScore) {
|
||||
this.corrections = corrections;
|
||||
this.cutoffScore = cutoffScore;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -33,14 +33,12 @@ import org.elasticsearch.search.suggest.Suggest.Suggestion;
|
|||
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
|
||||
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
|
||||
import org.elasticsearch.search.suggest.*;
|
||||
import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
|
||||
private final BytesRef SEPARATOR = new BytesRef(" ");
|
||||
|
||||
|
@ -56,11 +54,8 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
|
|||
public Suggestion<? extends Entry<? extends Option>> innerExecute(String name, PhraseSuggestionContext suggestion,
|
||||
IndexReader indexReader, CharsRef spare) throws IOException {
|
||||
double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood();
|
||||
UnicodeUtil.UTF8toUTF16(suggestion.getText(), spare);
|
||||
Suggestion.Entry<Option> resultEntry = new Suggestion.Entry<Option>(new StringText(spare.toString()), 0, spare.length);
|
||||
final Suggestion<Entry<Option>> response = new Suggestion<Entry<Option>>(name, suggestion.getSize());
|
||||
response.addTerm(resultEntry);
|
||||
|
||||
final PhraseSuggestion response = new PhraseSuggestion(name, suggestion.getSize());
|
||||
|
||||
List<PhraseSuggestionContext.DirectCandidateGenerator> generators = suggestion.generators();
|
||||
final int numGenerators = generators.size();
|
||||
final List<CandidateGenerator> gens = new ArrayList<CandidateGenerator>(generators.size());
|
||||
|
@ -81,12 +76,15 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
|
|||
TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField());
|
||||
|
||||
WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestTerms, suggestField, realWordErrorLikelihood, separator);
|
||||
Correction[] corrections = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(),
|
||||
Result checkerResult = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(),
|
||||
gens.toArray(new CandidateGenerator[gens.size()])), suggestion.maxErrors(),
|
||||
suggestion.getShardSize(), indexReader,wordScorer , separator, suggestion.confidence(), suggestion.gramSize());
|
||||
|
||||
|
||||
PhraseSuggestion.Entry resultEntry = buildResultEntry(suggestion, spare, checkerResult.cutoffScore);
|
||||
response.addTerm(resultEntry);
|
||||
|
||||
BytesRef byteSpare = new BytesRef();
|
||||
for (Correction correction : corrections) {
|
||||
for (Correction correction : checkerResult.corrections) {
|
||||
UnicodeUtil.UTF8toUTF16(correction.join(SEPARATOR, byteSpare, null, null), spare);
|
||||
Text phrase = new StringText(spare.toString());
|
||||
Text highlighted = null;
|
||||
|
@ -96,9 +94,16 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
|
|||
}
|
||||
resultEntry.addOption(new Suggestion.Entry.Option(phrase, highlighted, (float) (correction.score)));
|
||||
}
|
||||
} else {
|
||||
response.addTerm(buildResultEntry(suggestion, spare, Double.MIN_VALUE));
|
||||
}
|
||||
return response;
|
||||
}
|
||||
|
||||
private PhraseSuggestion.Entry buildResultEntry(PhraseSuggestionContext suggestion, CharsRef spare, double cutoffScore) {
|
||||
UnicodeUtil.UTF8toUTF16(suggestion.getText(), spare);
|
||||
return new PhraseSuggestion.Entry(new StringText(spare.toString()), 0, spare.length, cutoffScore);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String[] names() {
|
||||
|
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.search.suggest.phrase;
|
||||
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.text.Text;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilderString;
|
||||
import org.elasticsearch.search.suggest.Suggest;
|
||||
import org.elasticsearch.search.suggest.Suggest.Suggestion;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Suggestion entry returned from the {@link PhraseSuggester}.
|
||||
*/
|
||||
public class PhraseSuggestion extends Suggest.Suggestion<PhraseSuggestion.Entry> {
|
||||
public static final int TYPE = 3;
|
||||
|
||||
public PhraseSuggestion() {
|
||||
}
|
||||
|
||||
public PhraseSuggestion(String name, int size) {
|
||||
super(name, size);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getType() {
|
||||
return TYPE;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Entry newEntry() {
|
||||
return new Entry();
|
||||
}
|
||||
|
||||
public static class Entry extends Suggestion.Entry<Suggestion.Entry.Option> {
|
||||
static class Fields {
|
||||
static final XContentBuilderString CUTOFF_SCORE = new XContentBuilderString("cutoff_score");
|
||||
}
|
||||
|
||||
protected double cutoffScore = Double.MIN_VALUE;
|
||||
|
||||
public Entry(Text text, int offset, int length, double cutoffScore) {
|
||||
super(text, offset, length);
|
||||
this.cutoffScore = cutoffScore;
|
||||
}
|
||||
|
||||
public Entry() {
|
||||
}
|
||||
|
||||
/**
|
||||
* @return cutoff score for suggestions. input term score * confidence for phrase suggest, 0 otherwise
|
||||
*/
|
||||
public double getCutoffScore() {
|
||||
return cutoffScore;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void merge(Suggestion.Entry<Suggestion.Entry.Option> other) {
|
||||
super.merge(other);
|
||||
// If the cluster contains both pre 0.90.4 and post 0.90.4 nodes then we'll see Suggestion.Entry
|
||||
// objects being merged with PhraseSuggestion.Entry objects. We merge Suggestion.Entry objects
|
||||
// by assuming they had a low cutoff score rather than a high one as that is the more common scenario
|
||||
// and the simplest one for us to implement.
|
||||
if (!(other instanceof PhraseSuggestion.Entry)) {
|
||||
return;
|
||||
}
|
||||
PhraseSuggestion.Entry otherSuggestionEntry = (PhraseSuggestion.Entry) other;
|
||||
this.cutoffScore = Math.max(this.cutoffScore, otherSuggestionEntry.cutoffScore);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addOption(Suggestion.Entry.Option option) {
|
||||
if (option.getScore() > this.cutoffScore) {
|
||||
this.options.add(option);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readFrom(StreamInput in) throws IOException {
|
||||
super.readFrom(in);
|
||||
// If the other side is older than 0.90.4 then it shouldn't be sending suggestions of this type but just in case
|
||||
// we're going to assume that they are regular suggestions so we won't read anything.
|
||||
if (in.getVersion().before(Version.V_0_90_4)) {
|
||||
return;
|
||||
}
|
||||
cutoffScore = in.readDouble();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
super.writeTo(out);
|
||||
// If the other side of the message is older than 0.90.4 it'll interpret these suggestions as regular suggestions
|
||||
// so we have to pretend to be one which we can do by just calling the superclass writeTo and doing nothing else
|
||||
if (out.getVersion().before(Version.V_0_90_4)) {
|
||||
return;
|
||||
}
|
||||
out.writeDouble(cutoffScore);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -40,9 +40,7 @@ import org.junit.Test;
|
|||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
|
||||
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
|
||||
|
@ -165,7 +163,7 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
|
|||
public void testUnmappedField() throws IOException, InterruptedException, ExecutionException {
|
||||
int numShards = between(1,5);
|
||||
Builder builder = ImmutableSettings.builder();
|
||||
builder.put("index.number_of_shards", numShards).put("index.number_of_replicas", between(0, 2));
|
||||
builder.put("index.number_of_shards", numShards).put("index.number_of_replicas", between(0, numberOfNodes() - 1));
|
||||
builder.put("index.analysis.analyzer.biword.tokenizer", "standard");
|
||||
builder.putArray("index.analysis.analyzer.biword.filter", "shingler", "lowercase");
|
||||
builder.put("index.analysis.filter.shingler.type", "shingle");
|
||||
|
@ -1149,7 +1147,7 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
|
|||
@Test // see #3469
|
||||
public void testShardFailures() throws IOException, InterruptedException {
|
||||
Builder builder = ImmutableSettings.builder();
|
||||
builder.put("index.number_of_shards", between(1, 5)).put("index.number_of_replicas", between(0, 2));
|
||||
builder.put("index.number_of_shards", between(1, 5)).put("index.number_of_replicas", between(0, numberOfNodes() - 1));
|
||||
builder.put("index.analysis.analyzer.suggest.tokenizer", "standard");
|
||||
builder.putArray("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler");
|
||||
builder.put("index.analysis.filter.shingler.type", "shingle");
|
||||
|
@ -1187,7 +1185,8 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
|
|||
client().prepareIndex("test", "type1", "1")
|
||||
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "Just testing the suggestions api").endObject()).execute().actionGet();
|
||||
client().prepareIndex("test", "type1", "2")
|
||||
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "An other title").endObject()).execute().actionGet();
|
||||
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "An other title about equal length").endObject()).execute().actionGet();
|
||||
// Note that the last document has to have about the same length as the other or cutoff rechecking will remove the useful suggestion.
|
||||
client().admin().indices().prepareRefresh().execute().actionGet();
|
||||
|
||||
// When searching on a shard with a non existing mapping, we should fail
|
||||
|
@ -1240,7 +1239,8 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
|
|||
client().prepareIndex("test", "type1", "1")
|
||||
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "Just testing the suggestions api").endObject()).execute().actionGet();
|
||||
client().prepareIndex("test", "type1", "2")
|
||||
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "An other title").endObject()).execute().actionGet();
|
||||
.setSource(XContentFactory.jsonBuilder().startObject().field("name", "An other title about equal length").endObject()).execute().actionGet();
|
||||
// Note that the last document has to have about the same length as the other or cutoff rechecking will remove the useful suggestion.
|
||||
client().admin().indices().prepareRefresh().execute().actionGet();
|
||||
|
||||
SearchRequestBuilder suggestBuilder = client().prepareSearch().setSearchType(SearchType.COUNT);
|
||||
|
@ -1251,4 +1251,72 @@ public class SuggestSearchTests extends AbstractSharedClusterTest {
|
|||
ElasticsearchAssertions.assertNoFailures(searchResponse);
|
||||
ElasticsearchAssertions.assertSuggestion(searchResponse.getSuggest(), 0, 0, "did_you_mean", "testing suggestions");
|
||||
}
|
||||
|
||||
/**
|
||||
* Searching for a rare phrase shouldn't provide any suggestions if confidence > 1. This was possible before we rechecked the cutoff
|
||||
* score during the reduce phase. Failures don't occur every time - maybe two out of five tries but we don't repeat it to save time.
|
||||
*/
|
||||
@Test
|
||||
public void testSearchForRarePhrase() throws ElasticSearchException, IOException {
|
||||
// If there isn't enough chaf per shard then shards can become unbalanced, making the cutoff recheck this is testing do more harm then good.
|
||||
int chafPerShard = 100;
|
||||
Builder builder = ImmutableSettings.builder();
|
||||
int numberOfShards = between(2, 5);
|
||||
builder.put("index.number_of_shards", numberOfShards).put("index.number_of_replicas", between(0, numberOfNodes() - 1));
|
||||
builder.put("index.analysis.analyzer.body.tokenizer", "standard");
|
||||
builder.putArray("index.analysis.analyzer.body.filter", "lowercase", "my_shingle");
|
||||
builder.put("index.analysis.filter.my_shingle.type", "shingle");
|
||||
builder.put("index.analysis.filter.my_shingle.output_unigrams", true);
|
||||
builder.put("index.analysis.filter.my_shingle.min_shingle_size", 2);
|
||||
builder.put("index.analysis.filter.my_shingle.max_shingle_size", 2);
|
||||
|
||||
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
|
||||
.startObject("_all").field("store", "yes").field("termVector", "with_positions_offsets").endObject()
|
||||
.startObject("properties")
|
||||
.startObject("body").field("type", "string").field("analyzer", "body").endObject()
|
||||
.endObject()
|
||||
.endObject().endObject();
|
||||
|
||||
client().admin().indices().prepareCreate("test").setSettings(builder.build()).addMapping("type1", mapping).execute().actionGet();
|
||||
ensureGreen();
|
||||
List<String> phrases = new ArrayList<String>();
|
||||
Collections.addAll(phrases, "nobel prize", "noble gases", "somethingelse prize", "pride and joy", "notes are fun");
|
||||
for (int i = 0; i < 8; i++) {
|
||||
phrases.add("noble somethingelse" + i);
|
||||
}
|
||||
for (int i = 0; i < numberOfShards * chafPerShard; i++) {
|
||||
phrases.add("chaff" + i);
|
||||
}
|
||||
for (String phrase: phrases) {
|
||||
client().prepareIndex("test", "type1")
|
||||
.setSource(XContentFactory.jsonBuilder()
|
||||
.startObject()
|
||||
.field("body", phrase)
|
||||
.endObject()
|
||||
)
|
||||
.execute().actionGet();
|
||||
}
|
||||
refresh();
|
||||
|
||||
Suggest searchSuggest = searchSuggest(client(), "nobel prize", phraseSuggestion("simple_phrase")
|
||||
.field("body")
|
||||
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always").maxTermFreq(.99f))
|
||||
.confidence(2f)
|
||||
.maxErrors(5f)
|
||||
.size(1));
|
||||
ElasticsearchAssertions.assertSuggestionSize(searchSuggest, 0, 0, "simple_phrase");
|
||||
|
||||
searchSuggest = searchSuggest(client(), "noble prize", phraseSuggestion("simple_phrase")
|
||||
.field("body")
|
||||
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always").maxTermFreq(.99f))
|
||||
.confidence(2f)
|
||||
.maxErrors(5f)
|
||||
.size(1));
|
||||
ElasticsearchAssertions.assertSuggestion(searchSuggest, 0, 0, "simple_phrase", "nobel prize");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int numberOfNodes() {
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,6 +42,7 @@ import org.apache.lucene.store.RAMDirectory;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.elasticsearch.search.suggest.phrase.*;
|
||||
import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result;
|
||||
import org.elasticsearch.test.integration.ElasticsearchTestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -50,6 +51,7 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.greaterThan;
|
||||
public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
||||
private final BytesRef space = new BytesRef(" ");
|
||||
private final BytesRef preTag = new BytesRef("<em>");
|
||||
|
@ -100,19 +102,23 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
DirectSpellChecker spellchecker = new DirectSpellChecker();
|
||||
spellchecker.setMinQueryLength(1);
|
||||
DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
|
||||
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
|
||||
Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
|
||||
Correction[] corrections = result.corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));
|
||||
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>"));
|
||||
assertThat(result.cutoffScore, greaterThan(0d));
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1);
|
||||
result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1);
|
||||
corrections = result.corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame"));
|
||||
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame"));
|
||||
assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));
|
||||
|
||||
suggester = new NoisyChannelSpellChecker(0.85);
|
||||
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(4));
|
||||
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
|
||||
|
@ -123,7 +129,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn</em> the <em>god</em> jewel"));
|
||||
assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the got jewel"));
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(4));
|
||||
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
|
||||
|
@ -133,7 +139,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
// Test some of the highlighting corner cases
|
||||
suggester = new NoisyChannelSpellChecker(0.85);
|
||||
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(4));
|
||||
assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
|
||||
|
@ -168,18 +174,18 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
spellchecker.setMinQueryLength(1);
|
||||
suggester = new NoisyChannelSpellChecker(0.85);
|
||||
wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2);
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
|
||||
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
|
||||
|
||||
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2);
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
|
||||
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
|
||||
|
||||
// Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
|
||||
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2);
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
|
||||
assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>"));
|
||||
}
|
||||
|
@ -241,23 +247,23 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
|
||||
CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
|
||||
|
||||
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
|
||||
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
|
||||
|
||||
generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(4));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
|
||||
|
@ -266,11 +272,11 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
|
||||
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
|
||||
|
@ -323,16 +329,16 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
DirectSpellChecker spellchecker = new DirectSpellChecker();
|
||||
spellchecker.setMinQueryLength(1);
|
||||
DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
|
||||
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3);
|
||||
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections;
|
||||
assertThat(corrections.length, equalTo(0));
|
||||
// assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));
|
||||
|
||||
wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections;
|
||||
assertThat(corrections.length, equalTo(4));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
|
||||
|
@ -342,7 +348,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
|
||||
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections;
|
||||
assertThat(corrections.length, equalTo(4));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
|
||||
|
@ -350,7 +356,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));
|
||||
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections;
|
||||
assertThat(corrections.length, equalTo(1));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
|
||||
|
@ -379,16 +385,16 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
spellchecker.setMinQueryLength(1);
|
||||
suggester = new NoisyChannelSpellChecker(0.95);
|
||||
wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1);
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3);
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
|
||||
|
||||
generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3);
|
||||
corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
|
||||
|
||||
|
||||
wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3);
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections;
|
||||
assertThat(corrections.length, equalTo(2));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
|
||||
|
|
Loading…
Reference in New Issue