From 10e55bd3ef4f14e667981b0688eaa779fe250a84 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Wed, 21 Aug 2013 11:05:53 -0400 Subject: [PATCH] Recheck cutoffScore during phrase_suggest merge. The goal is to throw out suggestions that only meet the cutoff in some shards. This will happen if your input phrase is only contained in a few shards. If your shards are unbalanced this rechecking can throw out good suggestions. Closes #3547. --- .../elasticsearch/search/suggest/Suggest.java | 23 +++- .../phrase/NoisyChannelSpellChecker.java | 32 +++-- .../suggest/phrase/PhraseSuggester.java | 27 ++-- .../suggest/phrase/PhraseSuggestion.java | 120 ++++++++++++++++++ .../search/suggest/SuggestSearchTests.java | 82 +++++++++++- .../phrase/NoisyChannelSpellCheckerTests.java | 52 ++++---- 6 files changed, 279 insertions(+), 57 deletions(-) create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestion.java diff --git a/src/main/java/org/elasticsearch/search/suggest/Suggest.java b/src/main/java/org/elasticsearch/search/suggest/Suggest.java index e5c0340d7bd..1776dadd4c5 100644 --- a/src/main/java/org/elasticsearch/search/suggest/Suggest.java +++ b/src/main/java/org/elasticsearch/search/suggest/Suggest.java @@ -31,6 +31,7 @@ import org.elasticsearch.common.xcontent.XContentBuilderString; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option; import org.elasticsearch.search.suggest.completion.CompletionSuggestion; +import org.elasticsearch.search.suggest.phrase.PhraseSuggestion; import org.elasticsearch.search.suggest.term.TermSuggestion; import java.io.IOException; @@ -119,6 +120,9 @@ public class Suggest implements Iterable>(); break; @@ -357,7 +361,7 @@ public class Suggest implements Iterable reduce(List> toReduce) { + protected Entry reduce(List> toReduce) { if (toReduce.size() == 1) { return toReduce.get(0); } @@ -367,20 +371,29 @@ public class Suggest implements 
Iterable other) { + } + /** * @return the text (analyzed by suggest analyzer) originating from the suggest text. Usually this is a * single term. diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java b/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java index 3a17b045ae0..02f0b83f538 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java @@ -18,10 +18,6 @@ */ package org.elasticsearch.search.suggest.phrase; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.shingle.ShingleFilter; @@ -36,6 +32,10 @@ import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + //TODO public for tests public final class NoisyChannelSpellChecker { public static final double REAL_WORD_LIKELYHOOD = 0.95d; @@ -59,7 +59,7 @@ public final class NoisyChannelSpellChecker { } - public Correction[] getCorrections(TokenStream stream, final CandidateGenerator generator, + public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors, int numCorrections, IndexReader reader, WordScorer wordScorer, BytesRef separator, float confidence, int gramSize) throws IOException { final List candidateSetsList = new ArrayList(); @@ -109,7 +109,7 @@ public final class NoisyChannelSpellChecker { }); if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) { - return Correction.EMPTY; + return Result.EMPTY; } for (CandidateSet candidateSet : 
candidateSetsList) { @@ -123,14 +123,15 @@ public final class NoisyChannelSpellChecker { for (int i = 0; i < candidates.length; i++) { candidates[i] = candidateSets[i].originalTerm; } - cutoffScore = scorer.score(candidates, candidateSets); + double inputPhraseScore = scorer.score(candidates, candidateSets); + cutoffScore = inputPhraseScore * confidence; } - Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore * confidence); + Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore); - return findBestCandiates; + return new Result(findBestCandiates, cutoffScore); } - public Correction[] getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator, + public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator, float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException { return getCorrections(tokenStream(analyzer, query, new CharsRef(), analysisField), generator, maxErrors, numCorrections, reader, scorer, new BytesRef(" "), confidence, gramSize); @@ -141,6 +142,15 @@ public final class NoisyChannelSpellChecker { UnicodeUtil.UTF8toUTF16(query, spare); return analyzer.tokenStream(field, new FastCharArrayReader(spare.chars, spare.offset, spare.length)); } - + public static class Result { + public static final Result EMPTY = new Result(Correction.EMPTY, Double.MIN_VALUE); + public final Correction[] corrections; + public final double cutoffScore; + + public Result(Correction[] corrections, double cutoffScore) { + this.corrections = corrections; + this.cutoffScore = cutoffScore; + } + } } diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java b/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java index 1c06409647e..d827ba300d9 100644 --- 
a/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java @@ -33,14 +33,12 @@ import org.elasticsearch.search.suggest.Suggest.Suggestion; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option; import org.elasticsearch.search.suggest.*; +import org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.io.IOException; -import java.util.List; - public final class PhraseSuggester extends Suggester { private final BytesRef SEPARATOR = new BytesRef(" "); @@ -56,11 +54,8 @@ public final class PhraseSuggester extends Suggester { public Suggestion> innerExecute(String name, PhraseSuggestionContext suggestion, IndexReader indexReader, CharsRef spare) throws IOException { double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood(); - UnicodeUtil.UTF8toUTF16(suggestion.getText(), spare); - Suggestion.Entry