Tie-break suggestions from phrase suggester by term

If the score for two suggestions is the same, we now tie break by term; earlier terms (aaa) sort before later terms (zzz).

Closes #5978
This commit is contained in:
mikemccand 2014-05-18 16:45:37 -04:00
parent f79b28375d
commit 4f7792e64b
6 changed files with 60 additions and 11 deletions

View File

@ -18,6 +18,7 @@
*/
package org.elasticsearch.search.suggest.phrase;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.util.PriorityQueue;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
@ -42,7 +43,7 @@ final class CandidateScorer {
PriorityQueue<Correction> corrections = new PriorityQueue<Correction>(maxNumCorrections) {
@Override
protected boolean lessThan(Correction a, Correction b) {
return a.score < b.score;
return a.compareTo(b) < 0;
}
};
int numMissspellings = 1;
@ -98,7 +99,7 @@ final class CandidateScorer {
Candidate[] c = new Candidate[candidates.length];
System.arraycopy(path, 0, c, 0, path.length);
corrections.add(new Correction(score, c));
} else if (corrections.top().score < score) {
} else if (corrections.top().compareTo(score, path) < 0) {
Correction top = corrections.top();
System.arraycopy(path, 0, top.candidates, 0, path.length);
top.score = score;

View File

@ -23,8 +23,9 @@ import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.util.Arrays;
//TODO public for tests
public final class Correction {
public final class Correction implements Comparable<Correction> {
public static final Correction[] EMPTY = new Correction[0];
public double score;
@ -73,4 +74,28 @@ public final class Correction {
result.grow(len);
return SuggestUtils.joinPreAllocated(separator, result, toJoin);
}
}
/** Lower scores sorts first; if scores are equal,
* than later terms (zzz) sort first .*/
@Override
public int compareTo(Correction other) {
return compareTo(other.score, other.candidates);
}
int compareTo(double otherScore, Candidate[] otherCandidates) {
if (score == otherScore) {
int limit = Math.min(candidates.length, otherCandidates.length);
for (int i=0;i<limit;i++) {
int cmp = candidates[i].term.compareTo(otherCandidates[i].term);
if (cmp != 0) {
// Later (zzz) terms sort before (are weaker than) earlier (aaa) terms:
return -cmp;
}
}
return candidates.length - otherCandidates.length;
} else {
return Double.compare(score, otherScore);
}
}
}

View File

@ -30,6 +30,8 @@ import org.elasticsearch.search.suggest.SuggestUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@ -186,11 +188,15 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
}
public void addCandidates(List<Candidate> candidates) {
// Merge new candidates into existing ones,
// deduping:
final Set<Candidate> set = new HashSet<>(candidates);
for (int i = 0; i < this.candidates.length; i++) {
set.add(this.candidates[i]);
}
this.candidates = set.toArray(new Candidate[set.size()]);
// Sort strongest to weakest:
Arrays.sort(this.candidates, Collections.reverseOrder());
}
public void addOneCandidate(Candidate candidate) {
@ -202,7 +208,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
}
public static class Candidate {
public static class Candidate implements Comparable<Candidate> {
public static final Candidate[] EMPTY = new Candidate[0];
public final BytesRef term;
public final double stringDistance;
@ -220,7 +226,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
@Override
public String toString() {
return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", frequency=" + frequency +
return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", score=" + score + ", frequency=" + frequency +
(userInput ? ", userInput" : "" ) + "]";
}
@ -248,6 +254,17 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
return false;
return true;
}
/** Lower scores sort first; if scores are equal, then later (zzz) terms sort first */
@Override
public int compareTo(Candidate other) {
if (score == other.score) {
// Later (zzz) terms sort before earlier (aaa) terms:
return other.term.compareTo(term);
} else {
return Double.compare(score, other.score);
}
}
}
@Override

View File

@ -126,9 +126,9 @@ public final class NoisyChannelSpellChecker {
double inputPhraseScore = scorer.score(candidates, candidateSets);
cutoffScore = inputPhraseScore * confidence;
}
Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
return new Result(findBestCandiates, cutoffScore);
return new Result(bestCandidates, cutoffScore);
}
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,

View File

@ -672,7 +672,6 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
@Test
@Nightly
@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/pull/5962")
public void testPhraseBoundaryCases() throws ElasticsearchException, IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
.put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
@ -751,10 +750,17 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
phraseSuggestion.field("ngram").analyzer("myDefAnalyzer")
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
Suggest suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
// "xorr the god jewel" and and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by
// earlier term (xorn):
assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");
phraseSuggestion.analyzer(null);
suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
// In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
// probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
// others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
}

View File

@ -268,7 +268,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;