Tie-break suggestions from phrase suggester by term
If two suggestions have the same score, we now break the tie by term: earlier terms (aaa) sort before later terms (zzz). Closes #5978
This commit is contained in:
parent
f79b28375d
commit
4f7792e64b
|
@ -18,6 +18,7 @@
|
|||
*/
|
||||
package org.elasticsearch.search.suggest.phrase;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
|
||||
|
@ -42,7 +43,7 @@ final class CandidateScorer {
|
|||
PriorityQueue<Correction> corrections = new PriorityQueue<Correction>(maxNumCorrections) {
|
||||
@Override
|
||||
protected boolean lessThan(Correction a, Correction b) {
|
||||
return a.score < b.score;
|
||||
return a.compareTo(b) < 0;
|
||||
}
|
||||
};
|
||||
int numMissspellings = 1;
|
||||
|
@ -98,7 +99,7 @@ final class CandidateScorer {
|
|||
Candidate[] c = new Candidate[candidates.length];
|
||||
System.arraycopy(path, 0, c, 0, path.length);
|
||||
corrections.add(new Correction(score, c));
|
||||
} else if (corrections.top().score < score) {
|
||||
} else if (corrections.top().compareTo(score, path) < 0) {
|
||||
Correction top = corrections.top();
|
||||
System.arraycopy(path, 0, top.candidates, 0, path.length);
|
||||
top.score = score;
|
||||
|
|
|
@ -23,8 +23,9 @@ import org.elasticsearch.search.suggest.SuggestUtils;
|
|||
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
//TODO public for tests
|
||||
public final class Correction {
|
||||
public final class Correction implements Comparable<Correction> {
|
||||
|
||||
public static final Correction[] EMPTY = new Correction[0];
|
||||
public double score;
|
||||
|
@ -73,4 +74,28 @@ public final class Correction {
|
|||
result.grow(len);
|
||||
return SuggestUtils.joinPreAllocated(separator, result, toJoin);
|
||||
}
|
||||
}
|
||||
|
||||
/** Lower scores sort first; if scores are equal,
|
||||
* then later terms (zzz) sort first. */
|
||||
@Override
|
||||
public int compareTo(Correction other) {
|
||||
return compareTo(other.score, other.candidates);
|
||||
}
|
||||
|
||||
int compareTo(double otherScore, Candidate[] otherCandidates) {
|
||||
if (score == otherScore) {
|
||||
int limit = Math.min(candidates.length, otherCandidates.length);
|
||||
for (int i=0;i<limit;i++) {
|
||||
int cmp = candidates[i].term.compareTo(otherCandidates[i].term);
|
||||
if (cmp != 0) {
|
||||
// Later (zzz) terms sort before (are weaker than) earlier (aaa) terms:
|
||||
return -cmp;
|
||||
}
|
||||
}
|
||||
|
||||
return candidates.length - otherCandidates.length;
|
||||
} else {
|
||||
return Double.compare(score, otherScore);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,6 +30,8 @@ import org.elasticsearch.search.suggest.SuggestUtils;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
@ -186,11 +188,15 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
|
|||
}
|
||||
|
||||
public void addCandidates(List<Candidate> candidates) {
|
||||
// Merge new candidates into existing ones,
|
||||
// deduping:
|
||||
final Set<Candidate> set = new HashSet<>(candidates);
|
||||
for (int i = 0; i < this.candidates.length; i++) {
|
||||
set.add(this.candidates[i]);
|
||||
}
|
||||
this.candidates = set.toArray(new Candidate[set.size()]);
|
||||
// Sort strongest to weakest:
|
||||
Arrays.sort(this.candidates, Collections.reverseOrder());
|
||||
}
|
||||
|
||||
public void addOneCandidate(Candidate candidate) {
|
||||
|
@ -202,7 +208,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
|
|||
|
||||
}
|
||||
|
||||
public static class Candidate {
|
||||
public static class Candidate implements Comparable<Candidate> {
|
||||
public static final Candidate[] EMPTY = new Candidate[0];
|
||||
public final BytesRef term;
|
||||
public final double stringDistance;
|
||||
|
@ -220,7 +226,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", frequency=" + frequency +
|
||||
return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", score=" + score + ", frequency=" + frequency +
|
||||
(userInput ? ", userInput" : "" ) + "]";
|
||||
}
|
||||
|
||||
|
@ -248,6 +254,17 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
|
|||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Lower scores sort first; if scores are equal, then later (zzz) terms sort first */
|
||||
@Override
|
||||
public int compareTo(Candidate other) {
|
||||
if (score == other.score) {
|
||||
// Later (zzz) terms sort before earlier (aaa) terms:
|
||||
return other.term.compareTo(term);
|
||||
} else {
|
||||
return Double.compare(score, other.score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -126,9 +126,9 @@ public final class NoisyChannelSpellChecker {
|
|||
double inputPhraseScore = scorer.score(candidates, candidateSets);
|
||||
cutoffScore = inputPhraseScore * confidence;
|
||||
}
|
||||
Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
|
||||
Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
|
||||
|
||||
return new Result(findBestCandiates, cutoffScore);
|
||||
return new Result(bestCandidates, cutoffScore);
|
||||
}
|
||||
|
||||
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
|
||||
|
|
|
@ -672,7 +672,6 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
|
|||
|
||||
@Test
|
||||
@Nightly
|
||||
@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/pull/5962")
|
||||
public void testPhraseBoundaryCases() throws ElasticsearchException, IOException {
|
||||
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
|
||||
.put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
|
||||
|
@ -751,10 +750,17 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
|
|||
phraseSuggestion.field("ngram").analyzer("myDefAnalyzer")
|
||||
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
|
||||
Suggest suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
|
||||
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
|
||||
|
||||
// "xorr the god jewel" and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by
|
||||
// earlier term (xorn):
|
||||
assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");
|
||||
|
||||
phraseSuggestion.analyzer(null);
|
||||
suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
|
||||
|
||||
// In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
|
||||
// probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
|
||||
// others that have no n-gram with the other terms in the phrase :) you can set this via realWordErrorLikelihood
|
||||
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
|
||||
}
|
||||
|
||||
|
|
|
@ -268,7 +268,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||
assertThat(corrections.length, equalTo(4));
|
||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
|
||||
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel"));
|
||||
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
|
||||
|
||||
|
||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
|
||||
|
|
Loading…
Reference in New Issue