mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-25 06:16:40 +00:00
Tie-break suggestions from phrase suggester by term
If two suggestions have the same score, we now tie-break by term: earlier terms (aaa) sort before later terms (zzz). Closes #5978
This commit is contained in:
parent
f79b28375d
commit
4f7792e64b
@ -18,6 +18,7 @@
|
|||||||
*/
|
*/
|
||||||
package org.elasticsearch.search.suggest.phrase;
|
package org.elasticsearch.search.suggest.phrase;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
import org.apache.lucene.util.PriorityQueue;
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
|
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
|
||||||
@ -42,7 +43,7 @@ final class CandidateScorer {
|
|||||||
PriorityQueue<Correction> corrections = new PriorityQueue<Correction>(maxNumCorrections) {
|
PriorityQueue<Correction> corrections = new PriorityQueue<Correction>(maxNumCorrections) {
|
||||||
@Override
|
@Override
|
||||||
protected boolean lessThan(Correction a, Correction b) {
|
protected boolean lessThan(Correction a, Correction b) {
|
||||||
return a.score < b.score;
|
return a.compareTo(b) < 0;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
int numMissspellings = 1;
|
int numMissspellings = 1;
|
||||||
@ -98,7 +99,7 @@ final class CandidateScorer {
|
|||||||
Candidate[] c = new Candidate[candidates.length];
|
Candidate[] c = new Candidate[candidates.length];
|
||||||
System.arraycopy(path, 0, c, 0, path.length);
|
System.arraycopy(path, 0, c, 0, path.length);
|
||||||
corrections.add(new Correction(score, c));
|
corrections.add(new Correction(score, c));
|
||||||
} else if (corrections.top().score < score) {
|
} else if (corrections.top().compareTo(score, path) < 0) {
|
||||||
Correction top = corrections.top();
|
Correction top = corrections.top();
|
||||||
System.arraycopy(path, 0, top.candidates, 0, path.length);
|
System.arraycopy(path, 0, top.candidates, 0, path.length);
|
||||||
top.score = score;
|
top.score = score;
|
||||||
|
@ -23,8 +23,9 @@ import org.elasticsearch.search.suggest.SuggestUtils;
|
|||||||
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
|
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
//TODO public for tests
|
//TODO public for tests
|
||||||
public final class Correction {
|
public final class Correction implements Comparable<Correction> {
|
||||||
|
|
||||||
public static final Correction[] EMPTY = new Correction[0];
|
public static final Correction[] EMPTY = new Correction[0];
|
||||||
public double score;
|
public double score;
|
||||||
@ -73,4 +74,28 @@ public final class Correction {
|
|||||||
result.grow(len);
|
result.grow(len);
|
||||||
return SuggestUtils.joinPreAllocated(separator, result, toJoin);
|
return SuggestUtils.joinPreAllocated(separator, result, toJoin);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Lower scores sort first; if scores are equal,
|
||||||
|
* then later terms (zzz) sort first. */
|
||||||
|
@Override
|
||||||
|
public int compareTo(Correction other) {
|
||||||
|
return compareTo(other.score, other.candidates);
|
||||||
|
}
|
||||||
|
|
||||||
|
int compareTo(double otherScore, Candidate[] otherCandidates) {
|
||||||
|
if (score == otherScore) {
|
||||||
|
int limit = Math.min(candidates.length, otherCandidates.length);
|
||||||
|
for (int i=0;i<limit;i++) {
|
||||||
|
int cmp = candidates[i].term.compareTo(otherCandidates[i].term);
|
||||||
|
if (cmp != 0) {
|
||||||
|
// Later (zzz) terms sort before (are weaker than) earlier (aaa) terms:
|
||||||
|
return -cmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return candidates.length - otherCandidates.length;
|
||||||
|
} else {
|
||||||
|
return Double.compare(score, otherScore);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
@ -30,6 +30,8 @@ import org.elasticsearch.search.suggest.SuggestUtils;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
@ -186,11 +188,15 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void addCandidates(List<Candidate> candidates) {
|
public void addCandidates(List<Candidate> candidates) {
|
||||||
|
// Merge new candidates into existing ones,
|
||||||
|
// deduping:
|
||||||
final Set<Candidate> set = new HashSet<>(candidates);
|
final Set<Candidate> set = new HashSet<>(candidates);
|
||||||
for (int i = 0; i < this.candidates.length; i++) {
|
for (int i = 0; i < this.candidates.length; i++) {
|
||||||
set.add(this.candidates[i]);
|
set.add(this.candidates[i]);
|
||||||
}
|
}
|
||||||
this.candidates = set.toArray(new Candidate[set.size()]);
|
this.candidates = set.toArray(new Candidate[set.size()]);
|
||||||
|
// Sort strongest to weakest:
|
||||||
|
Arrays.sort(this.candidates, Collections.reverseOrder());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addOneCandidate(Candidate candidate) {
|
public void addOneCandidate(Candidate candidate) {
|
||||||
@ -202,7 +208,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class Candidate {
|
public static class Candidate implements Comparable<Candidate> {
|
||||||
public static final Candidate[] EMPTY = new Candidate[0];
|
public static final Candidate[] EMPTY = new Candidate[0];
|
||||||
public final BytesRef term;
|
public final BytesRef term;
|
||||||
public final double stringDistance;
|
public final double stringDistance;
|
||||||
@ -220,7 +226,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", frequency=" + frequency +
|
return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", score=" + score + ", frequency=" + frequency +
|
||||||
(userInput ? ", userInput" : "" ) + "]";
|
(userInput ? ", userInput" : "" ) + "]";
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -248,6 +254,17 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
|
|||||||
return false;
|
return false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Lower scores sort first; if scores are equal, then later (zzz) terms sort first */
|
||||||
|
@Override
|
||||||
|
public int compareTo(Candidate other) {
|
||||||
|
if (score == other.score) {
|
||||||
|
// Later (zzz) terms sort before earlier (aaa) terms:
|
||||||
|
return other.term.compareTo(term);
|
||||||
|
} else {
|
||||||
|
return Double.compare(score, other.score);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -126,9 +126,9 @@ public final class NoisyChannelSpellChecker {
|
|||||||
double inputPhraseScore = scorer.score(candidates, candidateSets);
|
double inputPhraseScore = scorer.score(candidates, candidateSets);
|
||||||
cutoffScore = inputPhraseScore * confidence;
|
cutoffScore = inputPhraseScore * confidence;
|
||||||
}
|
}
|
||||||
Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
|
Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
|
||||||
|
|
||||||
return new Result(findBestCandiates, cutoffScore);
|
return new Result(bestCandidates, cutoffScore);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
|
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
|
||||||
|
@ -672,7 +672,6 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Nightly
|
@Nightly
|
||||||
@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/pull/5962")
|
|
||||||
public void testPhraseBoundaryCases() throws ElasticsearchException, IOException {
|
public void testPhraseBoundaryCases() throws ElasticsearchException, IOException {
|
||||||
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
|
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
|
||||||
.put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
|
.put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
|
||||||
@ -751,10 +750,17 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
|
|||||||
phraseSuggestion.field("ngram").analyzer("myDefAnalyzer")
|
phraseSuggestion.field("ngram").analyzer("myDefAnalyzer")
|
||||||
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
|
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
|
||||||
Suggest suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
|
Suggest suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
|
||||||
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
|
|
||||||
|
// "xorr the god jewel" and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie-break by
|
||||||
|
// earlier term (xorn):
|
||||||
|
assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");
|
||||||
|
|
||||||
phraseSuggestion.analyzer(null);
|
phraseSuggestion.analyzer(null);
|
||||||
suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
|
suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
|
||||||
|
|
||||||
|
// In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
|
||||||
|
// probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
|
||||||
|
// others that have no n-gram with the other terms in the phrase :) you can tune this via realWordErrorLikelihood
|
||||||
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
|
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -268,7 +268,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
|
|||||||
assertThat(corrections.length, equalTo(4));
|
assertThat(corrections.length, equalTo(4));
|
||||||
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
|
||||||
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
|
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
|
||||||
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel"));
|
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
|
||||||
|
|
||||||
|
|
||||||
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
|
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user