Tie-break suggestions from phrase suggester by term

If the score for two suggestions is the same, we now tie break by term; earlier terms (aaa) sort before later terms (zzz). Closes #5978
2014-05-18 16:45:37 -04:00 · 2014-05-18 16:45:37 -04:00 · 4f7792e64b
parent f79b28375d
commit 4f7792e64b
6 changed files with 60 additions and 11 deletions
--- a/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java
+++ b/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java
@ -18,6 +18,7 @@
 */
 package org.elasticsearch.search.suggest.phrase;
 import java.io.IOException;
+import java.util.Arrays;

 import org.apache.lucene.util.PriorityQueue;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
@ -42,7 +43,7 @@ final class CandidateScorer {
        PriorityQueue<Correction> corrections = new PriorityQueue<Correction>(maxNumCorrections) {
            @Override
            protected boolean lessThan(Correction a, Correction b) {
-                return a.score < b.score;
+                return a.compareTo(b) < 0;
            }
        };
        int numMissspellings = 1;
@ -98,7 +99,7 @@ final class CandidateScorer {
                Candidate[] c = new Candidate[candidates.length];
                System.arraycopy(path, 0, c, 0, path.length);
                corrections.add(new Correction(score, c));
-            } else if (corrections.top().score < score) {
+            } else if (corrections.top().compareTo(score, path) < 0) {
                Correction top = corrections.top();
                System.arraycopy(path, 0, top.candidates, 0, path.length);
                top.score = score;
--- a/src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java
+++ b/src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java
@ -23,8 +23,9 @@ import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;

 import java.util.Arrays;
+
 //TODO public for tests
-public final class Correction {
+public final class Correction implements Comparable<Correction> {

    public static final Correction[] EMPTY = new Correction[0];
    public double score;
@ -73,4 +74,28 @@ public final class Correction {
        result.grow(len);
        return SuggestUtils.joinPreAllocated(separator, result, toJoin);
    }
-}
+
+    /** Lower scores sorts first; if scores are equal,
+     *  than later terms (zzz) sort first .*/
+    @Override
+    public int compareTo(Correction other) {
+        return compareTo(other.score, other.candidates);
+    }
+
+    int compareTo(double otherScore, Candidate[] otherCandidates) {
+        if (score == otherScore) {
+            int limit = Math.min(candidates.length, otherCandidates.length);
+            for (int i=0;i<limit;i++) {
+                int cmp = candidates[i].term.compareTo(otherCandidates[i].term);
+                if (cmp != 0) {
+                    // Later (zzz) terms sort before (are weaker than) earlier (aaa) terms:
+                    return -cmp;
+                }
+            }
+
+            return candidates.length - otherCandidates.length;
+        } else {
+            return Double.compare(score, otherScore);
+        }
+    }
+}
--- a/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java
+++ b/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java
@ -30,6 +30,8 @@ import org.elasticsearch.search.suggest.SuggestUtils;

 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@ -186,11 +188,15 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
        }
        
        public void addCandidates(List<Candidate> candidates) {
+            // Merge new candidates into existing ones,
+            // deduping:
            final Set<Candidate> set = new HashSet<>(candidates);
            for (int i = 0; i < this.candidates.length; i++) {
                set.add(this.candidates[i]);
            }
            this.candidates = set.toArray(new Candidate[set.size()]);
+            // Sort strongest to weakest:
+            Arrays.sort(this.candidates, Collections.reverseOrder());
        }

        public void addOneCandidate(Candidate candidate) {
@ -202,7 +208,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {

    }

-    public static class Candidate {
+    public static class Candidate implements Comparable<Candidate> {
        public static final Candidate[] EMPTY = new Candidate[0];
        public final BytesRef term;
        public final double stringDistance;
@ -220,7 +226,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {

        @Override
        public String toString() {
-            return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", frequency=" + frequency + 
+            return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", score=" + score + ", frequency=" + frequency + 
                    (userInput ? ", userInput" : "" ) + "]";
        }

@ -248,6 +254,17 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
                return false;
            return true;
        }
+
+        /** Lower scores sort first; if scores are equal, then later (zzz) terms sort first */
+        @Override
+        public int compareTo(Candidate other) {
+            if (score == other.score) {
+                // Later (zzz) terms sort before earlier (aaa) terms:
+                return other.term.compareTo(term);
+            } else {
+                return Double.compare(score, other.score);
+            }
+        }
    }

    @Override
--- a/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java
+++ b/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java
@ -126,9 +126,9 @@ public final class NoisyChannelSpellChecker {
            double inputPhraseScore = scorer.score(candidates, candidateSets);
            cutoffScore = inputPhraseScore * confidence;
        }
-        Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
+        Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
        
-        return new Result(findBestCandiates, cutoffScore);
+        return new Result(bestCandidates, cutoffScore);
    }

    public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
--- a/src/test/java/org/elasticsearch/search/suggest/SuggestSearchTests.java
+++ b/src/test/java/org/elasticsearch/search/suggest/SuggestSearchTests.java
@ -672,7 +672,6 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {

    @Test
    @Nightly
-    @LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/pull/5962")
    public void testPhraseBoundaryCases() throws ElasticsearchException, IOException {
        CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
                .put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
@ -751,10 +750,17 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
        phraseSuggestion.field("ngram").analyzer("myDefAnalyzer")
                .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
        Suggest suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
-        assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
+
+        // "xorr the god jewel" and and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by
+        // earlier term (xorn):
+        assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");

        phraseSuggestion.analyzer(null);
        suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
+
+        // In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
+        // probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
+        // others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood
        assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
    }

--- a/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java
+++ b/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java
@ -268,7 +268,7 @@ public class NoisyChannelSpellCheckerTests extends ElasticsearchTestCase{
        assertThat(corrections.length, equalTo(4));
        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
        assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
-        assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel"));
+        assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));


        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;