From bd0261916c19a73da351d53d465d2394f6bde408 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Tue, 31 Oct 2017 14:08:44 +0200 Subject: [PATCH] Fix Laplace scorer to multiply by alpha (and not add) (#27125) --- .../search/suggest/phrase/CandidateScorer.java | 2 +- .../search/suggest/phrase/LaplaceScorer.java | 9 +++++++-- .../search/suggest/phrase/WordScorer.java | 11 ++++++----- .../search/suggesters/phrase-suggest.asciidoc | 2 +- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java index d24ce6b3c29..3928a16b7c9 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java @@ -93,7 +93,7 @@ final class CandidateScorer { private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue corrections, double cutoffScore, double score) throws IOException { score = Math.exp(score); - assert Math.abs(score - score(path, candidates)) < 0.00001; + assert Math.abs(score - score(path, candidates)) < 0.00001 : "cur_score=" + score + ", path_score=" + score(path,candidates); if (score > cutoffScore) { if (corrections.size() < maxNumCorrections) { Candidate[] c = new Candidate[candidates.length]; diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java index 562da448466..d9797a4207e 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java @@ -38,10 +38,15 @@ final class LaplaceScorer extends WordScorer { return this.alpha; } + @Override + protected double scoreUnigram(Candidate word) throws IOException { + return (alpha + frequency(word.term)) / 
(vocabluarySize + alpha * numTerms); + } + @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { join(separator, spare, w_1.term, word.term); - return (alpha + frequency(spare.get())) / (alpha + w_1.frequency + vocabluarySize); + return (alpha + frequency(spare.get())) / (w_1.frequency + alpha * numTerms); } @Override @@ -49,7 +54,7 @@ final class LaplaceScorer extends WordScorer { join(separator, spare, w_2.term, w_1.term, word.term); long trigramCount = frequency(spare.get()); join(separator, spare, w_1.term, word.term); - return (alpha + trigramCount) / (alpha + frequency(spare.get()) + vocabluarySize); + return (alpha + trigramCount) / (frequency(spare.get()) + alpha * numTerms); } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java index a1c41e40151..22515489ee2 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java @@ -40,8 +40,8 @@ public abstract class WordScorer { protected final double realWordLikelyhood; protected final BytesRefBuilder spare = new BytesRefBuilder(); protected final BytesRef separator; + protected final long numTerms; private final TermsEnum termsEnum; - private final long numTerms; private final boolean useTotalTermFreq; public WordScorer(IndexReader reader, String field, double realWordLikelyHood, BytesRef separator) throws IOException { @@ -57,10 +57,11 @@ public abstract class WordScorer { final long vocSize = terms.getSumTotalTermFreq(); this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize; this.useTotalTermFreq = vocSize != -1; - long numTerms = terms.size(); - // -1 cannot be used as value, because scoreUnigram(...) can then divide by 0 if vocabluarySize is 1. - // -1 is returned when terms is a MultiTerms instance. 
- this.numTerms = vocabluarySize + numTerms > 1 ? numTerms : 0; + // terms.size() might be -1 if it's a MultiTerms instance. In that case, + // use reader.maxDoc() as an approximation. This also protects + // scoreUnigram from dividing by zero. + final long nTerms = terms.size(); + this.numTerms = nTerms == -1 ? reader.maxDoc() : nTerms; this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now this.reader = reader; this.realWordLikelyhood = realWordLikelyHood; diff --git a/docs/reference/search/suggesters/phrase-suggest.asciidoc b/docs/reference/search/suggesters/phrase-suggest.asciidoc index 92138e7ecdf..cba299e97cb 100644 --- a/docs/reference/search/suggesters/phrase-suggest.asciidoc +++ b/docs/reference/search/suggesters/phrase-suggest.asciidoc @@ -126,7 +126,7 @@ The response contains suggestions scored by the most likely spell correction fir "options" : [ { "text" : "nobel prize", "highlighted": "nobel prize", - "score" : 0.5962314 + "score" : 0.48614594 }] } ]