Fix Laplace scorer to multiply by alpha (and not add) (#27125)
This commit is contained in:
parent
34666844b3
commit
bd0261916c
|
@@ -93,7 +93,7 @@ final class CandidateScorer {
|
|||
private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections, double cutoffScore, double score)
|
||||
throws IOException {
|
||||
score = Math.exp(score);
|
||||
assert Math.abs(score - score(path, candidates)) < 0.00001;
|
||||
assert Math.abs(score - score(path, candidates)) < 0.00001 : "cur_score=" + score + ", path_score=" + score(path,candidates);
|
||||
if (score > cutoffScore) {
|
||||
if (corrections.size() < maxNumCorrections) {
|
||||
Candidate[] c = new Candidate[candidates.length];
|
||||
|
|
|
@@ -38,10 +38,15 @@ final class LaplaceScorer extends WordScorer {
|
|||
return this.alpha;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double scoreUnigram(Candidate word) throws IOException {
|
||||
return (alpha + frequency(word.term)) / (vocabluarySize + alpha * numTerms);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
|
||||
join(separator, spare, w_1.term, word.term);
|
||||
return (alpha + frequency(spare.get())) / (alpha + w_1.frequency + vocabluarySize);
|
||||
return (alpha + frequency(spare.get())) / (w_1.frequency + alpha * numTerms);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@@ -49,7 +54,7 @@ final class LaplaceScorer extends WordScorer {
|
|||
join(separator, spare, w_2.term, w_1.term, word.term);
|
||||
long trigramCount = frequency(spare.get());
|
||||
join(separator, spare, w_1.term, word.term);
|
||||
return (alpha + trigramCount) / (alpha + frequency(spare.get()) + vocabluarySize);
|
||||
return (alpha + trigramCount) / (frequency(spare.get()) + alpha * numTerms);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@@ -40,8 +40,8 @@ public abstract class WordScorer {
|
|||
protected final double realWordLikelyhood;
|
||||
protected final BytesRefBuilder spare = new BytesRefBuilder();
|
||||
protected final BytesRef separator;
|
||||
protected final long numTerms;
|
||||
private final TermsEnum termsEnum;
|
||||
private final long numTerms;
|
||||
private final boolean useTotalTermFreq;
|
||||
|
||||
public WordScorer(IndexReader reader, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
|
||||
|
@@ -57,10 +57,11 @@ public abstract class WordScorer {
|
|||
final long vocSize = terms.getSumTotalTermFreq();
|
||||
this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize;
|
||||
this.useTotalTermFreq = vocSize != -1;
|
||||
long numTerms = terms.size();
|
||||
// -1 cannot be used as value, because scoreUnigram(...) can then divide by 0 if vocabluarySize is 1.
|
||||
// -1 is returned when terms is a MultiTerms instance.
|
||||
this.numTerms = vocabluarySize + numTerms > 1 ? numTerms : 0;
|
||||
// terms.size() might be -1 if it's a MultiTerms instance. In that case,
|
||||
// use reader.maxDoc() as an approximation. This also protects from
|
||||
// division by zero in scoreUnigram.
|
||||
final long nTerms = terms.size();
|
||||
this.numTerms = nTerms == -1 ? reader.maxDoc() : nTerms;
|
||||
this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
|
||||
this.reader = reader;
|
||||
this.realWordLikelyhood = realWordLikelyHood;
|
||||
|
|
|
@@ -126,7 +126,7 @@ The response contains suggestions scored by the most likely spell correction fir
|
|||
"options" : [ {
|
||||
"text" : "nobel prize",
|
||||
"highlighted": "<em>nobel</em> prize",
|
||||
"score" : 0.5962314
|
||||
"score" : 0.48614594
|
||||
}]
|
||||
}
|
||||
]
|
||||
|
|
Loading…
Reference in New Issue