diff --git a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java index b1f4320043c..62d3bd6f505 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java @@ -26,8 +26,8 @@ import org.apache.lucene.search.Explanation; public class AfterEffectB extends AfterEffect { @Override public final float score(BasicStats stats, float tfn) { - long F = stats.getTotalTermFreq(); - int n = stats.getDocFreq(); + long F = stats.getTotalTermFreq()+1; + int n = stats.getDocFreq()+1; return (F + 1) / (n * (tfn + 1)); } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java index 2ac9165d48d..125147104af 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelBE.java @@ -21,14 +21,14 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; /** * Limiting form of the Bose-Einstein model. The formula used in Lucene differs - * slightly from the one in the original paper: {@code F} is increased by {@code tfn} + * slightly from the one in the original paper: {@code F} is increased by {@code tfn+1} * and {@code N} is increased by {@code F} * @lucene.experimental */ public class BasicModelBE extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { - double F = stats.getTotalTermFreq() + tfn; + double F = stats.getTotalTermFreq() + 1 + tfn; // approximation only holds true when F << N, so we use N += F double N = F + stats.getNumberOfDocuments(); return (float)(-log2((N - 1) * Math.E) diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java index 04b76a6504a..b32356f1529 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelD.java @@ -24,7 +24,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; * for DFR. The formula used in Lucene differs slightly from the one in the * original paper: to avoid underflow for small values of {@code N} and * {@code F}, {@code N} is increased by {@code 1} and - * {@code F} is always increased by {@code tfn}. + * {@code F} is always increased by {@code tfn+1}. *

* WARNING: for terms that do not meet the expected random distribution * (e.g. stopwords), this model may give poor performance, such as @@ -37,7 +37,7 @@ public class BasicModelD extends BasicModel { // we have to ensure phi is always < 1 for tiny TTF values, otherwise nphi can go negative, // resulting in NaN. cleanest way is to unconditionally always add tfn to totalTermFreq // to create a 'normalized' F. - double F = stats.getTotalTermFreq() + tfn; + double F = stats.getTotalTermFreq() + 1 + tfn; double phi = (double)tfn / F; double nphi = 1 - phi; double p = 1.0 / (stats.getNumberOfDocuments() + 1); diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java index edd50b0f00f..3c3a6923511 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelG.java @@ -21,7 +21,7 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; /** * Geometric as limiting form of the Bose-Einstein model. The formula used in Lucene differs - * slightly from the one in the original paper: {@code F} is increased by {@code tfn} + * slightly from the one in the original paper: {@code F} is increased by {@code 1} * and {@code N} is increased by {@code F}. * @lucene.experimental */ @@ -29,7 +29,9 @@ public class BasicModelG extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F) - double lambda = stats.getTotalTermFreq() / (double) (stats.getNumberOfDocuments() + stats.getTotalTermFreq()); + double F = stats.getTotalTermFreq() + 1; + double N = stats.getNumberOfDocuments(); + double lambda = F / (N + F); // -log(1 / (lambda + 1)) -> log(lambda + 1) return (float)(log2(lambda + 1) + tfn * log2((1 + lambda) / lambda)); } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java index 41a88232ec2..d78f7ad238a 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/BasicModelP.java @@ -33,7 +33,7 @@ public class BasicModelP extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { - float lambda = (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments(); + float lambda = (float)(stats.getTotalTermFreq()+1) / (stats.getNumberOfDocuments()+1); return (float)(tfn * log2(tfn / lambda) + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E + 0.5 * log2(2 * Math.PI * tfn)); diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java index bf1df961169..1679ea83f26 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/LMSimilarity.java @@ -144,7 +144,7 @@ public abstract class LMSimilarity extends SimilarityBase { public static class DefaultCollectionModel implements CollectionModel { @Override public float computeProbability(BasicStats stats) { - return (float)stats.getTotalTermFreq() / (stats.getNumberOfFieldTokens() +1); + return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java b/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java index 7e4a8240b87..73ae600c9a6 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/LambdaDF.java @@ -20,13 +20,13 @@ package org.apache.lucene.search.similarities; import org.apache.lucene.search.Explanation; /** - * Computes lambda as {@code totalTermFreq / numberOfDocuments}. + * Computes lambda as {@code docFreq+1 / numberOfDocuments+1}. * @lucene.experimental */ public class LambdaDF extends Lambda { @Override public final float lambda(BasicStats stats) { - return (float)stats.getDocFreq() / stats.getNumberOfDocuments(); + return (stats.getDocFreq()+1F) / (stats.getNumberOfDocuments()+1F); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java b/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java index 25c55bd72ce..702409589bc 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/LambdaTTF.java @@ -20,13 +20,13 @@ package org.apache.lucene.search.similarities; import org.apache.lucene.search.Explanation; /** - * Computes lambda as {@code docFreq / numberOfDocuments}. + * Computes lambda as {@code totalTermFreq+1 / numberOfDocuments+1}. * @lucene.experimental */ public class LambdaTTF extends Lambda { @Override public final float lambda(BasicStats stats) { - return (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments(); + return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfDocuments()+1F); } @Override diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java index 97bf86a1135..b00fc17b3d6 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH3.java @@ -34,7 +34,7 @@ public class NormalizationH3 extends Normalization { @Override public float tfn(BasicStats stats, float tf, float len) { - return (tf + mu * (stats.getTotalTermFreq() / (float)stats.getNumberOfFieldTokens())) / (len + mu) * mu; + return (tf + mu * ((stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F))) / (len + mu) * mu; } @Override diff --git a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java index 110db076495..ffa9308e9dd 100644 --- a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java +++ b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarity2.java @@ -36,8 +36,12 @@ import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.spans.SpanOrQuery; +import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -215,4 +219,36 @@ public class TestSimilarity2 extends LuceneTestCase { ir.close(); dir.close(); } + + /** make sure all sims work with spanOR(termX, termY) where termY does not exist */ + public void testCrazySpans() throws Exception { + // The problem: "normal" lucene queries create scorers, returning null if terms dont exist + // This means they never score a term that does not exist. + // however with spans, there is only one scorer for the whole hierarchy: + // inner queries are not real queries, their boosts are ignored, etc. + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random, dir); + Document doc = new Document(); + FieldType ft = new FieldType(TextField.TYPE_UNSTORED); + doc.add(newField("foo", "bar", ft)); + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + IndexSearcher is = newSearcher(ir); + + for (SimilarityProvider simProvider : simProviders) { + is.setSimilarityProvider(simProvider); + SpanTermQuery s1 = new SpanTermQuery(new Term("foo", "bar")); + SpanTermQuery s2 = new SpanTermQuery(new Term("foo", "baz")); + Query query = new SpanOrQuery(s1, s2); + TopDocs td = is.search(query, 10); + assertEquals(1, td.totalHits); + float score = td.scoreDocs[0].score; + assertTrue(score >= 0.0f); + assertFalse("inf score for " + simProvider, Float.isInfinite(score)); + } + is.close(); + ir.close(); + dir.close(); + } } diff --git a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java index 4f50880302e..d438cc7659d 100644 --- a/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java +++ b/lucene/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -388,11 +388,11 @@ public class TestSimilarityBase extends LuceneTestCase { /** Correctness test for the Dirichlet LM model. */ public void testLMDirichlet() throws IOException { float p = - (FREQ + 2000.0f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f)) / + (FREQ + 2000.0f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f)) / (DOC_LEN + 2000.0f); float a = 2000.0f / (DOC_LEN + 2000.0f); float gold = (float)( - Math.log(p / (a * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f))) + + Math.log(p / (a * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f))) + Math.log(a)); correctnessTestCore(new LMDirichletSimilarity(), gold); } @@ -400,9 +400,9 @@ public class TestSimilarityBase extends LuceneTestCase { /** Correctness test for the Jelinek-Mercer LM model. */ public void testLMJelinekMercer() throws IOException { float p = (1 - 0.1f) * FREQ / DOC_LEN + - 0.1f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f); + 0.1f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f); float gold = (float)(Math.log( - p / (0.1f * TOTAL_TERM_FREQ / (NUMBER_OF_FIELD_TOKENS + 1.0f)))); + p / (0.1f * (TOTAL_TERM_FREQ + 1) / (NUMBER_OF_FIELD_TOKENS + 1.0f)))); correctnessTestCore(new LMJelinekMercerSimilarity(0.1f), gold); } @@ -412,7 +412,7 @@ public class TestSimilarityBase extends LuceneTestCase { */ public void testLLForIB() throws IOException { SimilarityBase sim = new IBSimilarity(new DistributionLL(), new LambdaDF(), new Normalization.NoNormalization()); - correctnessTestCore(sim, 4.26267987704f); + correctnessTestCore(sim, 4.178574562072754f); } /** @@ -422,7 +422,7 @@ public class TestSimilarityBase extends LuceneTestCase { public void testSPLForIB() throws IOException { SimilarityBase sim = new IBSimilarity(new DistributionSPL(), new LambdaTTF(), new Normalization.NoNormalization()); - correctnessTestCore(sim, 2.24069910825f); + correctnessTestCore(sim, 2.2387237548828125f); } /** Correctness test for the PL2 DFR model. */ @@ -432,11 +432,11 @@ public class TestSimilarityBase extends LuceneTestCase { float tfn = (float)(FREQ * SimilarityBase.log2( 1 + AVG_FIELD_LENGTH / DOC_LEN)); // 8.1894750101 float l = 1.0f / (tfn + 1.0f); // 0.108820144666 - float lambda = (1.0f * TOTAL_TERM_FREQ) / NUMBER_OF_DOCUMENTS; // 0.7 + float lambda = (1.0f + TOTAL_TERM_FREQ) / (1f + NUMBER_OF_DOCUMENTS); // 0.7029703 float p = (float)(tfn * SimilarityBase.log2(tfn / lambda) + (lambda + 1 / (12 * tfn) - tfn) * SimilarityBase.log2(Math.E) + - 0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.1113611585 - float gold = l * p; // 2.29734137536 + 0.5 * SimilarityBase.log2(2 * Math.PI * tfn)); // 21.065619 + float gold = l * p; // 2.2923636 correctnessTestCore(sim, gold); } @@ -444,14 +444,14 @@ public class TestSimilarityBase extends LuceneTestCase { public void testIneB2() throws IOException { SimilarityBase sim = new DFRSimilarity( new BasicModelIne(), new AfterEffectB(), new NormalizationH2()); - correctnessTestCore(sim, 6.23455315685f); + correctnessTestCore(sim, 5.747603416442871f); } /** Correctness test for the GL1 DFR model. */ public void testGL1() throws IOException { SimilarityBase sim = new DFRSimilarity( new BasicModelG(), new AfterEffectL(), new NormalizationH1()); - correctnessTestCore(sim, 1.6463143825531006f); + correctnessTestCore(sim, 1.6390540599822998f); } /** Correctness test for the BEB1 DFR model. */ @@ -459,34 +459,34 @@ public class TestSimilarityBase extends LuceneTestCase { SimilarityBase sim = new DFRSimilarity( new BasicModelBE(), new AfterEffectB(), new NormalizationH1()); float tfn = FREQ * AVG_FIELD_LENGTH / DOC_LEN; // 8.75 - float b = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (tfn + 1)); // 0.728205128205 - float f = TOTAL_TERM_FREQ + tfn; - float n = f + NUMBER_OF_DOCUMENTS; - float n1 = n + f - 1; // 256.5 - float m1 = n + f - tfn - 2; // 246.75 - float n2 = f; // 78.75 - float m2 = f - tfn; // 70.0 + float b = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (tfn + 1)); // 0.67132866 + double f = TOTAL_TERM_FREQ + 1 + tfn; + double n = f + NUMBER_OF_DOCUMENTS; + double n1 = n + f - 1; // 258.5 + double m1 = n + f - tfn - 2; // 248.75 + double n2 = f; // 79.75 + double m2 = f - tfn; // 71.0 float be = (float)(-SimilarityBase.log2(n - 1) - - SimilarityBase.log2(Math.E) + // -8.916400790508378 + SimilarityBase.log2(Math.E) + // -8.924494472554715 ((m1 + 0.5f) * SimilarityBase.log2(n1 / m1) + - (n1 - m1) * SimilarityBase.log2(n1)) - // 91.85089272283668 + (n1 - m1) * SimilarityBase.log2(n1)) - // 91.9620374903885 ((m2 + 0.5f) * SimilarityBase.log2(n2 / m2) + - (n2 - m2) * SimilarityBase.log2(n2))); // 67.09778276257171 - // 15.836709 - float gold = b * be; // 11.532373 + (n2 - m2) * SimilarityBase.log2(n2))); // 67.26544321004599 + // 15.7720995 + float gold = b * be; // 10.588263 correctnessTestCore(sim, gold); } /** Correctness test for the D DFR model (basic model only). */ public void testD() throws IOException { SimilarityBase sim = new DFRSimilarity(new BasicModelD(), new AfterEffect.NoAfterEffect(), new Normalization.NoNormalization()); - double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ; - double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099 - double phi = FREQ / totalTermFreqNorm; // 0.09090909090909091 - double D = phi * SimilarityBase.log2(phi / p) + // 0.17884523239871358 + double totalTermFreqNorm = TOTAL_TERM_FREQ + FREQ + 1; + double p = 1.0 / (NUMBER_OF_DOCUMENTS + 1); // 0.009900990099009901 + double phi = FREQ / totalTermFreqNorm; // 0.08974358974358974 + double D = phi * SimilarityBase.log2(phi / p) + // 0.17498542370019005 (1 - phi) * SimilarityBase.log2((1 - phi) / (1 - p)); float gold = (float)(totalTermFreqNorm * D + 0.5 * SimilarityBase.log2( - 1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.449575 + 1 + 2 * Math.PI * FREQ * (1 - phi))); // 16.328257 correctnessTestCore(sim, gold); } @@ -505,7 +505,7 @@ public class TestSimilarityBase extends LuceneTestCase { public void testIFB() throws IOException { SimilarityBase sim = new DFRSimilarity( new BasicModelIF(), new AfterEffectB(), new Normalization.NoNormalization()); - float B = (TOTAL_TERM_FREQ + 1) / (DOC_FREQ * (FREQ + 1)); // 0.8875 + float B = (TOTAL_TERM_FREQ + 1 + 1) / ((DOC_FREQ + 1) * (FREQ + 1)); // 0.8875 float IF = (float)(FREQ * SimilarityBase.log2( // 8.97759389642 1 + (NUMBER_OF_DOCUMENTS + 1) / (TOTAL_TERM_FREQ + 0.5))); float gold = B * IF; // 7.96761458307