LUCENE-5867: Add a BooleanSimilarity.

2016-11-10 14:05:13 +01:00 · 2016-11-10 14:05:13 +01:00 · 3e15233b23
parent c415bc8d1d
commit 3e15233b23
5 changed files with 218 additions and 3 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -55,7 +55,10 @@ Other
 * LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)

 ======================= Lucene 6.4.0 =======================
-(No Changes)
+
+New features
+
+* LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)

 Improvements

--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Simple similarity that gives terms a score that is equal to their query
+ * boost. This similarity is typically used with disabled norms since neither
+ * document statistics nor index statistics are used for scoring. That said,
+ * if norms are enabled, they will be computed the same way as
+ * {@link SimilarityBase} and {@link BM25Similarity} with
+ * {@link SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps}
+ * so that the {@link Similarity} can be changed after the index has been
+ * created.
+ */
+public class BooleanSimilarity extends Similarity {
+
+  private static final Similarity BM25_SIM = new BM25Similarity();
+
+  /** Sole constructor */
+  public BooleanSimilarity() {}
+
+  @Override
+  public long computeNorm(FieldInvertState state) {
+    return BM25_SIM.computeNorm(state);
+  }
+
+  @Override
+  public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+    return new BooleanWeight(boost);
+  }
+
+  private static class BooleanWeight extends SimWeight {
+    final float boost;
+
+    BooleanWeight(float boost) {
+      this.boost = boost;
+    }
+  }
+
+  @Override
+  public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    final float boost = ((BooleanWeight) weight).boost;
+
+    return new SimScorer() {
+
+      @Override
+      public float score(int doc, float freq) throws IOException {
+        return boost;
+      }
+
+      @Override
+      public Explanation explain(int doc, Explanation freq) throws IOException {
+        Explanation queryBoostExpl = Explanation.match(boost, "query boost");
+        return Explanation.match(
+            queryBoostExpl.getValue(),
+            "score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",
+            queryBoostExpl);
+      }
+
+      @Override
+      public float computeSlopFactor(int distance) {
+        return 1f;
+      }
+
+      @Override
+      public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+        return 1f;
+      }
+    };
+  }
+
+}
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java
@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestBooleanSimilarity extends LuceneTestCase {
+
+  public void testTermScoreIsEqualToBoost() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
+        newIndexWriterConfig());
+    Document doc = new Document();
+    doc.add(new StringField("foo", "bar", Store.NO));
+    doc.add(new StringField("foo", "baz", Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("foo", "bar", Store.NO));
+    doc.add(new StringField("foo", "bar", Store.NO));
+    w.addDocument(doc);
+
+    DirectoryReader reader = w.getReader();
+    w.close();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(new BooleanSimilarity());
+    TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "bar")), 2);
+    assertEquals(2, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+    assertEquals(1f, topDocs.scoreDocs[1].score, 0f);
+
+    topDocs = searcher.search(new TermQuery(new Term("foo", "baz")), 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+
+    topDocs = searcher.search(new BoostQuery(new TermQuery(new Term("foo", "baz")), 3f), 1);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(3f, topDocs.scoreDocs[0].score, 0f);
+
+    reader.close();
+    dir.close();
+  }
+
+  public void testPhraseScoreIsEqualToBoost() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
+        newIndexWriterConfig().setSimilarity(new BooleanSimilarity()));
+    Document doc = new Document();
+    doc.add(new TextField("foo", "bar baz quux", Store.NO));
+    w.addDocument(doc);
+
+    DirectoryReader reader = w.getReader();
+    w.close();
+    IndexSearcher searcher = newSearcher(reader);
+    searcher.setSimilarity(new BooleanSimilarity());
+
+    PhraseQuery query = new PhraseQuery(2, "foo", "bar", "quux");
+
+    TopDocs topDocs = searcher.search(query, 2);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
+
+    topDocs = searcher.search(new BoostQuery(query, 7), 2);
+    assertEquals(1, topDocs.totalHits);
+    assertEquals(7f, topDocs.scoreDocs[0].score, 0f);
+
+    reader.close();
+    dir.close();
+  }
+
+  public void testSameNormsAsBM25() {
+    BooleanSimilarity sim1 = new BooleanSimilarity();
+    BM25Similarity sim2 = new BM25Similarity();
+    sim2.setDiscountOverlaps(true);
+    for (int iter = 0; iter < 100; ++iter) {
+      final int length = TestUtil.nextInt(random(), 1, 100);
+      final int position = random().nextInt(length);
+      final int numOverlaps = random().nextInt(50);
+      final float boost = random().nextFloat() * 10;
+      FieldInvertState state = new FieldInvertState("foo", position, length, numOverlaps, 100, boost);
+      assertEquals(
+          sim2.computeNorm(state),
+          sim1.computeNorm(state),
+          0f);
+    }
+  }
+}
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java
@ -91,6 +91,7 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
    allSims = new ArrayList<>();
    allSims.add(new ClassicSimilarity());
    allSims.add(new BM25Similarity());
+    allSims.add(new BooleanSimilarity());
    for (BasicModel basicModel : BASIC_MODELS) {
      for (AfterEffect afterEffect : AFTER_EFFECTS) {
        for (Normalization normalization : NORMALIZATIONS) {
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
@ -36,7 +36,6 @@ import org.apache.lucene.codecs.lucene70.Lucene70Codec;
 import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
 import org.apache.lucene.index.RandomCodec;
-import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.search.similarities.RandomSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
@ -213,7 +212,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
    TimeZone randomTimeZone = randomTimeZone(random());
    timeZone = testTimeZone.equals("random") ? randomTimeZone : TimeZone.getTimeZone(testTimeZone);
    TimeZone.setDefault(timeZone);
-    similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarity(random());
+    similarity = new RandomSimilarity(random());

    // Check codec restrictions once at class level.
    try {