LUCENE-5489: add Rescorer/QueryRescorer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1579911 13f79535-47bb-0310-9956-ffa450edef68
2014-03-21 10:08:37 +00:00 · 2014-03-21 10:08:37 +00:00 · d3cff1ae8d
parent 54b06fdd42
commit d3cff1ae8d
3 changed files with 572 additions and 0 deletions
--- a/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java
@ -0,0 +1,238 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.util.Bits;
+
+// TODO: we could also have an ExpressionRescorer
+
+/** A {@link Rescorer} that uses a provided Query to assign
+ *  scores to the first-pass hits.
+ *
+ * @lucene.experimental */
+public abstract class QueryRescorer extends Rescorer {
+
+  private final Query query;
+
+  /** Sole constructor, passing the 2nd pass query to
+   *  assign scores to the 1st pass hits.  */
+  public QueryRescorer(Query query) {
+    this.query = query;
+  }
+
+  /**
+   * Implement this in a subclass to combine the first pass and
+   * second pass scores.  If secondPassMatches is false then
+   * the second pass query failed to match a hit from the
+   * first pass query, and you should ignore the
+   * secondPassScore.
+   */
+  protected abstract float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore);
+
+  @Override
+  public TopDocs rescore(IndexSearcher searcher, TopDocs topDocs, int topN) throws IOException {
+    int[] docIDs = new int[topDocs.scoreDocs.length];
+    for(int i=0;i<docIDs.length;i++) {
+      docIDs[i] = topDocs.scoreDocs[i].doc;
+    }
+
+    TopDocs topDocs2 = searcher.search(query, new OnlyDocIDsFilter(docIDs), topDocs.scoreDocs.length);
+
+    // TODO: we could save small young GC cost here if we
+    // cloned the incoming ScoreDoc[], sorted that by doc,
+    // passed that to OnlyDocIDsFilter, sorted 2nd pass
+    // TopDocs by doc, did a merge sort to combine the
+    // scores, and finally re-sorted by the combined score,
+    // but that is sizable added code complexity for minor
+    // GC savings:
+    Map<Integer,Float> newScores = new HashMap<Integer,Float>();
+    for(ScoreDoc sd : topDocs2.scoreDocs) {
+      newScores.put(sd.doc, sd.score);
+    }
+
+    ScoreDoc[] newHits = new ScoreDoc[topDocs.scoreDocs.length];
+    for(int i=0;i<topDocs.scoreDocs.length;i++) {
+      ScoreDoc sd = topDocs.scoreDocs[i];
+      Float newScore = newScores.get(sd.doc);
+      float combinedScore;
+      if (newScore == null) {
+        combinedScore = combine(sd.score, false, 0.0f);
+      } else {
+        combinedScore = combine(sd.score, true, newScore.floatValue());
+      }
+      newHits[i] = new ScoreDoc(sd.doc, combinedScore);
+    }
+
+    // TODO: we should do a partial sort (of only topN)
+    // instead, but typically the number of hits is
+    // smallish:
+    Arrays.sort(newHits,
+                new Comparator<ScoreDoc>() {
+                  @Override
+                  public int compare(ScoreDoc a, ScoreDoc b) {
+                    // Sort by score descending, then docID ascending:
+                    if (a.score > b.score) {
+                      return -1;
+                    } else if (a.score < b.score) {
+                      return 1;
+                    } else {
+                      // This subtraction can't overflow int
+                      // because docIDs are >= 0:
+                      return a.doc - b.doc;
+                    }
+                  }
+                });
+
+    if (topN < newHits.length) {
+      ScoreDoc[] subset = new ScoreDoc[topN];
+      System.arraycopy(newHits, 0, subset, 0, topN);
+      newHits = subset;
+    }
+
+    return new TopDocs(topDocs.totalHits, newHits, newHits[0].score);
+  }
+
+  @Override
+  public Explanation explain(IndexSearcher searcher, Explanation firstPassExplanation, int docID) throws IOException {
+    Explanation secondPassExplanation = searcher.explain(query, docID);
+
+    Float secondPassScore = secondPassExplanation.isMatch() ? secondPassExplanation.getValue() : null;
+
+    float score;
+    if (secondPassScore == null) {
+      score = combine(firstPassExplanation.getValue(), false, 0.0f);
+    } else {
+      score = combine(firstPassExplanation.getValue(), true,  secondPassScore.floatValue());
+    }
+
+    Explanation result = new Explanation(score, "combined first and second pass score using " + getClass());
+
+    Explanation first = new Explanation(firstPassExplanation.getValue(), "first pass score");
+    first.addDetail(firstPassExplanation);
+    result.addDetail(first);
+
+    Explanation second;
+    if (secondPassScore == null) {
+      second = new Explanation(0.0f, "no second pass score");
+    } else {
+      second = new Explanation(secondPassScore, "second pass score");
+    }
+    second.addDetail(secondPassExplanation);
+    result.addDetail(second);
+
+    return result;
+  }
+
+  /** Sugar API, calling {#rescore} using a simple linear
+   *  combination of firstPassScore + weight * secondPassScore */
+  public static TopDocs rescore(IndexSearcher searcher, TopDocs topDocs, Query query, final double weight, int topN) throws IOException {
+    return new QueryRescorer(query) {
+      @Override
+      protected float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore) {
+        float score = firstPassScore;
+        if (secondPassMatches) {
+          score += weight * secondPassScore;
+        }
+        return score;
+      }
+    }.rescore(searcher, topDocs, topN);
+  }
+
+  /** Filter accepting only the specified docIDs */
+  private static class OnlyDocIDsFilter extends Filter {
+
+    private final int[] docIDs;
+
+    /** Sole constructor. */
+    public OnlyDocIDsFilter(int[] docIDs) {
+      this.docIDs = docIDs;
+      Arrays.sort(docIDs);
+    }
+
+    @Override
+    public DocIdSet getDocIdSet(final AtomicReaderContext context, final Bits acceptDocs) throws IOException {
+      int loc = Arrays.binarySearch(docIDs, context.docBase);
+      if (loc < 0) {
+        loc = -loc-1;
+      }
+
+      final int startLoc = loc;
+      final int endDoc = context.docBase + context.reader().maxDoc();
+
+      return new DocIdSet() {
+
+        int pos = startLoc;
+
+        @Override
+        public DocIdSetIterator iterator() throws IOException {
+          return new DocIdSetIterator() {
+
+            int docID;
+
+            @Override
+            public int docID() {
+              return docID;
+            }
+
+            @Override
+            public int nextDoc() {
+              if (pos == docIDs.length) {
+                return NO_MORE_DOCS;
+              }
+              int docID = docIDs[pos];
+              if (docID >= endDoc) {
+                return NO_MORE_DOCS;
+              }
+              pos++;
+              assert acceptDocs == null || acceptDocs.get(docID-context.docBase);
+              return docID-context.docBase;
+            }
+
+            @Override
+            public long cost() {
+              // NOTE: not quite right, since this is cost
+              // across all segments, and we are supposed to
+              // return cost for just this segment:
+              return docIDs.length;
+            }
+
+            @Override
+            public int advance(int target) {
+              // TODO: this is a full binary search; we
+              // could optimize (a bit) by setting lower
+              // bound to current pos instead:
+              int loc = Arrays.binarySearch(docIDs, target + context.docBase);
+              if (loc < 0) {
+                loc = -loc-1;
+              }
+              pos = loc;
+              return nextDoc();
+            }
+          };
+        }
+      };
+    }
+  }
+}
--- a/lucene/core/src/java/org/apache/lucene/search/Rescorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/Rescorer.java
@ -0,0 +1,58 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/**
+ * Re-scores the topN results ({@link TopDocs}) from an original
+ * query.  See {@link QueryRescorer} for an actual
+ * implementation.  Typically, you run a low-cost
+ * first-pass query across the entire index, collecting the
+ * top few hundred hits perhaps, and then use this class to
+ * mix in a more costly second pass scoring.
+ *
+ * <p>See {@link
+ * QueryRescorer#rescore(IndexSearcher,TopDocs,Query,double,int)}
+ * for a simple static method to call to rescore using a 2nd
+ * pass {@link Query}.
+ *
+ * @lucene.experimental
+ */
+
+public abstract class Rescorer {
+
+  /** 
+   * Rescore an initial first-pass {@link TopDocs}.
+   *
+   * @param searcher {@link IndexSearcher} used to produce the
+   *   first pass topDocs
+   * @param firstPassTopDocs Hits from the first pass
+   *   search.  It's very important that these hits were
+   *   produced by the provided searcher; otherwise the doc
+   *   IDs will not match!
+   * @param topN How many re-scored hits to return
+   */ 
+  public abstract TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int topN) throws IOException;
+
+  /**
+   * Explains how the score for the specified document was
+   * computed.
+   */
+  public abstract Explanation explain(IndexSearcher searcher, Explanation firstPassExplanation, int docID) throws IOException;
+}
--- a/lucene/core/src/test/org/apache/lucene/search/TestQueryRescorer.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestQueryRescorer.java
@ -0,0 +1,276 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestQueryRescorer extends LuceneTestCase {
+
+  public void testBasic() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    Document doc = new Document();
+    doc.add(newStringField("id", "0", Field.Store.YES));
+    doc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(newStringField("id", "1", Field.Store.YES));
+    // 1 extra token, but wizard and oz are close;
+    doc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
+    w.addDocument(doc);
+    IndexReader r = w.getReader();
+    w.close();
+
+    // Do ordinary BooleanQuery:
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
+    IndexSearcher searcher = newSearcher(r);
+
+    TopDocs hits = searcher.search(bq, 10);
+    assertEquals(2, hits.totalHits);
+    assertEquals("0", searcher.doc(hits.scoreDocs[0].doc).get("id"));
+    assertEquals("1", searcher.doc(hits.scoreDocs[1].doc).get("id"));
+
+    // Now, resort using PhraseQuery:
+    PhraseQuery pq = new PhraseQuery();
+    pq.setSlop(5);
+    pq.add(new Term("field", "wizard"));
+    pq.add(new Term("field", "oz"));
+
+    TopDocs hits2 = QueryRescorer.rescore(searcher, hits, pq, 2.0, 10);
+
+    // Resorting changed the order:
+    assertEquals(2, hits2.totalHits);
+    assertEquals("1", searcher.doc(hits2.scoreDocs[0].doc).get("id"));
+    assertEquals("0", searcher.doc(hits2.scoreDocs[1].doc).get("id"));
+
+    // Resort using SpanNearQuery:
+    SpanTermQuery t1 = new SpanTermQuery(new Term("field", "wizard"));
+    SpanTermQuery t2 = new SpanTermQuery(new Term("field", "oz"));
+    SpanNearQuery snq = new SpanNearQuery(new SpanQuery[] {t1, t2}, 0, true);
+
+    TopDocs hits3 = QueryRescorer.rescore(searcher, hits, snq, 2.0, 10);
+
+    // Resorting changed the order:
+    assertEquals(2, hits3.totalHits);
+    assertEquals("1", searcher.doc(hits3.scoreDocs[0].doc).get("id"));
+    assertEquals("0", searcher.doc(hits3.scoreDocs[1].doc).get("id"));
+
+    r.close();
+    dir.close();
+  }
+
+  public void testCustomCombine() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    Document doc = new Document();
+    doc.add(newStringField("id", "0", Field.Store.YES));
+    doc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(newStringField("id", "1", Field.Store.YES));
+    // 1 extra token, but wizard and oz are close;
+    doc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
+    w.addDocument(doc);
+    IndexReader r = w.getReader();
+    w.close();
+
+    // Do ordinary BooleanQuery:
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
+    IndexSearcher searcher = newSearcher(r);
+
+    TopDocs hits = searcher.search(bq, 10);
+    assertEquals(2, hits.totalHits);
+    assertEquals("0", searcher.doc(hits.scoreDocs[0].doc).get("id"));
+    assertEquals("1", searcher.doc(hits.scoreDocs[1].doc).get("id"));
+
+    // Now, resort using PhraseQuery, but with an
+    // opposite-world combine:
+    PhraseQuery pq = new PhraseQuery();
+    pq.setSlop(5);
+    pq.add(new Term("field", "wizard"));
+    pq.add(new Term("field", "oz"));
+    
+    TopDocs hits2 = new QueryRescorer(pq) {
+        @Override
+        protected float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore) {
+          float score = firstPassScore;
+          if (secondPassMatches) {
+            score -= 2.0 * secondPassScore;
+          }
+          return score;
+        }
+      }.rescore(searcher, hits, 10);
+
+    // Resorting didn't change the order:
+    assertEquals(2, hits2.totalHits);
+    assertEquals("0", searcher.doc(hits2.scoreDocs[0].doc).get("id"));
+    assertEquals("1", searcher.doc(hits2.scoreDocs[1].doc).get("id"));
+
+    r.close();
+    dir.close();
+  }
+
+  public void testExplain() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    Document doc = new Document();
+    doc.add(newStringField("id", "0", Field.Store.YES));
+    doc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(newStringField("id", "1", Field.Store.YES));
+    // 1 extra token, but wizard and oz are close;
+    doc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
+    w.addDocument(doc);
+    IndexReader r = w.getReader();
+    w.close();
+
+    // Do ordinary BooleanQuery:
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
+    IndexSearcher searcher = newSearcher(r);
+
+    TopDocs hits = searcher.search(bq, 10);
+    assertEquals(2, hits.totalHits);
+    assertEquals("0", searcher.doc(hits.scoreDocs[0].doc).get("id"));
+    assertEquals("1", searcher.doc(hits.scoreDocs[1].doc).get("id"));
+
+    // Now, resort using PhraseQuery:
+    PhraseQuery pq = new PhraseQuery();
+    pq.add(new Term("field", "wizard"));
+    pq.add(new Term("field", "oz"));
+
+    Rescorer rescorer = new QueryRescorer(pq) {
+        @Override
+        protected float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore) {
+          float score = firstPassScore;
+          if (secondPassMatches) {
+            score += 2.0 * secondPassScore;
+          }
+          return score;
+        }
+      };
+
+    TopDocs hits2 = rescorer.rescore(searcher, hits, 10);
+
+    // Resorting changed the order:
+    assertEquals(2, hits2.totalHits);
+    assertEquals("1", searcher.doc(hits2.scoreDocs[0].doc).get("id"));
+    assertEquals("0", searcher.doc(hits2.scoreDocs[1].doc).get("id"));
+
+    int docID = hits2.scoreDocs[0].doc;
+    Explanation explain = rescorer.explain(searcher,
+                                           searcher.explain(bq, docID),
+                                           docID);
+    String s = explain.toString();
+    assertTrue(s.contains("TestQueryRescorer$"));
+    assertTrue(s.contains("combined first and second pass score"));
+    assertTrue(s.contains("first pass score"));
+    assertTrue(s.contains("= second pass score"));
+    assertEquals(hits2.scoreDocs[0].score, explain.getValue(), 0.0f);
+
+    docID = hits2.scoreDocs[1].doc;
+    explain = rescorer.explain(searcher,
+                               searcher.explain(bq, docID),
+                               docID);
+    s = explain.toString();
+    assertTrue(s.contains("TestQueryRescorer$"));
+    assertTrue(s.contains("combined first and second pass score"));
+    assertTrue(s.contains("first pass score"));
+    assertTrue(s.contains("no second pass score"));
+    assertFalse(s.contains("= second pass score"));
+    assertTrue(s.contains("NON-MATCH"));
+    assertEquals(hits2.scoreDocs[1].score, explain.getValue(), 0.0f);
+
+    r.close();
+    dir.close();
+  }
+
+  public void testMissingSecondPassScore() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    Document doc = new Document();
+    doc.add(newStringField("id", "0", Field.Store.YES));
+    doc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(newStringField("id", "1", Field.Store.YES));
+    // 1 extra token, but wizard and oz are close;
+    doc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
+    w.addDocument(doc);
+    IndexReader r = w.getReader();
+    w.close();
+
+    // Do ordinary BooleanQuery:
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
+    IndexSearcher searcher = newSearcher(r);
+
+    TopDocs hits = searcher.search(bq, 10);
+    assertEquals(2, hits.totalHits);
+    assertEquals("0", searcher.doc(hits.scoreDocs[0].doc).get("id"));
+    assertEquals("1", searcher.doc(hits.scoreDocs[1].doc).get("id"));
+
+    // Now, resort using PhraseQuery, no slop:
+    PhraseQuery pq = new PhraseQuery();
+    pq.add(new Term("field", "wizard"));
+    pq.add(new Term("field", "oz"));
+
+    TopDocs hits2 = QueryRescorer.rescore(searcher, hits, pq, 2.0, 10);
+
+    // Resorting changed the order:
+    assertEquals(2, hits2.totalHits);
+    assertEquals("1", searcher.doc(hits2.scoreDocs[0].doc).get("id"));
+    assertEquals("0", searcher.doc(hits2.scoreDocs[1].doc).get("id"));
+
+    // Resort using SpanNearQuery:
+    SpanTermQuery t1 = new SpanTermQuery(new Term("field", "wizard"));
+    SpanTermQuery t2 = new SpanTermQuery(new Term("field", "oz"));
+    SpanNearQuery snq = new SpanNearQuery(new SpanQuery[] {t1, t2}, 0, true);
+
+    TopDocs hits3 = QueryRescorer.rescore(searcher, hits, snq, 2.0, 10);
+
+    // Resorting changed the order:
+    assertEquals(2, hits3.totalHits);
+    assertEquals("1", searcher.doc(hits3.scoreDocs[0].doc).get("id"));
+    assertEquals("0", searcher.doc(hits3.scoreDocs[1].doc).get("id"));
+
+    r.close();
+    dir.close();
+  }
+}