LUCENE-5489: add Rescorer/QueryRescorer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1579911 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-03-21 10:08:37 +00:00
parent 54b06fdd42
commit d3cff1ae8d
3 changed files with 572 additions and 0 deletions

View File

@ -0,0 +1,238 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.Bits;
// TODO: we could also have an ExpressionRescorer
/** A {@link Rescorer} that uses a provided Query to assign
* scores to the first-pass hits.
*
* @lucene.experimental */
public abstract class QueryRescorer extends Rescorer {

  /** Orders hits by score descending, breaking ties by docID ascending. */
  private static final Comparator<ScoreDoc> BY_SCORE_THEN_DOC = new Comparator<ScoreDoc>() {
    @Override
    public int compare(ScoreDoc a, ScoreDoc b) {
      if (a.score > b.score) {
        return -1;
      } else if (a.score < b.score) {
        return 1;
      } else {
        // This subtraction can't overflow int
        // because docIDs are >= 0:
        return a.doc - b.doc;
      }
    }
  };

  /** The second pass query used to assign new scores to the first pass hits. */
  private final Query query;

  /** Sole constructor, passing the 2nd pass query to
   *  assign scores to the 1st pass hits. */
  public QueryRescorer(Query query) {
    this.query = query;
  }

  /**
   * Implement this in a subclass to combine the first pass and
   * second pass scores.  If secondPassMatches is false then
   * the second pass query failed to match a hit from the
   * first pass query, and you should ignore the
   * secondPassScore.
   *
   * @param firstPassScore score the hit received in the first pass
   * @param secondPassMatches whether the second pass query matched the hit
   * @param secondPassScore score from the second pass query; 0.0f is passed
   *        when secondPassMatches is false
   * @return the combined score used to re-sort the hit
   */
  protected abstract float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore);

  /**
   * Runs the second pass query restricted to the first pass hits, combines
   * both scores via {@link #combine}, re-sorts by the combined score
   * (descending, ties broken by ascending docID) and returns at most topN hits.
   *
   * <p>If there are no first pass hits, or topN is 0, an empty result
   * (carrying the original totalHits and a NaN max score) is returned
   * without running the second pass query.
   *
   * @throws IllegalArgumentException if topN is negative
   */
  @Override
  public TopDocs rescore(IndexSearcher searcher, TopDocs topDocs, int topN) throws IOException {
    if (topN < 0) {
      throw new IllegalArgumentException("topN must be >= 0 (got: " + topN + ")");
    }

    final ScoreDoc[] firstPassHits = topDocs.scoreDocs;
    if (firstPassHits.length == 0 || topN == 0) {
      // Nothing to rescore or nothing to return; there would be no
      // top hit to read the max score from further below:
      return new TopDocs(topDocs.totalHits, new ScoreDoc[0], Float.NaN);
    }

    int[] docIDs = new int[firstPassHits.length];
    for (int i = 0; i < docIDs.length; i++) {
      docIDs[i] = firstPassHits[i].doc;
    }

    // NOTE: OnlyDocIDsFilter sorts docIDs in place, which is fine
    // because the array is private to this call:
    TopDocs topDocs2 = searcher.search(query, new OnlyDocIDsFilter(docIDs), firstPassHits.length);

    // TODO: we could save small young GC cost here if we
    // cloned the incoming ScoreDoc[], sorted that by doc,
    // passed that to OnlyDocIDsFilter, sorted 2nd pass
    // TopDocs by doc, did a merge sort to combine the
    // scores, and finally re-sorted by the combined score,
    // but that is sizable added code complexity for minor
    // GC savings:
    Map<Integer,Float> newScores = new HashMap<Integer,Float>();
    for (ScoreDoc sd : topDocs2.scoreDocs) {
      newScores.put(sd.doc, sd.score);
    }

    ScoreDoc[] newHits = new ScoreDoc[firstPassHits.length];
    for (int i = 0; i < firstPassHits.length; i++) {
      ScoreDoc sd = firstPassHits[i];
      Float newScore = newScores.get(sd.doc);
      float combinedScore;
      if (newScore == null) {
        // The second pass query did not match this hit:
        combinedScore = combine(sd.score, false, 0.0f);
      } else {
        combinedScore = combine(sd.score, true, newScore.floatValue());
      }
      newHits[i] = new ScoreDoc(sd.doc, combinedScore);
    }

    // TODO: we should do a partial sort (of only topN)
    // instead, but typically the number of hits is
    // smallish:
    Arrays.sort(newHits, BY_SCORE_THEN_DOC);

    if (topN < newHits.length) {
      ScoreDoc[] subset = new ScoreDoc[topN];
      System.arraycopy(newHits, 0, subset, 0, topN);
      newHits = subset;
    }

    // newHits is non-empty here, and sorted best-first:
    return new TopDocs(topDocs.totalHits, newHits, newHits[0].score);
  }

  /**
   * Explains the combined score of one document: the result carries the
   * value returned by {@link #combine} and has two details, one wrapping
   * the first pass explanation and one wrapping the second pass explanation
   * (described as "no second pass score" when the second pass query did
   * not match).
   */
  @Override
  public Explanation explain(IndexSearcher searcher, Explanation firstPassExplanation, int docID) throws IOException {
    Explanation secondPassExplanation = searcher.explain(query, docID);
    final boolean secondPassMatches = secondPassExplanation.isMatch();
    final float firstPassScore = firstPassExplanation.getValue();
    // combine() is told to ignore the second pass score on a non-match, so pass 0.0f:
    final float secondPassScore = secondPassMatches ? secondPassExplanation.getValue() : 0.0f;

    float score = combine(firstPassScore, secondPassMatches, secondPassScore);

    Explanation result = new Explanation(score, "combined first and second pass score using " + getClass());

    Explanation first = new Explanation(firstPassScore, "first pass score");
    first.addDetail(firstPassExplanation);
    result.addDetail(first);

    Explanation second;
    if (secondPassMatches) {
      second = new Explanation(secondPassScore, "second pass score");
    } else {
      second = new Explanation(0.0f, "no second pass score");
    }
    second.addDetail(secondPassExplanation);
    result.addDetail(second);

    return result;
  }

  /** Sugar API, calling {@link #rescore(IndexSearcher,TopDocs,int)} using a simple linear
   *  combination of firstPassScore + weight * secondPassScore */
  public static TopDocs rescore(IndexSearcher searcher, TopDocs topDocs, Query query, final double weight, int topN) throws IOException {
    return new QueryRescorer(query) {
      @Override
      protected float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore) {
        float score = firstPassScore;
        if (secondPassMatches) {
          score += weight * secondPassScore;
        }
        return score;
      }
    }.rescore(searcher, topDocs, topN);
  }

  /** Filter accepting only the specified docIDs */
  private static class OnlyDocIDsFilter extends Filter {

    /** Top-level docIDs to accept, sorted ascending by the constructor. */
    private final int[] docIDs;

    /** Sole constructor.  NOTE: the provided array is kept
     *  (not copied) and is sorted in place. */
    public OnlyDocIDsFilter(int[] docIDs) {
      this.docIDs = docIDs;
      Arrays.sort(docIDs);
    }

    @Override
    public DocIdSet getDocIdSet(final AtomicReaderContext context, final Bits acceptDocs) throws IOException {
      final int docBase = context.docBase;
      final int maxDoc = context.reader().maxDoc();
      // First top-level docID that no longer belongs to this segment:
      final int endDoc = docBase + maxDoc;

      // Index of the first accepted docID that is >= this segment's docBase:
      int found = Arrays.binarySearch(docIDs, docBase);
      final int startLoc = found < 0 ? -found - 1 : found;

      return new DocIdSet() {
        @Override
        public DocIdSetIterator iterator() throws IOException {
          return new DocIdSetIterator() {
            // The cursor is per-iterator, so that several iterators
            // pulled from the same DocIdSet do not disturb each other:
            private int pos = startLoc;

            // Segment-relative current doc; -1 until the first nextDoc/advance:
            private int docID = -1;

            @Override
            public int docID() {
              return docID;
            }

            @Override
            public int nextDoc() {
              if (pos == docIDs.length || docIDs[pos] >= endDoc) {
                // Ran off the end of the array, or into the next segment:
                docID = NO_MORE_DOCS;
              } else {
                docID = docIDs[pos] - docBase;
                pos++;
                assert acceptDocs == null || acceptDocs.get(docID);
              }
              return docID;
            }

            @Override
            public long cost() {
              // NOTE: not quite right, since this is cost
              // across all segments, and we are supposed to
              // return cost for just this segment:
              return docIDs.length;
            }

            @Override
            public int advance(int target) {
              if (target >= maxDoc) {
                // Beyond this segment (this includes NO_MORE_DOCS); bail out
                // before target + docBase below could overflow:
                pos = docIDs.length;
                docID = NO_MORE_DOCS;
                return docID;
              }
              // Only search from the current position onwards, so the
              // iterator can never move backwards:
              int idx = Arrays.binarySearch(docIDs, pos, docIDs.length, target + docBase);
              if (idx < 0) {
                idx = -idx - 1;
              }
              pos = idx;
              return nextDoc();
            }
          };
        }
      };
    }
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/**
* Re-scores the topN results ({@link TopDocs}) from an original
* query. See {@link QueryRescorer} for an actual
* implementation. Typically, you run a low-cost
* first-pass query across the entire index, collecting the
* top few hundred hits perhaps, and then use this class to
* mix in a more costly second pass scoring.
*
* <p>See {@link
* QueryRescorer#rescore(IndexSearcher,TopDocs,Query,double,int)}
* for a simple static method to call to rescore using a 2nd
* pass {@link Query}.
*
* @lucene.experimental
*/
public abstract class Rescorer {
/**
* Rescore an initial first-pass {@link TopDocs}.
*
* @param searcher {@link IndexSearcher} used to produce the
* first pass topDocs
* @param firstPassTopDocs Hits from the first pass
* search. It's very important that these hits were
* produced by the provided searcher; otherwise the doc
* IDs will not match!
* @param topN How many re-scored hits to return
* @return the re-scored hits, as a {@link TopDocs}
* @throws IOException declared so that implementations may
* propagate I/O failures from the searcher
*/
public abstract TopDocs rescore(IndexSearcher searcher, TopDocs firstPassTopDocs, int topN) throws IOException;
/**
* Explains how the score for the specified document was
* computed.
*
* @param searcher {@link IndexSearcher} made available to the
* implementation for producing the explanation
* @param firstPassExplanation {@link Explanation} of the
* first pass score for the document
* @param docID ID of the document whose score should be
* explained
* @return an {@link Explanation} of the re-scored value
* @throws IOException declared so that implementations may
* propagate I/O failures from the searcher
*/
public abstract Explanation explain(IndexSearcher searcher, Explanation firstPassExplanation, int docID) throws IOException;
}

View File

@ -0,0 +1,276 @@
package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Tests for {@link QueryRescorer}.  Every test indexes the same two
 * documents: both contain "wizard" and "oz", but only in document "1"
 * are the two terms adjacent (at the price of one extra token).
 */
public class TestQueryRescorer extends LuceneTestCase {

  /** Indexes the two shared test documents into dir and returns a reader
   *  over them; the writer is closed before returning. */
  private IndexReader buildIndex(Directory dir) throws Exception {
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);

    Document doc = new Document();
    doc.add(newStringField("id", "0", Field.Store.YES));
    doc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
    w.addDocument(doc);

    doc = new Document();
    doc.add(newStringField("id", "1", Field.Store.YES));
    // 1 extra token, but wizard and oz are close;
    doc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
    w.addDocument(doc);

    IndexReader r = w.getReader();
    w.close();
    return r;
  }

  /** Ordinary first pass BooleanQuery: wizard OR oz. */
  private BooleanQuery newFirstPassQuery() {
    BooleanQuery bq = new BooleanQuery();
    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
    return bq;
  }

  /** PhraseQuery "wizard oz" with the given slop. */
  private PhraseQuery newWizardOzPhrase(int slop) {
    PhraseQuery pq = new PhraseQuery();
    pq.setSlop(slop);
    pq.add(new Term("field", "wizard"));
    pq.add(new Term("field", "oz"));
    return pq;
  }

  /** In-order SpanNearQuery for wizard immediately followed by oz. */
  private SpanNearQuery newWizardOzSpanNear() {
    SpanTermQuery t1 = new SpanTermQuery(new Term("field", "wizard"));
    SpanTermQuery t2 = new SpanTermQuery(new Term("field", "oz"));
    return new SpanNearQuery(new SpanQuery[] {t1, t2}, 0, true);
  }

  /** Asserts that hits holds exactly the two test documents, in the given id order. */
  private void assertHitOrder(IndexSearcher searcher, TopDocs hits, String firstId, String secondId) throws Exception {
    assertEquals(2, hits.totalHits);
    assertEquals(firstId, searcher.doc(hits.scoreDocs[0].doc).get("id"));
    assertEquals(secondId, searcher.doc(hits.scoreDocs[1].doc).get("id"));
  }

  /** Runs the first pass query and checks that document "0" ranks ahead of document "1". */
  private TopDocs searchFirstPass(IndexSearcher searcher, Query firstPassQuery) throws Exception {
    TopDocs hits = searcher.search(firstPassQuery, 10);
    assertHitOrder(searcher, hits, "0", "1");
    return hits;
  }

  public void testBasic() throws Exception {
    Directory dir = newDirectory();
    IndexReader r = buildIndex(dir);

    // Do ordinary BooleanQuery:
    BooleanQuery bq = newFirstPassQuery();
    IndexSearcher searcher = newSearcher(r);
    TopDocs hits = searchFirstPass(searcher, bq);

    // Now, resort using PhraseQuery:
    TopDocs hits2 = QueryRescorer.rescore(searcher, hits, newWizardOzPhrase(5), 2.0, 10);

    // Resorting changed the order:
    assertHitOrder(searcher, hits2, "1", "0");

    // Resort using SpanNearQuery:
    TopDocs hits3 = QueryRescorer.rescore(searcher, hits, newWizardOzSpanNear(), 2.0, 10);

    // Resorting changed the order:
    assertHitOrder(searcher, hits3, "1", "0");

    r.close();
    dir.close();
  }

  public void testCustomCombine() throws Exception {
    Directory dir = newDirectory();
    IndexReader r = buildIndex(dir);

    // Do ordinary BooleanQuery:
    BooleanQuery bq = newFirstPassQuery();
    IndexSearcher searcher = newSearcher(r);
    TopDocs hits = searchFirstPass(searcher, bq);

    // Now, resort using PhraseQuery, but with an
    // opposite-world combine:
    TopDocs hits2 = new QueryRescorer(newWizardOzPhrase(5)) {
        @Override
        protected float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore) {
          float score = firstPassScore;
          if (secondPassMatches) {
            score -= 2.0 * secondPassScore;
          }
          return score;
        }
      }.rescore(searcher, hits, 10);

    // Resorting didn't change the order:
    assertHitOrder(searcher, hits2, "0", "1");

    r.close();
    dir.close();
  }

  public void testExplain() throws Exception {
    Directory dir = newDirectory();
    IndexReader r = buildIndex(dir);

    // Do ordinary BooleanQuery:
    BooleanQuery bq = newFirstPassQuery();
    IndexSearcher searcher = newSearcher(r);
    TopDocs hits = searchFirstPass(searcher, bq);

    // Now, resort using PhraseQuery (no slop, so only
    // document "1" gets a second pass score):
    Rescorer rescorer = new QueryRescorer(newWizardOzPhrase(0)) {
        @Override
        protected float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore) {
          float score = firstPassScore;
          if (secondPassMatches) {
            score += 2.0 * secondPassScore;
          }
          return score;
        }
      };

    TopDocs hits2 = rescorer.rescore(searcher, hits, 10);

    // Resorting changed the order:
    assertHitOrder(searcher, hits2, "1", "0");

    // Top hit: matched by the second pass query
    int docID = hits2.scoreDocs[0].doc;
    Explanation explain = rescorer.explain(searcher,
                                           searcher.explain(bq, docID),
                                           docID);
    String s = explain.toString();
    assertTrue(s.contains("TestQueryRescorer$"));
    assertTrue(s.contains("combined first and second pass score"));
    assertTrue(s.contains("first pass score"));
    assertTrue(s.contains("= second pass score"));
    assertEquals(hits2.scoreDocs[0].score, explain.getValue(), 0.0f);

    // Second hit: not matched by the second pass query
    docID = hits2.scoreDocs[1].doc;
    explain = rescorer.explain(searcher,
                               searcher.explain(bq, docID),
                               docID);
    s = explain.toString();
    assertTrue(s.contains("TestQueryRescorer$"));
    assertTrue(s.contains("combined first and second pass score"));
    assertTrue(s.contains("first pass score"));
    assertTrue(s.contains("no second pass score"));
    assertFalse(s.contains("= second pass score"));
    assertTrue(s.contains("NON-MATCH"));
    assertEquals(hits2.scoreDocs[1].score, explain.getValue(), 0.0f);

    r.close();
    dir.close();
  }

  public void testMissingSecondPassScore() throws Exception {
    Directory dir = newDirectory();
    IndexReader r = buildIndex(dir);

    // Do ordinary BooleanQuery:
    BooleanQuery bq = newFirstPassQuery();
    IndexSearcher searcher = newSearcher(r);
    TopDocs hits = searchFirstPass(searcher, bq);

    // Now, resort using PhraseQuery, no slop:
    TopDocs hits2 = QueryRescorer.rescore(searcher, hits, newWizardOzPhrase(0), 2.0, 10);

    // Resorting changed the order:
    assertHitOrder(searcher, hits2, "1", "0");

    // Resort using SpanNearQuery:
    TopDocs hits3 = QueryRescorer.rescore(searcher, hits, newWizardOzSpanNear(), 2.0, 10);

    // Resorting changed the order:
    assertHitOrder(searcher, hits3, "1", "0");

    r.close();
    dir.close();
  }
}