LUCENE-1341: Added BoostingNearQuery

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@801667 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2009-08-06 15:14:18 +00:00
parent fc369cdf2f
commit d4eeefff81
6 changed files with 392 additions and 4 deletions

View File

@ -662,6 +662,9 @@ New features
the javadocs implied. If you have payloads and want to use an ordered
SpanNearQuery that does not need to use the payloads, you can
disable loading them with a new constructor switch. (Mark Miller)
34. LUCENE-1341: Added BoostingNearQuery to enable SpanNearQuery functionality
with payloads (Peter Keegan, Grant Ingersoll)
Optimizations

View File

@ -0,0 +1,151 @@
package org.apache.lucene.search.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.QueryWeight;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.SpanScorer;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.NearSpansOrdered;
import org.apache.lucene.search.spans.NearSpansUnordered;
import java.io.IOException;
import java.util.Iterator;
import java.util.Collection;
/**
* The BoostingNearQuery is very similar to the {@link org.apache.lucene.search.spans.SpanNearQuery} except
* that it factors in the value of the payloads located at each of the positions where the
* {@link org.apache.lucene.search.spans.TermSpans} occurs.
* <p>
* In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)}
* which returns 1 by default.
* <p>
* Payload scores are averaged across term occurrences in the document.
*
* @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int)
*/
public class BoostingNearQuery extends SpanNearQuery {
String fieldName;
public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) {
super(clauses, slop, inOrder);
fieldName = clauses[0].getField(); // all clauses must have same field
}
public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
return new BoostingSpanWeight(this, searcher);
}
public class BoostingSpanWeight extends SpanWeight {
public BoostingSpanWeight(SpanQuery query, Searcher searcher) throws IOException {
super(query, searcher);
}
public Scorer scorer(IndexReader reader) throws IOException {
return new BoostingSpanScorer(query.getSpans(reader), this,
similarity,
reader.norms(query.getField()));
}
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
return new BoostingSpanScorer(query.getSpans(reader), this,
similarity,
reader.norms(query.getField()));
}
}
public class BoostingSpanScorer extends SpanScorer {
Spans spans;
Spans[] subSpans = null;
protected float payloadScore;
private int payloadsSeen;
Similarity similarity = getSimilarity();
protected BoostingSpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
throws IOException {
super(spans, weight, similarity, norms);
this.spans = spans;
}
// Get the payloads associated with all underlying subspans
public void getPayloads(Spans[] subSpans) throws IOException {
for (int i = 0; i < subSpans.length; i++) {
if (subSpans[i] instanceof NearSpansOrdered) {
if (((NearSpansOrdered)subSpans[i]).isPayloadAvailable()) {
processPayloads(((NearSpansOrdered)subSpans[i]).getPayload());
}
getPayloads(((NearSpansOrdered) subSpans[i]).getSubSpans());
} else if (subSpans[i] instanceof NearSpansUnordered) {
if (((NearSpansUnordered)subSpans[i]).isPayloadAvailable()) {
processPayloads(((NearSpansUnordered)subSpans[i]).getPayload());
}
getPayloads(((NearSpansUnordered) subSpans[i]).getSubSpans());
}
}
}
/**
* By default, sums the payloads, but can be overridden to do other things.
* @param payLoads The payloads
*/
protected void processPayloads(Collection payLoads) {
for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) {
byte[] thePayload = (byte[]) iterator.next();
++payloadsSeen;
payloadScore += similarity.scorePayload(fieldName, thePayload, 0, thePayload.length);
}
}
//
protected boolean setFreqCurrentDoc() throws IOException {
Spans[] spansArr = new Spans[1];
spansArr[0] = spans;
payloadScore = 0;
payloadsSeen = 0;
getPayloads(spansArr);
return super.setFreqCurrentDoc();
}
public float score() throws IOException {
return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
}
public Explanation explain(int doc) throws IOException {
Explanation result = new Explanation();
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
Explanation payloadBoost = new Explanation();
result.addDetail(payloadBoost);
float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
payloadBoost.setValue(avgPayloadScore);
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
result.setDescription("bnq, product of:");
return result;
}
}
}

View File

@ -46,8 +46,12 @@ import java.util.Set;
* matches twice:
* <pre>t1 t2 .. t3 </pre>
* <pre> t1 .. t2 t3</pre>
*
*
* Expert:
* Only public for subclassing. Most implementations should not need this class
*/
class NearSpansOrdered implements Spans {
public class NearSpansOrdered implements Spans {
private final int allowedSlop;
private boolean firstTime = true;
private boolean more = false;
@ -104,6 +108,10 @@ class NearSpansOrdered implements Spans {
// inherit javadocs
public int end() { return matchEnd; }
public Spans[] getSubSpans() {
return subSpans;
}
// TODO: Remove warning after API has been finalized
// TODO: Would be nice to be able to lazy load payloads

View File

@ -27,10 +27,15 @@ import java.util.List;
import java.util.Set;
import java.util.HashSet;
class NearSpansUnordered implements Spans {
/**
* Expert:
* Only public for subclassing. Most implementations should not need this class
*/
public class NearSpansUnordered implements Spans {
private SpanNearQuery query;
private List ordered = new ArrayList(); // spans in query order
private Spans[] subSpans;
private int slop; // from query
private SpansCell first; // linked list of spans
@ -122,13 +127,17 @@ class NearSpansUnordered implements Spans {
SpanQuery[] clauses = query.getClauses();
queue = new CellQueue(clauses.length);
subSpans = new Spans[clauses.length];
for (int i = 0; i < clauses.length; i++) {
SpansCell cell =
new SpansCell(clauses[i].getSpans(reader), i);
ordered.add(cell);
subSpans[i] = cell.spans;
}
}
public Spans[] getSubSpans() {
return subSpans;
}
public boolean next() throws IOException {
if (firstTime) {
initList(true);

View File

@ -38,7 +38,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
private int slop;
private boolean inOrder;
private String field;
protected String field;
private boolean collectPayloads;
/** Construct a SpanNearQuery. Matches spans matching a span from each

View File

@ -0,0 +1,217 @@
package org.apache.lucene.search.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
public class TestBoostingNearQuery extends LuceneTestCase {
private IndexSearcher searcher;
private BoostingSimilarity similarity = new BoostingSimilarity();
private byte[] payload2 = new byte[]{2};
private byte[] payload4 = new byte[]{4};
public TestBoostingNearQuery(String s) {
super(s);
}
private class PayloadAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new LowerCaseTokenizer(reader);
result = new PayloadFilter(result, fieldName);
return result;
}
}
private class PayloadFilter extends TokenFilter {
String fieldName;
int numSeen = 0;
protected PayloadAttribute payAtt;
public PayloadFilter(TokenStream input, String fieldName) {
super(input);
this.fieldName = fieldName;
payAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
public boolean incrementToken() throws IOException {
boolean result = false;
if (input.incrementToken() == true){
if (numSeen % 2 == 0) {
payAtt.setPayload(new Payload(payload2));
} else {
payAtt.setPayload(new Payload(payload4));
}
numSeen++;
result = true;
}
return result;
}
}
private BoostingNearQuery newPhraseQuery (String fieldName, String phrase, boolean inOrder) {
int n;
String[] words = phrase.split("[\\s]+");
SpanQuery clauses[] = new SpanQuery[words.length];
for (int i=0;i<clauses.length;i++) {
clauses[i] = new BoostingTermQuery(new Term(fieldName, words[i]));
}
return new BoostingNearQuery(clauses, 0, inOrder);
}
protected void setUp() throws Exception {
super.setUp();
RAMDirectory directory = new RAMDirectory();
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer
= new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
writer.setSimilarity(similarity);
//writer.infoStream = System.out;
for (int i = 0; i < 1000; i++) {
Document doc = new Document();
doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument(doc);
}
writer.optimize();
writer.close();
searcher = new IndexSearcher(directory, true);
searcher.setSimilarity(similarity);
}
public void test() throws IOException {
BoostingNearQuery query;
TopDocs hits;
query = newPhraseQuery("field", "twenty two", true);
// all 10 hits should have score = 3 because adjacent terms have payloads of 2,4
// and all the similarity factors are set to 1
hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("should be 10 hits", hits.totalHits == 10);
for (int j = 0; j < hits.scoreDocs.length; j++) {
ScoreDoc doc = hits.scoreDocs[j];
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}
for (int i=1;i<10;i++) {
query = newPhraseQuery("field", English.intToEnglish(i)+" hundred", true);
// all should have score = 3 because adjacent terms have payloads of 2,4
// and all the similarity factors are set to 1
hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("should be 100 hits", hits.totalHits == 100);
for (int j = 0; j < hits.scoreDocs.length; j++) {
ScoreDoc doc = hits.scoreDocs[j];
// System.out.println("Doc: " + doc.toString());
// System.out.println("Explain: " + searcher.explain(query, doc.doc));
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}
}
}
public void testLongerSpan() throws IOException {
BoostingNearQuery query;
TopDocs hits;
query = newPhraseQuery("field", "nine hundred ninety nine", true);
hits = searcher.search(query, null, 100);
ScoreDoc doc = hits.scoreDocs[0];
// System.out.println("Doc: " + doc.toString());
// System.out.println("Explain: " + searcher.explain(query, doc.doc));
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("there should only be one hit", hits.totalHits == 1);
// should have score = 3 because adjacent terms have payloads of 2,4
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}
public void testComplexNested() throws IOException {
BoostingNearQuery query;
TopDocs hits;
// combine ordered and unordered spans with some nesting to make sure all payloads are counted
SpanQuery q1 = newPhraseQuery("field", "nine hundred", true);
SpanQuery q2 = newPhraseQuery("field", "ninety nine", true);
SpanQuery q3 = newPhraseQuery("field", "nine ninety", false);
SpanQuery q4 = newPhraseQuery("field", "hundred nine", false);
SpanQuery[]clauses = new SpanQuery[] {new BoostingNearQuery(new SpanQuery[] {q1,q2}, 0, true), new BoostingNearQuery(new SpanQuery[] {q3,q4}, 0, false)};
query = new BoostingNearQuery(clauses, 0, false);
hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
// should be only 1 hit - doc 999
assertTrue("should only be one hit", hits.scoreDocs.length == 1);
// the score should be 3 - the average of all the underlying payloads
ScoreDoc doc = hits.scoreDocs[0];
// System.out.println("Doc: " + doc.toString());
// System.out.println("Explain: " + searcher.explain(query, doc.doc));
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}
// must be static for weight serialization tests
static class BoostingSimilarity extends DefaultSimilarity {
public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
return payload[0];
}
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
//Make everything else 1 so we see the effect of the payload
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
public float lengthNorm(String fieldName, int numTerms) {
return 1;
}
public float queryNorm(float sumOfSquaredWeights) {
return 1;
}
public float sloppyFreq(int distance) {
return 1;
}
public float coord(int overlap, int maxOverlap) {
return 1;
}
public float tf(float freq) {
return 1;
}
// idf used for phrase queries
public float idf(Collection terms, Searcher searcher) {
return 1;
}
}
}