LUCENE-1790: small refactor of Payload queries, plus add in some new payload query functionality

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@802174 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2009-08-07 20:34:58 +00:00
parent 911df49bcb
commit e079d1cec3
13 changed files with 756 additions and 147 deletions

View File

@ -664,7 +664,11 @@ New features
disable loading them with a new constructor switch. (Mark Miller)
34. LUCENE-1341: Added BoostingNearQuery to enable SpanNearQuery functionality
with payloads (Peter Keegan, Grant Ingersoll)
with payloads (Peter Keegan, Grant Ingersoll)
35. LUCENE-1790: Added BoostingFunctionTermQuery to enable scoring of payloads
based on the maximum payload seen for a document.
Slight refactoring of Similarity and other payload queries (Grant Ingersoll)
Optimizations

View File

@ -290,6 +290,8 @@ public abstract class Similarity implements Serializable {
/** The Similarity implementation used by default. */
private static Similarity defaultImpl = new DefaultSimilarity();
public static final int NO_DOC_ID_PROVIDED = -1;
/** Set the default Similarity implementation used by indexing and search
* code.
*
@ -529,6 +531,8 @@ public abstract class Similarity implements Serializable {
public abstract float coord(int overlap, int maxOverlap);
/**
* Calculate a scoring factor based on the data in the payload. Overriding implementations
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
@ -540,11 +544,35 @@ public abstract class Similarity implements Serializable {
* @param payload The payload byte array to be scored
* @param offset The offset into the payload array
* @param length The length in the array
* @return An implementation dependent float to be used as a scoring factor
* @return An implementation dependent float to be used as a scoring factor
*
* @deprecated See {@link #scorePayload(int, String, byte[], int, int)}
*/
public float scorePayload(String fieldName, byte [] payload, int offset, int length)
{
  // Deprecated entry point: forwards to the docId-aware overload, passing
  // NO_DOC_ID_PROVIDED to signal that no document id is available.
  return scorePayload(NO_DOC_ID_PROVIDED, fieldName, payload, offset, length);
}
/**
* Calculate a scoring factor based on the data in the payload. Overriding implementations
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
* what is in the byte array.
* <p>
* The default implementation returns 1.
*
* @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information
* @param fieldName The fieldName of the term this payload belongs to
* @param payload The payload byte array to be scored
* @param offset The offset into the payload array
* @param length The length in the array
* @return An implementation dependent float to be used as a scoring factor
*
*/
public float scorePayload(int docId, String fieldName, byte [] payload, int offset, int length)
{
  // Default implementation: neutral scoring factor, so payloads have no effect
  // on the score unless a subclass overrides this method.
  return 1;
}
}

View File

@ -0,0 +1,36 @@
package org.apache.lucene.search.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Calculates the final payload score for a document as the average of all payload
 * scores seen on that document, or 1 (a neutral factor) when no payloads were seen.
 * <p/>
 * Is thread safe and completely reusable.
 *
 **/
public class AveragePayloadFunction extends PayloadFunction{

  /**
   * Accumulates a running sum of the payload scores seen so far.
   */
  public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) {
    return currentPayloadScore + currentScore;
  }

  /**
   * Returns the average of the accumulated payload scores, or 1 when no
   * payloads were seen on the document.
   */
  public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) {
    return numPayloadsSeen > 0 ? (payloadScore / numPayloadsSeen) : 1;
  }

  /**
   * All instances are stateless and interchangeable, so equality is class-based.
   * Required because query implementations (e.g. BoostingFunctionTermQuery)
   * compare their PayloadFunction instances in equals().
   */
  public boolean equals(Object obj) {
    return obj != null && getClass() == obj.getClass();
  }

  public int hashCode() {
    return getClass().hashCode();
  }
}

View File

@ -0,0 +1,180 @@
package org.apache.lucene.search.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.QueryWeight;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.SpanScorer;
import java.io.IOException;
/**
 * A {@link SpanTermQuery} whose score also factors in the payloads found at the
 * positions where the term matches.  Unlike
 * {@link org.apache.lucene.search.payloads.BoostingTermQuery}, which always
 * averages payload scores, the per-document payload contribution here is
 * delegated to the {@link PayloadFunction} supplied at construction time
 * (e.g. max, min, or average).
 * <p/>
 * Individual payload values are scored via
 * {@link org.apache.lucene.search.Similarity#scorePayload(int, String, byte[], int, int)},
 * which returns 1 unless overridden.
 *
 **/
public class BoostingFunctionTermQuery extends SpanTermQuery implements PayloadQuery{
  // Combines the individual payload scores into one per-document score.
  protected PayloadFunction function;
  // When false, score() returns only the payload part, not spanScore * payloadScore.
  private boolean includeSpanScore;

  /** Equivalent to {@code BoostingFunctionTermQuery(term, function, true)}. */
  public BoostingFunctionTermQuery(Term term, PayloadFunction function) {
    this(term, function, true);
  }

  /**
   * @param term             the term to match
   * @param function         transforms the payload scores seen on a document into one score
   * @param includeSpanScore if true, the final score is spanScore * payloadScore;
   *                         if false, only the payload score is returned
   */
  public BoostingFunctionTermQuery(Term term, PayloadFunction function, boolean includeSpanScore) {
    super(term);
    this.function = function;
    this.includeSpanScore = includeSpanScore;
  }

  public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
    return new BoostingFunctionTermWeight(this, searcher);
  }

  protected class BoostingFunctionTermWeight extends SpanWeight {

    public BoostingFunctionTermWeight(BoostingFunctionTermQuery query, Searcher searcher) throws IOException {
      super(query, searcher);
    }

    public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
      return new BoostingFunctionSpanScorer((TermSpans) query.getSpans(reader), this,
          similarity, reader.norms(query.getField()));
    }

    protected class BoostingFunctionSpanScorer extends SpanScorer {
      //TODO: is this the best way to allocate this?
      protected byte[] payload = new byte[256];
      protected TermPositions positions;
      protected float payloadScore;   // running score produced by function.currentScore
      protected int payloadsSeen;     // number of payloads seen on the current doc

      public BoostingFunctionSpanScorer(TermSpans spans, QueryWeight weight, Similarity similarity,
                                        byte[] norms) throws IOException {
        super(spans, weight, similarity, norms);
        positions = spans.getPositions();
      }

      /**
       * Advances over every match in the current document, accumulating both the
       * sloppy frequency and the payload score for each occurrence.
       */
      protected boolean setFreqCurrentDoc() throws IOException {
        if (!more) {
          return false;
        }
        doc = spans.doc();
        freq = 0.0f;
        payloadScore = 0;
        payloadsSeen = 0;
        Similarity similarity1 = getSimilarity();
        while (more && doc == spans.doc()) {
          int matchLength = spans.end() - spans.start();
          freq += similarity1.sloppyFreq(matchLength);
          processPayload(similarity1);
          more = spans.next();//this moves positions to the next match in this document
        }
        return more || (freq != 0);
      }

      /**
       * Folds the payload at the current position (if any) into the running
       * payload score via the configured {@link PayloadFunction}.
       */
      protected void processPayload(Similarity similarity) throws IOException {
        if (positions.isPayloadAvailable()) {
          payload = positions.getPayload(payload, 0);
          payloadScore = function.currentScore(doc, term.field(), payloadsSeen, payloadScore,
              similarity.scorePayload(doc, term.field(), payload, 0, positions.getPayloadLength()));
          payloadsSeen++;
        } else {
          //zero out the payload?
        }
      }

      /**
       *
       * @return {@link #getSpanScore()} * {@link #getPayloadScore()} if
       *         includeSpanScore is true, else just {@link #getPayloadScore()}
       * @throws IOException
       */
      public float score() throws IOException {
        return includeSpanScore ? getSpanScore() * getPayloadScore() : getPayloadScore();
      }

      /**
       * Returns the SpanScorer score only.
       * <p/>
       * Should not be overriden without good cause!
       *
       * @return the score for just the Span part w/o the payload
       * @throws IOException
       *
       * @see #score()
       */
      protected float getSpanScore() throws IOException{
        return super.score();
      }

      /**
       * The score for the payload
       * @return The score, as calculated by {@link PayloadFunction#docScore(int, String, int, float)}
       */
      protected float getPayloadScore() {
        return function.docScore(doc, term.field(), payloadsSeen, payloadScore);
      }

      public Explanation explain(final int doc) throws IOException {
        ComplexExplanation result = new ComplexExplanation();
        Explanation nonPayloadExpl = super.explain(doc);
        result.addDetail(nonPayloadExpl);
        //QUESTION: Is there a way to avoid this skipTo call? We need to know whether to load the payload or not
        Explanation payloadBoost = new Explanation();
        result.addDetail(payloadBoost);
        float payloadScore = getPayloadScore();
        payloadBoost.setValue(payloadScore);
        //GSI: I suppose we could toString the payload, but I don't think that would be a good idea
        payloadBoost.setDescription("scorePayload(...)");
        // Mirror score(): when includeSpanScore is false the span part must not
        // contribute, otherwise the explanation would not match the actual score.
        result.setValue(includeSpanScore ? nonPayloadExpl.getValue() * payloadScore : payloadScore);
        result.setDescription("btq, product of:");
        result.setMatch(nonPayloadExpl.getValue()==0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303
        return result;
      }
    }
  }

  /**
   * Two queries are equal when they have the same boost, term, payload function,
   * and includeSpanScore flag (the flag changes scoring, so it must participate).
   */
  public boolean equals(Object o) {
    if (!(o instanceof BoostingFunctionTermQuery))
      return false;
    BoostingFunctionTermQuery other = (BoostingFunctionTermQuery) o;
    return (this.getBoost() == other.getBoost())
          && this.term.equals(other.term)
          && this.function.equals(other.function)
          && this.includeSpanScore == other.includeSpanScore;
  }

  /** Consistent with {@link #equals(Object)} — required by the Object contract. */
  public int hashCode() {
    int result = super.hashCode(); // presumably covers term and boost — inherited from SpanTermQuery
    result = 31 * result + function.hashCode();
    result = 31 * result + (includeSpanScore ? 1231 : 1237);
    return result;
  }
}

View File

@ -23,39 +23,46 @@ import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.SpanScorer;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.NearSpansOrdered;
import org.apache.lucene.search.spans.NearSpansUnordered;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanScorer;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.Spans;
import java.io.IOException;
import java.util.Iterator;
import java.util.Collection;
import java.util.Iterator;
/**
* The BoostingNearQuery is very similar to the {@link org.apache.lucene.search.spans.SpanNearQuery} except
* that it factors in the value of the payloads located at each of the positions where the
* {@link org.apache.lucene.search.spans.TermSpans} occurs.
* <p>
* <p/>
* In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)}
* which returns 1 by default.
* <p>
* Payload scores are averaged across term occurrences in the document.
*
* <p/>
* Payload scores are averaged across term occurrences in the document.
*
* @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int)
*/
public class BoostingNearQuery extends SpanNearQuery {
String fieldName;
public class BoostingNearQuery extends SpanNearQuery implements PayloadQuery {
protected String fieldName;
protected PayloadFunction function;
public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) {
super(clauses, slop, inOrder);
fieldName = clauses[0].getField(); // all clauses must have same field
this(clauses, slop, inOrder, new AveragePayloadFunction());
}
public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder, PayloadFunction function) {
super(clauses, slop, inOrder);
fieldName = clauses[0].getField(); // all clauses must have same field
this.function = function;
}
public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
return new BoostingSpanWeight(this, searcher);
}
@ -70,18 +77,19 @@ public class BoostingNearQuery extends SpanNearQuery {
similarity,
reader.norms(query.getField()));
}
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
return new BoostingSpanScorer(query.getSpans(reader), this,
similarity,
reader.norms(query.getField()));
return new BoostingSpanScorer(query.getSpans(reader), this,
similarity,
reader.norms(query.getField()));
}
}
public class BoostingSpanScorer extends SpanScorer {
Spans spans;
Spans spans;
Spans[] subSpans = null;
protected float payloadScore;
private int payloadsSeen;
private int payloadsSeen;
Similarity similarity = getSimilarity();
protected BoostingSpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
@ -92,58 +100,62 @@ public class BoostingNearQuery extends SpanNearQuery {
// Get the payloads associated with all underlying subspans
public void getPayloads(Spans[] subSpans) throws IOException {
for (int i = 0; i < subSpans.length; i++) {
if (subSpans[i] instanceof NearSpansOrdered) {
if (((NearSpansOrdered)subSpans[i]).isPayloadAvailable()) {
processPayloads(((NearSpansOrdered)subSpans[i]).getPayload());
}
getPayloads(((NearSpansOrdered) subSpans[i]).getSubSpans());
} else if (subSpans[i] instanceof NearSpansUnordered) {
if (((NearSpansUnordered)subSpans[i]).isPayloadAvailable()) {
processPayloads(((NearSpansUnordered)subSpans[i]).getPayload());
}
getPayloads(((NearSpansUnordered) subSpans[i]).getSubSpans());
}
}
for (int i = 0; i < subSpans.length; i++) {
if (subSpans[i] instanceof NearSpansOrdered) {
if (((NearSpansOrdered) subSpans[i]).isPayloadAvailable()) {
processPayloads(((NearSpansOrdered) subSpans[i]).getPayload());
}
getPayloads(((NearSpansOrdered) subSpans[i]).getSubSpans());
} else if (subSpans[i] instanceof NearSpansUnordered) {
if (((NearSpansUnordered) subSpans[i]).isPayloadAvailable()) {
processPayloads(((NearSpansUnordered) subSpans[i]).getPayload());
}
getPayloads(((NearSpansUnordered) subSpans[i]).getSubSpans());
}
}
}
/**
* By default, sums the payloads, but can be overridden to do other things.
*
* @param payLoads The payloads
*/
protected void processPayloads(Collection payLoads) {
for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) {
byte[] thePayload = (byte[]) iterator.next();
++payloadsSeen;
payloadScore += similarity.scorePayload(fieldName, thePayload, 0, thePayload.length);
}
}
//
protected boolean setFreqCurrentDoc() throws IOException {
Spans[] spansArr = new Spans[1];
spansArr[0] = spans;
payloadScore = 0;
payloadsSeen = 0;
getPayloads(spansArr);
return super.setFreqCurrentDoc();
}
protected void processPayloads(Collection payLoads) {
for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) {
byte[] thePayload = (byte[]) iterator.next();
payloadScore = function.currentScore(doc, fieldName, payloadsSeen, payloadScore,
similarity.scorePayload(doc, fieldName, thePayload, 0, thePayload.length));
++payloadsSeen;
}
}
//
protected boolean setFreqCurrentDoc() throws IOException {
Spans[] spansArr = new Spans[1];
spansArr[0] = spans;
payloadScore = 0;
payloadsSeen = 0;
getPayloads(spansArr);
return super.setFreqCurrentDoc();
}
public float score() throws IOException {
return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
return super.score() * function.docScore(doc, fieldName, payloadsSeen, payloadScore);
}
public Explanation explain(int doc) throws IOException {
Explanation result = new Explanation();
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
Explanation payloadBoost = new Explanation();
result.addDetail(payloadBoost);
float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
payloadBoost.setValue(avgPayloadScore);
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
result.setDescription("bnq, product of:");
return result;
Explanation result = new Explanation();
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
Explanation payloadBoost = new Explanation();
result.addDetail(payloadBoost);
float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
payloadBoost.setValue(avgPayloadScore);
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
result.setDescription("bnq, product of:");
return result;
}
}

View File

@ -39,106 +39,31 @@ import java.io.IOException;
*
* @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int)
*/
public class BoostingTermQuery extends SpanTermQuery{
public class BoostingTermQuery extends BoostingFunctionTermQuery implements PayloadQuery{
public BoostingTermQuery(Term term) {
super(term);
this(term, true);
}
public BoostingTermQuery(Term term, boolean includeSpanScore) {
super(term, new AveragePayloadFunction(), includeSpanScore);
}
public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
return new BoostingTermWeight(this, searcher);
}
protected class BoostingTermWeight extends SpanWeight {
protected class BoostingTermWeight extends BoostingFunctionTermWeight {
public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException {
super(query, searcher);
}
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
return new BoostingSpanScorer((TermSpans) query.getSpans(reader), this,
return new BoostingFunctionSpanScorer((TermSpans) query.getSpans(reader), this,
similarity, reader.norms(query.getField()));
}
protected class BoostingSpanScorer extends SpanScorer {
//TODO: is this the best way to allocate this?
byte[] payload = new byte[256];
private TermPositions positions;
protected float payloadScore;
private int payloadsSeen;
public BoostingSpanScorer(TermSpans spans, QueryWeight weight,
Similarity similarity, byte[] norms) throws IOException {
super(spans, weight, similarity, norms);
positions = spans.getPositions();
}
protected boolean setFreqCurrentDoc() throws IOException {
if (!more) {
return false;
}
doc = spans.doc();
freq = 0.0f;
payloadScore = 0;
payloadsSeen = 0;
Similarity similarity1 = getSimilarity();
while (more && doc == spans.doc()) {
int matchLength = spans.end() - spans.start();
freq += similarity1.sloppyFreq(matchLength);
processPayload(similarity1);
more = spans.next();//this moves positions to the next match in this document
}
return more || (freq != 0);
}
protected void processPayload(Similarity similarity) throws IOException {
if (positions.isPayloadAvailable()) {
payload = positions.getPayload(payload, 0);
payloadScore += similarity.scorePayload(term.field(), payload, 0, positions.getPayloadLength());
payloadsSeen++;
} else {
//zero out the payload?
}
}
public float score() throws IOException {
return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
}
public Explanation explain(final int doc) throws IOException {
ComplexExplanation result = new ComplexExplanation();
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
//QUESTION: Is there a way to avoid this skipTo call? We need to know whether to load the payload or not
Explanation payloadBoost = new Explanation();
result.addDetail(payloadBoost);
/*
if (skipTo(doc) == true) {
processPayload();
}
*/
float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
payloadBoost.setValue(avgPayloadScore);
//GSI: I suppose we could toString the payload, but I don't think that would be a good idea
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
result.setDescription("btq, product of:");
result.setMatch(nonPayloadExpl.getValue()==0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303
return result;
}
}
}

View File

@ -0,0 +1,34 @@
package org.apache.lucene.search.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Returns the maximum payload score seen, else 1 if there are no payloads on the doc.
 * <p/>
 * Is thread safe and completely reusable.
 *
 **/
public class MaxPayloadFunction extends PayloadFunction{

  /**
   * Running maximum of the payload scores seen so far.  The first payload is
   * taken as-is: the scorer seeds {@code currentScore} with 0, which would
   * otherwise clamp negative payload scores up to 0.
   */
  public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) {
    return numPayloadsSeen == 0 ? currentPayloadScore : Math.max(currentPayloadScore, currentScore);
  }

  /**
   * The accumulated maximum, or 1 (neutral) when no payloads were seen.
   */
  public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) {
    return numPayloadsSeen > 0 ? payloadScore : 1;
  }

  /**
   * All instances are stateless and interchangeable, so equality is class-based.
   * Required because query implementations (e.g. BoostingFunctionTermQuery)
   * compare their PayloadFunction instances in equals().
   */
  public boolean equals(Object obj) {
    return obj != null && getClass() == obj.getClass();
  }

  public int hashCode() {
    return getClass().hashCode();
  }
}

View File

@ -0,0 +1,18 @@
package org.apache.lucene.search.payloads;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Calculates the minimum payload score seen, else 1 if there are no payloads on the doc.
 *
 **/
public class MinPayloadFunction extends PayloadFunction {

  /**
   * Running minimum of the payload scores seen so far.  The first payload must
   * be taken as-is: the scorer seeds {@code currentScore} with 0, so
   * {@code Math.min(firstPayload, 0)} would wrongly return 0 for any positive
   * payload score.
   */
  public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) {
    return numPayloadsSeen == 0 ? currentPayloadScore : Math.min(currentPayloadScore, currentScore);
  }

  /**
   * The accumulated minimum, or 1 (neutral) when no payloads were seen.
   */
  public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) {
    return numPayloadsSeen > 0 ? payloadScore : 1;
  }

  /**
   * All instances are stateless and interchangeable, so equality is class-based.
   * Required because query implementations (e.g. BoostingFunctionTermQuery)
   * compare their PayloadFunction instances in equals().
   */
  public boolean equals(Object obj) {
    return obj != null && getClass() == obj.getClass();
  }

  public int hashCode() {
    return getClass().hashCode();
  }
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.search.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
/**
 * An abstract class that defines a way for Boosting*Query instances
 * to transform the cumulative effects of payload scores for a document.
 * <p/>
 * Implementations are presumably stateless ("thread safe and completely
 * reusable" per the concrete subclasses), so a single instance may be shared
 * across queries.  Because BoostingFunctionTermQuery compares its function in
 * equals(), concrete subclasses should also override equals()/hashCode() —
 * TODO confirm against each implementation.
 * <p/>
 * This class and its derivations are experimental and subject to change
 *
 * @see org.apache.lucene.search.payloads.BoostingFunctionTermQuery for more information
 *
 **/
public abstract class PayloadFunction implements Serializable {

  /**
   * Calculate the score up to this point for this doc and field
   * @param docId The current doc
   * @param field The current field
   * @param numPayloadsSeen The number of payloads seen so far
   * @param currentScore The current score so far
   * @param currentPayloadScore The score for the current payload
   * @return The new current score
   */
  public abstract float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore);

  /**
   * Calculate the final score for all the payloads seen so far for this doc/field
   * @param docId The current doc
   * @param field The current field
   * @param numPayloadsSeen The total number of payloads seen on this document
   * @param payloadScore The raw score for those payloads
   * @return The final score for the payloads
   */
  public abstract float docScore(int docId, String field, int numPayloadsSeen, float payloadScore);
}

View File

@ -0,0 +1,9 @@
package org.apache.lucene.search.payloads;
/**
 * Marker interface indicating that this Query is payload aware, i.e. its scoring
 * can take term payloads into account.
 *
 **/
public interface PayloadQuery {
}

View File

@ -0,0 +1,305 @@
package org.apache.lucene.search.payloads;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.English;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.Reader;
import java.io.IOException;
/**
*
*
**/
public class BoostingFunctionTermQueryTest extends LuceneTestCase {
private IndexSearcher searcher;
private BoostingSimilarity similarity = new BoostingSimilarity();
private byte[] payloadField = new byte[]{1};
private byte[] payloadMultiField1 = new byte[]{2};
private byte[] payloadMultiField2 = new byte[]{4};
protected RAMDirectory directory;
// Standard named-test constructor required by the LuceneTestCase/JUnit 3 style.
public BoostingFunctionTermQueryTest(String s) {
  super(s);
}
/**
 * Test analyzer: lower-cases the input and attaches a per-field payload to
 * every token via {@link PayloadFilter}.
 */
private class PayloadAnalyzer extends Analyzer {
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new PayloadFilter(new LowerCaseTokenizer(reader), fieldName);
  }
}
/**
 * Test filter that stamps a payload onto each token: a constant payload for
 * "field", and payloads alternating between two values for "multiField".
 * Other fields get no payload.
 */
private class PayloadFilter extends TokenFilter {
  String fieldName;
  int numSeen = 0;          // counts "multiField" tokens to alternate payloads
  PayloadAttribute payloadAtt;

  public PayloadFilter(TokenStream input, String fieldName) {
    super(input);
    this.fieldName = fieldName;
    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (fieldName.equals("field")) {
      payloadAtt.setPayload(new Payload(payloadField));
    } else if (fieldName.equals("multiField")) {
      // Even-numbered tokens get payloadMultiField1, odd-numbered get payloadMultiField2.
      byte[] bytes = (numSeen % 2 == 0) ? payloadMultiField1 : payloadMultiField2;
      payloadAtt.setPayload(new Payload(bytes));
      numSeen++;
    }
    return true;
  }
}
// Builds a 1000-doc RAMDirectory index where each doc has: a no-payload field,
// a "field" with one payload per token, and a "multiField" containing the text
// twice (so each term matches twice per doc, with alternating payloads).
// BoostingSimilarity is installed on both writer and searcher.
protected void setUp() throws Exception {
  super.setUp();
  directory = new RAMDirectory();
  PayloadAnalyzer analyzer = new PayloadAnalyzer();
  IndexWriter writer
          = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
  writer.setSimilarity(similarity);
  //writer.infoStream = System.out;
  for (int i = 0; i < 1000; i++) {
    Document doc = new Document();
    Field noPayloadField = new Field(PayloadHelper.NO_PAYLOAD_FIELD, English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED);
    //noPayloadField.setBoost(0);
    doc.add(noPayloadField);
    doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("multiField", English.intToEnglish(i) + " " + English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }
  // Merge to a single segment for deterministic doc ids, then open the searcher.
  writer.optimize();
  writer.close();
  searcher = new IndexSearcher(directory, true);
  searcher.setSimilarity(similarity);
}
// Single-occurrence case: "seventy" appears once per matching doc in "field",
// whose payloads are all {1}, so with all other similarity factors at 1 every
// hit must score exactly 1.
public void test() throws IOException {
  BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term("field", "seventy"),
          new MaxPayloadFunction());
  TopDocs hits = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);

  //they should all have the exact same score, because they all contain seventy once, and we set
  //all the other similarity factors to be 1

  assertTrue(hits.getMaxScore() + " does not equal: " + 1, hits.getMaxScore() == 1);
  for (int i = 0; i < hits.scoreDocs.length; i++) {
    ScoreDoc doc = hits.scoreDocs[i];
    assertTrue(doc.score + " does not equal: " + 1, doc.score == 1);
  }
  // Verify that explain() agrees with score() for every hit.
  CheckHits.checkExplanations(query, PayloadHelper.FIELD, searcher, true);
  Spans spans = query.getSpans(searcher.getIndexReader());
  assertTrue("spans is null and it shouldn't be", spans != null);
  assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
  /*float score = hits.score(0);
  for (int i =1; i < hits.length(); i++)
  {
    assertTrue("scores are not equal and they should be", score == hits.score(i));
  }*/
}
// Multi-occurrence case: "multiField" contains the text twice, with payloads
// alternating between {2} and {4}.  With MaxPayloadFunction, docs where the
// 2nd occurrence carries {4} score 4; the rest score 2.
public void testMultipleMatchesPerDoc() throws Exception {
  BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term(PayloadHelper.MULTI_FIELD, "seventy"),
          new MaxPayloadFunction());
  TopDocs hits = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);

  //they should all have the exact same score, because they all contain seventy once, and we set
  //all the other similarity factors to be 1

  //System.out.println("Hash: " + seventyHash + " Twice Hash: " + 2*seventyHash);
  assertTrue(hits.getMaxScore() + " does not equal: " + 4.0, hits.getMaxScore() == 4.0);
  //there should be exactly 10 items that score a 4, all the rest should score a 2
  //The 10 items are: 70 + i*100 where i in [0-9]
  int numTens = 0;
  for (int i = 0; i < hits.scoreDocs.length; i++) {
    ScoreDoc doc = hits.scoreDocs[i];
    if (doc.doc % 10 == 0) {
      numTens++;
      assertTrue(doc.score + " does not equal: " + 4.0, doc.score == 4.0);
    } else {
      assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
    }
  }
  assertTrue(numTens + " does not equal: " + 10, numTens == 10);
  // NOTE(review): the query targets PayloadHelper.MULTI_FIELD but explanations
  // are checked against "field" — confirm this is intentional.
  CheckHits.checkExplanations(query, "field", searcher, true);
  Spans spans = query.getSpans(searcher.getIndexReader());
  assertTrue("spans is null and it shouldn't be", spans != null);
  assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
  //should be two matches per document
  int count = 0;
  //100 hits times 2 matches per hit, we should have 200 in count
  while (spans.next()) {
    count++;
  }
  assertTrue(count + " does not equal: " + 200, count == 200);
}
/**
 * Set includeSpanScore to false, in which case just the payload score comes through.
 * Uses a dedicated searcher with {@link FullSimilarity} so the span score (which is
 * being ignored) is the only thing that would differ from the default setup.
 */
public void testIgnoreSpanScorer() throws Exception {
  BoostingFunctionTermQuery query =
      new BoostingFunctionTermQuery(new Term(PayloadHelper.MULTI_FIELD, "seventy"),
          new MaxPayloadFunction(), false);
  IndexSearcher theSearcher = new IndexSearcher(directory, true);
  theSearcher.setSimilarity(new FullSimilarity());
  // Bug fix: the original searched via the field 'searcher', leaving 'theSearcher'
  // (and its FullSimilarity) entirely unused.
  TopDocs hits = theSearcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
  // With the span score excluded, only the payload contributes; all docs contain
  // "seventy", so scores depend solely on the max payload per document.
  assertTrue(hits.getMaxScore() + " does not equal: " + 4.0, hits.getMaxScore() == 4.0);
  // Exactly 10 documents (70 + i*100 for i in [0-9]) should score 4; the rest score 2.
  int numTens = 0;
  for (int i = 0; i < hits.scoreDocs.length; i++) {
    ScoreDoc doc = hits.scoreDocs[i];
    if (doc.doc % 10 == 0) {
      numTens++;
      assertTrue(doc.score + " does not equal: " + 4.0, doc.score == 4.0);
    } else {
      assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
    }
  }
  assertTrue(numTens + " does not equal: " + 10, numTens == 10);
  CheckHits.checkExplanations(query, "field", theSearcher, true);
  Spans spans = query.getSpans(theSearcher.getIndexReader());
  assertTrue("spans is null and it shouldn't be", spans != null);
  assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
  // Two matches per document over 100 hits -> 200 span positions.
  int count = 0;
  while (spans.next()) {
    count++;
  }
  // Bug fix: the original counted the spans but never asserted the total.
  assertTrue(count + " does not equal: " + 200, count == 200);
}
/** A term absent from the index must produce zero hits. */
public void testNoMatch() throws Exception {
  BoostingFunctionTermQuery query =
      new BoostingFunctionTermQuery(new Term(PayloadHelper.FIELD, "junk"),
          new MaxPayloadFunction());
  TopDocs results = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", results != null);
  assertTrue("hits Size: " + results.totalHits + " is not: " + 0, results.totalHits == 0);
}
/**
 * Queries a field indexed without payloads: MUST "zero" and MUST_NOT "foo"
 * should still match exactly one document even though no payloads exist.
 */
public void testNoPayload() throws Exception {
  BoostingFunctionTermQuery q1 =
      new BoostingFunctionTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "zero"),
          new MaxPayloadFunction());
  BoostingFunctionTermQuery q2 =
      new BoostingFunctionTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "foo"),
          new MaxPayloadFunction());
  BooleanQuery query = new BooleanQuery();
  query.add(new BooleanClause(q1, BooleanClause.Occur.MUST));
  query.add(new BooleanClause(q2, BooleanClause.Occur.MUST_NOT));
  TopDocs hits = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("hits Size: " + hits.totalHits + " is not: " + 1, hits.totalHits == 1);
  // Doc 0 is the only document containing "zero" (and lacking "foo").
  int[] expectedDocs = {0};
  CheckHits.checkHitCollector(query, PayloadHelper.NO_PAYLOAD_FIELD, searcher, expectedDocs);
}
// must be static for weight serialization tests
static class BoostingSimilarity extends DefaultSimilarity {

  /**
   * Scores a payload as its first byte; the test payloads have a known fixed
   * size, so offset/length are deliberately ignored.
   */
  // TODO: Remove warning after API has been finalized
  public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) {
    return payload[0];
  }

  // Every other similarity factor is pinned to 1 (tf is 1 for any nonzero
  // frequency) so the payload is the only thing influencing the score.
  public float lengthNorm(String fieldName, int numTerms) {
    return 1;
  }

  public float queryNorm(float sumOfSquaredWeights) {
    return 1;
  }

  public float sloppyFreq(int distance) {
    return 1;
  }

  public float coord(int overlap, int maxOverlap) {
    return 1;
  }

  public float idf(int docFreq, int numDocs) {
    return 1;
  }

  public float tf(float freq) {
    return freq == 0 ? 0 : 1;
  }
}
/**
 * Similarity that only customizes payload scoring (first payload byte),
 * leaving every other factor at the DefaultSimilarity values.
 */
static class FullSimilarity extends DefaultSimilarity {
  // Test payloads are a known fixed size, so offset/length are ignored.
  public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) {
    return payload[0];
  }
}
}

View File

@ -184,7 +184,7 @@ public class TestBoostingNearQuery extends LuceneTestCase {
// must be static for weight serialization tests
static class BoostingSimilarity extends DefaultSimilarity {
public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) {
return payload[0];
}

View File

@ -210,7 +210,7 @@ public class TestBoostingTermQuery extends LuceneTestCase {
static class BoostingSimilarity extends DefaultSimilarity {
// TODO: Remove warning after API has been finalized
public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) {
//we know it is size 4 here, so ignore the offset/length
return payload[0];
}