mirror of https://github.com/apache/lucene.git
LUCENE-1790: small refactor of Payload queries, plus add in some new payload query functionality
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@802174 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
911df49bcb
commit
e079d1cec3
|
@ -666,6 +666,10 @@ New features
|
|||
34. LUCENE-1341: Added BoostingNearQuery to enable SpanNearQuery functionality
|
||||
with payloads (Peter Keegan, Grant Ingersoll)
|
||||
|
||||
35. LUCENE-1790: Added BoostingFunctionTermQuery to enable scoring of payloads
|
||||
based on the maximum payload seen for a document.
|
||||
Slight refactoring of Similarity and other payload queries (Grant Ingersoll)
|
||||
|
||||
Optimizations
|
||||
|
||||
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
|
||||
|
|
|
@ -290,6 +290,8 @@ public abstract class Similarity implements Serializable {
|
|||
/** The Similarity implementation used by default. */
|
||||
private static Similarity defaultImpl = new DefaultSimilarity();
|
||||
|
||||
public static final int NO_DOC_ID_PROVIDED = -1;
|
||||
|
||||
/** Set the default Similarity implementation used by indexing and search
|
||||
* code.
|
||||
*
|
||||
|
@ -529,6 +531,8 @@ public abstract class Similarity implements Serializable {
|
|||
public abstract float coord(int overlap, int maxOverlap);
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Calculate a scoring factor based on the data in the payload. Overriding implementations
|
||||
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
|
||||
|
@ -541,10 +545,34 @@ public abstract class Similarity implements Serializable {
|
|||
* @param offset The offset into the payload array
|
||||
* @param length The length in the array
|
||||
* @return An implementation dependent float to be used as a scoring factor
|
||||
*
|
||||
* @deprecated See {@link #scorePayload(int, String, byte[], int, int)}
|
||||
*/
|
||||
public float scorePayload(String fieldName, byte [] payload, int offset, int length)
{
  // Deprecated overload: delegate to the docId-aware version, passing
  // NO_DOC_ID_PROVIDED to signal that no document context is available.
  return scorePayload(NO_DOC_ID_PROVIDED, fieldName, payload, offset, length);
}
|
||||
|
||||
/**
 * Calculate a scoring factor based on the data in the payload. Overriding implementations
 * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
 * what is in the byte array.
 * <p>
 * The default implementation returns 1, i.e. payloads have no effect on scoring
 * unless a subclass overrides this method.
 *
 * @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information
 * @param fieldName The fieldName of the term this payload belongs to
 * @param payload The payload byte array to be scored
 * @param offset The offset into the payload array
 * @param length The length in the array
 * @return An implementation dependent float to be used as a scoring factor
 *
 */
public float scorePayload(int docId, String fieldName, byte [] payload, int offset, int length)
{
  // Neutral default: every payload contributes a factor of 1.
  return 1;
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Calculate the final score as the average score of all payloads seen.
|
||||
* <p/>
|
||||
* Is thread safe and completely reusable.
|
||||
*
|
||||
**/
|
||||
public class AveragePayloadFunction extends PayloadFunction{
|
||||
|
||||
public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) {
|
||||
return currentPayloadScore + currentScore;
|
||||
}
|
||||
|
||||
public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) {
|
||||
return numPayloadsSeen > 0 ? (payloadScore / numPayloadsSeen) : 1;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,180 @@
|
|||
package org.apache.lucene.search.payloads;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.QueryWeight;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.SpanScorer;

import java.io.IOException;


/**
 * A {@link SpanTermQuery} whose document score also reflects the payloads seen for the
 * term on that document.  How the per-payload scores are combined (max, min, average, ...)
 * is delegated to the supplied {@link PayloadFunction}; the raw per-payload value comes
 * from {@link Similarity#scorePayload(int, String, byte[], int, int)}.
 * <p/>
 * When <code>includeSpanScore</code> is true (the default) the final score is the span
 * score multiplied by the payload score; otherwise the payload score is used alone.
 *
 **/
public class BoostingFunctionTermQuery extends SpanTermQuery implements PayloadQuery{
  /** Combines the per-payload scores into a single per-document factor. */
  protected PayloadFunction function;
  /** If false, score() returns only the payload factor, ignoring the span score. */
  private boolean includeSpanScore;

  /** Equivalent to {@code BoostingFunctionTermQuery(term, function, true)}. */
  public BoostingFunctionTermQuery(Term term, PayloadFunction function) {
    this(term, function, true);
  }

  public BoostingFunctionTermQuery(Term term, PayloadFunction function, boolean includeSpanScore) {
    super(term);
    this.function = function;
    this.includeSpanScore = includeSpanScore;
  }



  public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
    return new BoostingFunctionTermWeight(this, searcher);
  }

  protected class BoostingFunctionTermWeight extends SpanWeight {

    public BoostingFunctionTermWeight(BoostingFunctionTermQuery query, Searcher searcher) throws IOException {
      super(query, searcher);
    }

    // scoreDocsInOrder/topScorer are not used; the span scorer always walks docs in order.
    public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
      return new BoostingFunctionSpanScorer((TermSpans) query.getSpans(reader), this,
              similarity, reader.norms(query.getField()));
    }

    protected class BoostingFunctionSpanScorer extends SpanScorer {
      //TODO: is this the best way to allocate this?
      protected byte[] payload = new byte[256];
      protected TermPositions positions;
      // Running payload score for the current doc, as accumulated by the PayloadFunction.
      protected float payloadScore;
      // Number of payloads folded into payloadScore for the current doc.
      protected int payloadsSeen;

      public BoostingFunctionSpanScorer(TermSpans spans, QueryWeight weight, Similarity similarity,
                                        byte[] norms) throws IOException {
        super(spans, weight, similarity, norms);
        positions = spans.getPositions();
      }

      /**
       * Advances over every match in the current document, accumulating both the
       * sloppy frequency and the payload score, then leaves the spans positioned
       * on the next document.
       */
      protected boolean setFreqCurrentDoc() throws IOException {
        if (!more) {
          return false;
        }
        doc = spans.doc();
        freq = 0.0f;
        payloadScore = 0;
        payloadsSeen = 0;
        Similarity similarity1 = getSimilarity();
        while (more && doc == spans.doc()) {
          int matchLength = spans.end() - spans.start();

          freq += similarity1.sloppyFreq(matchLength);
          processPayload(similarity1);

          more = spans.next();//this moves positions to the next match in this document
        }
        return more || (freq != 0);
      }


      /** Folds the payload at the current position (if any) into payloadScore. */
      protected void processPayload(Similarity similarity) throws IOException {
        if (positions.isPayloadAvailable()) {
          // NOTE: getPayload may return the passed-in array or a fresh one — keep the result.
          payload = positions.getPayload(payload, 0);
          payloadScore = function.currentScore(doc, term.field(), payloadsSeen, payloadScore,
                  similarity.scorePayload(doc, term.field(), payload, 0, positions.getPayloadLength()));
          payloadsSeen++;

        } else {
          //zero out the payload?
        }
      }

      /**
       *
       * @return {@link #getSpanScore()} * {@link #getPayloadScore()} when
       *         includeSpanScore is true, otherwise just {@link #getPayloadScore()}
       * @throws IOException
       */
      public float score() throws IOException {

        return includeSpanScore ? getSpanScore() * getPayloadScore() : getPayloadScore();
      }

      /**
       * Returns the SpanScorer score only.
       * <p/>
       * Should not be overridden without good cause!
       *
       * @return the score for just the Span part w/o the payload
       * @throws IOException
       *
       * @see #score()
       */
      protected float getSpanScore() throws IOException{
        return super.score();
      }

      /**
       * The score for the payload
       * @return The score, as calculated by {@link PayloadFunction#docScore(int, String, int, float)}
       */
      protected float getPayloadScore() {
        return function.docScore(doc, term.field(), payloadsSeen, payloadScore);
      }


      public Explanation explain(final int doc) throws IOException {
        ComplexExplanation result = new ComplexExplanation();
        Explanation nonPayloadExpl = super.explain(doc);
        result.addDetail(nonPayloadExpl);
        //QUESTION: Is there a way to avoid this skipTo call? We need to know whether to load the payload or not
        Explanation payloadBoost = new Explanation();
        result.addDetail(payloadBoost);


        float payloadScore = getPayloadScore();
        payloadBoost.setValue(payloadScore);
        //GSI: I suppose we could toString the payload, but I don't think that would be a good idea
        payloadBoost.setDescription("scorePayload(...)");
        result.setValue(nonPayloadExpl.getValue() * payloadScore);
        result.setDescription("btq, product of:");
        result.setMatch(nonPayloadExpl.getValue()==0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303
        return result;
      }

    }
  }

  public boolean equals(Object o) {
    if (!(o instanceof BoostingFunctionTermQuery))
      return false;
    BoostingFunctionTermQuery other = (BoostingFunctionTermQuery) o;
    // includeSpanScore changes how score() is computed, so it must participate in equality;
    // otherwise two queries that score differently would collide in query caches.
    return (this.getBoost() == other.getBoost())
            && this.term.equals(other.term) && this.function.equals(other.function)
            && this.includeSpanScore == other.includeSpanScore;
  }

  // equals() is overridden, so hashCode() must be too (Object contract) — hash
  // exactly the fields that equals() compares.
  public int hashCode() {
    int result = Float.floatToIntBits(getBoost());
    result = 31 * result + term.hashCode();
    result = 31 * result + function.hashCode();
    result = 31 * result + (includeSpanScore ? 1 : 0);
    return result;
  }
}
|
|
@ -23,39 +23,46 @@ import org.apache.lucene.search.Scorer;
|
|||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanWeight;
|
||||
import org.apache.lucene.search.spans.SpanScorer;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
import org.apache.lucene.search.spans.NearSpansOrdered;
|
||||
import org.apache.lucene.search.spans.NearSpansUnordered;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanScorer;
|
||||
import org.apache.lucene.search.spans.SpanWeight;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* The BoostingNearQuery is very similar to the {@link org.apache.lucene.search.spans.SpanNearQuery} except
|
||||
* that it factors in the value of the payloads located at each of the positions where the
|
||||
* {@link org.apache.lucene.search.spans.TermSpans} occurs.
|
||||
* <p>
|
||||
* <p/>
|
||||
* In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)}
|
||||
* which returns 1 by default.
|
||||
* <p>
|
||||
* <p/>
|
||||
* Payload scores are averaged across term occurrences in the document.
|
||||
*
|
||||
* @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int)
|
||||
*/
|
||||
|
||||
public class BoostingNearQuery extends SpanNearQuery {
|
||||
String fieldName;
|
||||
public class BoostingNearQuery extends SpanNearQuery implements PayloadQuery {
|
||||
protected String fieldName;
|
||||
protected PayloadFunction function;
|
||||
|
||||
public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) {
|
||||
this(clauses, slop, inOrder, new AveragePayloadFunction());
|
||||
}
|
||||
|
||||
public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder, PayloadFunction function) {
|
||||
super(clauses, slop, inOrder);
|
||||
fieldName = clauses[0].getField(); // all clauses must have same field
|
||||
this.function = function;
|
||||
}
|
||||
|
||||
|
||||
public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
|
||||
return new BoostingSpanWeight(this, searcher);
|
||||
}
|
||||
|
@ -70,6 +77,7 @@ public class BoostingNearQuery extends SpanNearQuery {
|
|||
similarity,
|
||||
reader.norms(query.getField()));
|
||||
}
|
||||
|
||||
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
|
||||
return new BoostingSpanScorer(query.getSpans(reader), this,
|
||||
similarity,
|
||||
|
@ -109,15 +117,18 @@ public class BoostingNearQuery extends SpanNearQuery {
|
|||
|
||||
/**
|
||||
* By default, sums the payloads, but can be overridden to do other things.
|
||||
*
|
||||
* @param payLoads The payloads
|
||||
*/
|
||||
protected void processPayloads(Collection payLoads) {
|
||||
for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) {
|
||||
byte[] thePayload = (byte[]) iterator.next();
|
||||
payloadScore = function.currentScore(doc, fieldName, payloadsSeen, payloadScore,
|
||||
similarity.scorePayload(doc, fieldName, thePayload, 0, thePayload.length));
|
||||
++payloadsSeen;
|
||||
payloadScore += similarity.scorePayload(fieldName, thePayload, 0, thePayload.length);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
protected boolean setFreqCurrentDoc() throws IOException {
|
||||
Spans[] spansArr = new Spans[1];
|
||||
|
@ -130,8 +141,9 @@ public class BoostingNearQuery extends SpanNearQuery {
|
|||
|
||||
public float score() throws IOException {
|
||||
|
||||
return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
|
||||
return super.score() * function.docScore(doc, fieldName, payloadsSeen, payloadScore);
|
||||
}
|
||||
|
||||
public Explanation explain(int doc) throws IOException {
|
||||
Explanation result = new Explanation();
|
||||
Explanation nonPayloadExpl = super.explain(doc);
|
||||
|
|
|
@ -39,106 +39,31 @@ import java.io.IOException;
|
|||
*
|
||||
* @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int)
|
||||
*/
|
||||
public class BoostingTermQuery extends SpanTermQuery{
|
||||
public class BoostingTermQuery extends BoostingFunctionTermQuery implements PayloadQuery{
|
||||
|
||||
public BoostingTermQuery(Term term) {
|
||||
super(term);
|
||||
this(term, true);
|
||||
}
|
||||
|
||||
public BoostingTermQuery(Term term, boolean includeSpanScore) {
|
||||
super(term, new AveragePayloadFunction(), includeSpanScore);
|
||||
}
|
||||
|
||||
public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
|
||||
return new BoostingTermWeight(this, searcher);
|
||||
}
|
||||
|
||||
protected class BoostingTermWeight extends SpanWeight {
|
||||
protected class BoostingTermWeight extends BoostingFunctionTermWeight {
|
||||
|
||||
public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException {
|
||||
super(query, searcher);
|
||||
}
|
||||
|
||||
public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
|
||||
return new BoostingSpanScorer((TermSpans) query.getSpans(reader), this,
|
||||
return new BoostingFunctionSpanScorer((TermSpans) query.getSpans(reader), this,
|
||||
similarity, reader.norms(query.getField()));
|
||||
}
|
||||
|
||||
protected class BoostingSpanScorer extends SpanScorer {
|
||||
|
||||
//TODO: is this the best way to allocate this?
|
||||
byte[] payload = new byte[256];
|
||||
private TermPositions positions;
|
||||
protected float payloadScore;
|
||||
private int payloadsSeen;
|
||||
|
||||
public BoostingSpanScorer(TermSpans spans, QueryWeight weight,
|
||||
Similarity similarity, byte[] norms) throws IOException {
|
||||
super(spans, weight, similarity, norms);
|
||||
positions = spans.getPositions();
|
||||
|
||||
}
|
||||
|
||||
protected boolean setFreqCurrentDoc() throws IOException {
|
||||
if (!more) {
|
||||
return false;
|
||||
}
|
||||
doc = spans.doc();
|
||||
freq = 0.0f;
|
||||
payloadScore = 0;
|
||||
payloadsSeen = 0;
|
||||
Similarity similarity1 = getSimilarity();
|
||||
while (more && doc == spans.doc()) {
|
||||
int matchLength = spans.end() - spans.start();
|
||||
|
||||
freq += similarity1.sloppyFreq(matchLength);
|
||||
processPayload(similarity1);
|
||||
|
||||
more = spans.next();//this moves positions to the next match in this document
|
||||
}
|
||||
return more || (freq != 0);
|
||||
}
|
||||
|
||||
|
||||
protected void processPayload(Similarity similarity) throws IOException {
|
||||
if (positions.isPayloadAvailable()) {
|
||||
payload = positions.getPayload(payload, 0);
|
||||
payloadScore += similarity.scorePayload(term.field(), payload, 0, positions.getPayloadLength());
|
||||
payloadsSeen++;
|
||||
|
||||
} else {
|
||||
//zero out the payload?
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public float score() throws IOException {
|
||||
|
||||
return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
|
||||
}
|
||||
|
||||
|
||||
public Explanation explain(final int doc) throws IOException {
|
||||
ComplexExplanation result = new ComplexExplanation();
|
||||
Explanation nonPayloadExpl = super.explain(doc);
|
||||
result.addDetail(nonPayloadExpl);
|
||||
//QUESTION: Is there a way to avoid this skipTo call? We need to know whether to load the payload or not
|
||||
|
||||
Explanation payloadBoost = new Explanation();
|
||||
result.addDetail(payloadBoost);
|
||||
/*
|
||||
if (skipTo(doc) == true) {
|
||||
processPayload();
|
||||
}
|
||||
*/
|
||||
|
||||
float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
|
||||
payloadBoost.setValue(avgPayloadScore);
|
||||
//GSI: I suppose we could toString the payload, but I don't think that would be a good idea
|
||||
payloadBoost.setDescription("scorePayload(...)");
|
||||
result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
|
||||
result.setDescription("btq, product of:");
|
||||
result.setMatch(nonPayloadExpl.getValue()==0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maximum payload score seen, else 1 if there are no payloads on the doc.
|
||||
* <p/>
|
||||
* Is thread safe and completely reusable.
|
||||
*
|
||||
**/
|
||||
public class MaxPayloadFunction extends PayloadFunction{
|
||||
public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) {
|
||||
return Math.max(currentPayloadScore, currentScore);
|
||||
}
|
||||
|
||||
public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) {
|
||||
return numPayloadsSeen > 0 ? payloadScore : 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
|
||||
|
||||
/**
|
||||
* Calculates the miniumum payload seen
|
||||
*
|
||||
**/
|
||||
public class MinPayloadFunction extends PayloadFunction {
|
||||
|
||||
public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) {
|
||||
return Math.min(currentPayloadScore, currentScore);
|
||||
}
|
||||
|
||||
public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) {
|
||||
return numPayloadsSeen > 0 ? payloadScore : 1;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.search.payloads;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;


/**
 * An abstract class that defines a way for Boosting*Query instances
 * to transform the cumulative effects of payload scores for a document.
 * <p/>
 * Scorers call {@link #currentScore} once per payload seen on a document to
 * build up a running value, then call {@link #docScore} once at the end to
 * produce the document's payload factor.
 * <p/>
 * NOTE(review): implementations are expected to keep no per-document state
 * (the same instance is reused across docs) and should implement
 * equals()/hashCode(), since queries may compare their functions — confirm.
 *
 * @see org.apache.lucene.search.payloads.BoostingFunctionTermQuery for more information
 *
 * <p/>
 * This class and its derivations are experimental and subject to change
 *
 **/
public abstract class PayloadFunction implements Serializable {




  /**
   * Calculate the score up to this point for this doc and field
   * @param docId The current doc
   * @param field The current field
   * @param numPayloadsSeen The number of payloads seen so far
   * @param currentScore The current score so far
   * @param currentPayloadScore The score for the current payload
   * @return The new current score
   */
  public abstract float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore);

  /**
   * Calculate the final score for all the payloads seen so far for this doc/field
   * @param docId The current doc
   * @param field The current field
   * @param numPayloadsSeen The total number of payloads seen on this document
   * @param payloadScore The raw score for those payloads
   * @return The final score for the payloads
   */
  public abstract float docScore(int docId, String field, int numPayloadsSeen, float payloadScore);

}
|
|
@ -0,0 +1,9 @@
|
|||
package org.apache.lucene.search.payloads;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


/**
 * Marker interface indicating this Query is Payload aware.
 * Declares no methods; implementors combine payload scores into the
 * document score via a {@link PayloadFunction}.
 *
 **/
public interface PayloadQuery {
}
|
|
@ -0,0 +1,305 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.English;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.CheckHits;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
import org.apache.lucene.search.spans.TermSpans;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
**/
|
||||
public class BoostingFunctionTermQueryTest extends LuceneTestCase {
|
||||
private IndexSearcher searcher;
|
||||
private BoostingSimilarity similarity = new BoostingSimilarity();
|
||||
private byte[] payloadField = new byte[]{1};
|
||||
private byte[] payloadMultiField1 = new byte[]{2};
|
||||
private byte[] payloadMultiField2 = new byte[]{4};
|
||||
protected RAMDirectory directory;
|
||||
|
||||
public BoostingFunctionTermQueryTest(String s) {
|
||||
super(s);
|
||||
}
|
||||
|
||||
private class PayloadAnalyzer extends Analyzer {
|
||||
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new LowerCaseTokenizer(reader);
|
||||
result = new PayloadFilter(result, fieldName);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
private class PayloadFilter extends TokenFilter {
  // Name of the field being tokenized; selects which payload (if any) to attach.
  String fieldName;
  // Count of tokens seen on "multiField"; parity alternates the payload bytes.
  int numSeen = 0;

  PayloadAttribute payloadAtt;

  public PayloadFilter(TokenStream input, String fieldName) {
    super(input);
    this.fieldName = fieldName;
    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
  }

  /**
   * Attaches payloadField to every token of "field"; on "multiField" alternates
   * payloadMultiField1 / payloadMultiField2 per token. Tokens of any other field
   * pass through without a payload.
   */
  public boolean incrementToken() throws IOException {
    boolean hasNext = input.incrementToken();
    if (hasNext) {
      if (fieldName.equals("field")) {
        payloadAtt.setPayload(new Payload(payloadField));
      } else if (fieldName.equals("multiField")) {
        if (numSeen % 2 == 0) {
          payloadAtt.setPayload(new Payload(payloadMultiField1));
        } else {
          payloadAtt.setPayload(new Payload(payloadMultiField2));
        }
        numSeen++;
      }
      return true;
    } else {
      return false;
    }
  }
}
|
||||
|
||||
/**
 * Builds a 1000-doc RAM index where each doc holds its number spelled out in
 * three fields: one without payloads, "field" (one payload per token), and
 * "multiField" (the text twice, with alternating payloads), then opens a
 * searcher using the test Similarity.
 */
protected void setUp() throws Exception {
  super.setUp();
  directory = new RAMDirectory();
  PayloadAnalyzer analyzer = new PayloadAnalyzer();
  IndexWriter writer
          = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
  writer.setSimilarity(similarity);
  //writer.infoStream = System.out;
  for (int i = 0; i < 1000; i++) {
    Document doc = new Document();
    Field noPayloadField = new Field(PayloadHelper.NO_PAYLOAD_FIELD, English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED);
    //noPayloadField.setBoost(0);
    doc.add(noPayloadField);
    doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("multiField", English.intToEnglish(i) + " " + English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }
  // Merge to one segment so doc ids are stable for the assertions below.
  writer.optimize();
  writer.close();

  searcher = new IndexSearcher(directory, true);
  searcher.setSimilarity(similarity);
}
|
||||
|
||||
/**
 * Single-match case: every doc containing "seventy" in "field" has exactly one
 * payload of value 1, so with the neutralized Similarity every hit scores 1.
 */
public void test() throws IOException {
  BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term("field", "seventy"),
          new MaxPayloadFunction());
  TopDocs hits = searcher.search(query, null, 100);
  assertTrue("hits is null and it shouldn't be", hits != null);
  assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);

  //they should all have the exact same score, because they all contain seventy once, and we set
  //all the other similarity factors to be 1

  assertTrue(hits.getMaxScore() + " does not equal: " + 1, hits.getMaxScore() == 1);
  for (int i = 0; i < hits.scoreDocs.length; i++) {
    ScoreDoc doc = hits.scoreDocs[i];
    assertTrue(doc.score + " does not equal: " + 1, doc.score == 1);
  }
  CheckHits.checkExplanations(query, PayloadHelper.FIELD, searcher, true);
  // Sanity-check the spans side of the query as well.
  Spans spans = query.getSpans(searcher.getIndexReader());
  assertTrue("spans is null and it shouldn't be", spans != null);
  assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
  /*float score = hits.score(0);
  for (int i =1; i < hits.length(); i++)
  {
    assertTrue("scores are not equal and they should be", score == hits.score(i));
  }*/

}
|
||||
|
||||
public void testMultipleMatchesPerDoc() throws Exception {
|
||||
BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term(PayloadHelper.MULTI_FIELD, "seventy"),
|
||||
new MaxPayloadFunction());
|
||||
TopDocs hits = searcher.search(query, null, 100);
|
||||
assertTrue("hits is null and it shouldn't be", hits != null);
|
||||
assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
|
||||
|
||||
//they should all have the exact same score, because they all contain seventy once, and we set
|
||||
//all the other similarity factors to be 1
|
||||
|
||||
//System.out.println("Hash: " + seventyHash + " Twice Hash: " + 2*seventyHash);
|
||||
assertTrue(hits.getMaxScore() + " does not equal: " + 4.0, hits.getMaxScore() == 4.0);
|
||||
//there should be exactly 10 items that score a 4, all the rest should score a 2
|
||||
//The 10 items are: 70 + i*100 where i in [0-9]
|
||||
int numTens = 0;
|
||||
for (int i = 0; i < hits.scoreDocs.length; i++) {
|
||||
ScoreDoc doc = hits.scoreDocs[i];
|
||||
if (doc.doc % 10 == 0) {
|
||||
numTens++;
|
||||
assertTrue(doc.score + " does not equal: " + 4.0, doc.score == 4.0);
|
||||
} else {
|
||||
assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
|
||||
}
|
||||
}
|
||||
assertTrue(numTens + " does not equal: " + 10, numTens == 10);
|
||||
CheckHits.checkExplanations(query, "field", searcher, true);
|
||||
Spans spans = query.getSpans(searcher.getIndexReader());
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
|
||||
//should be two matches per document
|
||||
int count = 0;
|
||||
//100 hits times 2 matches per hit, we should have 200 in count
|
||||
while (spans.next()) {
|
||||
count++;
|
||||
}
|
||||
assertTrue(count + " does not equal: " + 200, count == 200);
|
||||
}
|
||||
|
||||
//Set includeSpanScore to false, in which case just the payload score comes through.
|
||||
public void testIgnoreSpanScorer() throws Exception {
|
||||
BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term(PayloadHelper.MULTI_FIELD, "seventy"),
|
||||
new MaxPayloadFunction(), false);
|
||||
|
||||
IndexSearcher theSearcher = new IndexSearcher(directory, true);
|
||||
theSearcher.setSimilarity(new FullSimilarity());
|
||||
TopDocs hits = searcher.search(query, null, 100);
|
||||
assertTrue("hits is null and it shouldn't be", hits != null);
|
||||
assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
|
||||
|
||||
//they should all have the exact same score, because they all contain seventy once, and we set
|
||||
//all the other similarity factors to be 1
|
||||
|
||||
//System.out.println("Hash: " + seventyHash + " Twice Hash: " + 2*seventyHash);
|
||||
assertTrue(hits.getMaxScore() + " does not equal: " + 4.0, hits.getMaxScore() == 4.0);
|
||||
//there should be exactly 10 items that score a 4, all the rest should score a 2
|
||||
//The 10 items are: 70 + i*100 where i in [0-9]
|
||||
int numTens = 0;
|
||||
for (int i = 0; i < hits.scoreDocs.length; i++) {
|
||||
ScoreDoc doc = hits.scoreDocs[i];
|
||||
if (doc.doc % 10 == 0) {
|
||||
numTens++;
|
||||
assertTrue(doc.score + " does not equal: " + 4.0, doc.score == 4.0);
|
||||
} else {
|
||||
assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
|
||||
}
|
||||
}
|
||||
assertTrue(numTens + " does not equal: " + 10, numTens == 10);
|
||||
CheckHits.checkExplanations(query, "field", searcher, true);
|
||||
Spans spans = query.getSpans(searcher.getIndexReader());
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
|
||||
//should be two matches per document
|
||||
int count = 0;
|
||||
//100 hits times 2 matches per hit, we should have 200 in count
|
||||
while (spans.next()) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
public void testNoMatch() throws Exception {
|
||||
BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term(PayloadHelper.FIELD, "junk"),
|
||||
new MaxPayloadFunction());
|
||||
TopDocs hits = searcher.search(query, null, 100);
|
||||
assertTrue("hits is null and it shouldn't be", hits != null);
|
||||
assertTrue("hits Size: " + hits.totalHits + " is not: " + 0, hits.totalHits == 0);
|
||||
|
||||
}
|
||||
|
||||
public void testNoPayload() throws Exception {
|
||||
BoostingFunctionTermQuery q1 = new BoostingFunctionTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "zero"),
|
||||
new MaxPayloadFunction());
|
||||
BoostingFunctionTermQuery q2 = new BoostingFunctionTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "foo"),
|
||||
new MaxPayloadFunction());
|
||||
BooleanClause c1 = new BooleanClause(q1, BooleanClause.Occur.MUST);
|
||||
BooleanClause c2 = new BooleanClause(q2, BooleanClause.Occur.MUST_NOT);
|
||||
BooleanQuery query = new BooleanQuery();
|
||||
query.add(c1);
|
||||
query.add(c2);
|
||||
TopDocs hits = searcher.search(query, null, 100);
|
||||
assertTrue("hits is null and it shouldn't be", hits != null);
|
||||
assertTrue("hits Size: " + hits.totalHits + " is not: " + 1, hits.totalHits == 1);
|
||||
int[] results = new int[1];
|
||||
results[0] = 0;//hits.scoreDocs[0].doc;
|
||||
CheckHits.checkHitCollector(query, PayloadHelper.NO_PAYLOAD_FIELD, searcher, results);
|
||||
}
|
||||
|
||||
// must be static for weight serialization tests
|
||||
static class BoostingSimilarity extends DefaultSimilarity {
|
||||
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) {
|
||||
//we know it is size 4 here, so ignore the offset/length
|
||||
return payload[0];
|
||||
}
|
||||
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
//Make everything else 1 so we see the effect of the payload
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
public float lengthNorm(String fieldName, int numTerms) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float queryNorm(float sumOfSquaredWeights) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float sloppyFreq(int distance) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float idf(int docFreq, int numDocs) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float tf(float freq) {
|
||||
return freq == 0 ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
static class FullSimilarity extends DefaultSimilarity{
|
||||
public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) {
|
||||
//we know it is size 4 here, so ignore the offset/length
|
||||
return payload[0];
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -184,7 +184,7 @@ public class TestBoostingNearQuery extends LuceneTestCase {
|
|||
// must be static for weight serialization tests
|
||||
static class BoostingSimilarity extends DefaultSimilarity {
|
||||
|
||||
public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
|
||||
public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) {
|
||||
return payload[0];
|
||||
}
|
||||
|
||||
|
|
|
@ -210,7 +210,7 @@ public class TestBoostingTermQuery extends LuceneTestCase {
|
|||
static class BoostingSimilarity extends DefaultSimilarity {
|
||||
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
|
||||
public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) {
|
||||
//we know it is size 4 here, so ignore the offset/length
|
||||
return payload[0];
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue