From e079d1cec34ffd1dbf2b04e1ab3a9c5820b9452b Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Fri, 7 Aug 2009 20:34:58 +0000 Subject: [PATCH] LUCENE-1790: small refactor of Payload queries, plus add in some new payload query functionality git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@802174 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 6 +- .../org/apache/lucene/search/Similarity.java | 30 +- .../payloads/AveragePayloadFunction.java | 36 +++ .../payloads/BoostingFunctionTermQuery.java | 180 +++++++++++ .../search/payloads/BoostingNearQuery.java | 132 ++++---- .../search/payloads/BoostingTermQuery.java | 91 +----- .../search/payloads/MaxPayloadFunction.java | 34 ++ .../search/payloads/MinPayloadFunction.java | 18 ++ .../search/payloads/PayloadFunction.java | 58 ++++ .../lucene/search/payloads/PayloadQuery.java | 9 + .../BoostingFunctionTermQueryTest.java | 305 ++++++++++++++++++ .../payloads/TestBoostingNearQuery.java | 2 +- .../payloads/TestBoostingTermQuery.java | 2 +- 13 files changed, 756 insertions(+), 147 deletions(-) create mode 100644 src/java/org/apache/lucene/search/payloads/AveragePayloadFunction.java create mode 100644 src/java/org/apache/lucene/search/payloads/BoostingFunctionTermQuery.java create mode 100644 src/java/org/apache/lucene/search/payloads/MaxPayloadFunction.java create mode 100644 src/java/org/apache/lucene/search/payloads/MinPayloadFunction.java create mode 100644 src/java/org/apache/lucene/search/payloads/PayloadFunction.java create mode 100644 src/java/org/apache/lucene/search/payloads/PayloadQuery.java create mode 100644 src/test/org/apache/lucene/search/payloads/BoostingFunctionTermQueryTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 1781fc80510..33530fd668e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -664,7 +664,11 @@ New features disable loading them with a new constructor switch. (Mark Miller) 34. 
LUCENE-1341: Added BoostingNearQuery to enable SpanNearQuery functionality - with payloads (Peter Keegan, Grant Ingersoll) + with payloads (Peter Keegan, Grant Ingersoll) + +35. LUCENE-1790: Added BoostingFunctionTermQuery to enable scoring of payloads + based on the maximum payload seen for a document. + Slight refactoring of Similarity and other payload queries (Grant Ingersoll) Optimizations diff --git a/src/java/org/apache/lucene/search/Similarity.java b/src/java/org/apache/lucene/search/Similarity.java index fd85015b59c..f7124d4cfe4 100644 --- a/src/java/org/apache/lucene/search/Similarity.java +++ b/src/java/org/apache/lucene/search/Similarity.java @@ -290,6 +290,8 @@ public abstract class Similarity implements Serializable { /** The Similarity implementation used by default. */ private static Similarity defaultImpl = new DefaultSimilarity(); + public static final int NO_DOC_ID_PROVIDED = -1; + /** Set the default Similarity implementation used by indexing and search * code. * @@ -529,6 +531,8 @@ public abstract class Similarity implements Serializable { public abstract float coord(int overlap, int maxOverlap); + + /** * Calculate a scoring factor based on the data in the payload. Overriding implementations * are responsible for interpreting what is in the payload. 
Lucene makes no assumptions about @@ -540,11 +544,35 @@ public abstract class Similarity implements Serializable { * @param payload The payload byte array to be scored * @param offset The offset into the payload array * @param length The length in the array - * @return An implementation dependent float to be used as a scoring factor + * @return An implementation dependent float to be used as a scoring factor + * + * @deprecated See {@link #scorePayload(int, String, byte[], int, int)} */ public float scorePayload(String fieldName, byte [] payload, int offset, int length) + { + //Delegate to the docId-aware variant; no doc id is known here + return scorePayload(NO_DOC_ID_PROVIDED, fieldName, payload, offset, length); + } + + /** + * Calculate a scoring factor based on the data in the payload. Overriding implementations + * are responsible for interpreting what is in the payload. Lucene makes no assumptions about + * what is in the byte array. + *

+ * The default implementation returns 1. + * + * @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information + * @param fieldName The fieldName of the term this payload belongs to + * @param payload The payload byte array to be scored + * @param offset The offset into the payload array + * @param length The length in the array + * @return An implementation dependent float to be used as a scoring factor + * + */ + public float scorePayload(int docId, String fieldName, byte [] payload, int offset, int length) { //Do nothing return 1; } + } diff --git a/src/java/org/apache/lucene/search/payloads/AveragePayloadFunction.java b/src/java/org/apache/lucene/search/payloads/AveragePayloadFunction.java new file mode 100644 index 00000000000..0dcc4387388 --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/AveragePayloadFunction.java @@ -0,0 +1,36 @@ +package org.apache.lucene.search.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Calculate the final score as the average score of all payloads seen. + *

+ * Is thread safe and completely reusable. + * + **/ +public class AveragePayloadFunction extends PayloadFunction{ + + public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) { + return currentPayloadScore + currentScore; + } + + public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) { + return numPayloadsSeen > 0 ? (payloadScore / numPayloadsSeen) : 1; + } + +} diff --git a/src/java/org/apache/lucene/search/payloads/BoostingFunctionTermQuery.java b/src/java/org/apache/lucene/search/payloads/BoostingFunctionTermQuery.java new file mode 100644 index 00000000000..3b1a1fa6701 --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/BoostingFunctionTermQuery.java @@ -0,0 +1,180 @@ +package org.apache.lucene.search.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.QueryWeight; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.ComplexExplanation; +import org.apache.lucene.search.spans.TermSpans; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.SpanWeight; +import org.apache.lucene.search.spans.SpanScorer; + +import java.io.IOException; + + +/** + * The score returned is based on the payload scores seen for the Term on the document, as combined by the supplied + * {@link PayloadFunction} (for example the maximum), as opposed to always using the average as implemented by {@link org.apache.lucene.search.payloads.BoostingTermQuery}. + * + **/ +public class BoostingFunctionTermQuery extends SpanTermQuery implements PayloadQuery{ + protected PayloadFunction function; + private boolean includeSpanScore; + + public BoostingFunctionTermQuery(Term term, PayloadFunction function) { + this(term, function, true); + } + + public BoostingFunctionTermQuery(Term term, PayloadFunction function, boolean includeSpanScore) { + super(term); + this.function = function; + this.includeSpanScore = includeSpanScore; + } + + + + public QueryWeight createQueryWeight(Searcher searcher) throws IOException { + return new BoostingFunctionTermWeight(this, searcher); + } + + protected class BoostingFunctionTermWeight extends SpanWeight { + + public BoostingFunctionTermWeight(BoostingFunctionTermQuery query, Searcher searcher) throws IOException { + super(query, searcher); + } + + public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { + return new BoostingFunctionSpanScorer((TermSpans) query.getSpans(reader), this, + similarity, reader.norms(query.getField())); + } + + protected class BoostingFunctionSpanScorer extends 
SpanScorer { + //TODO: is this the best way to allocate this? + protected byte[] payload = new byte[256]; + protected TermPositions positions; + protected float payloadScore; + protected int payloadsSeen; + + public BoostingFunctionSpanScorer(TermSpans spans, QueryWeight weight, Similarity similarity, + byte[] norms) throws IOException { + super(spans, weight, similarity, norms); + positions = spans.getPositions(); + } + + protected boolean setFreqCurrentDoc() throws IOException { + if (!more) { + return false; + } + doc = spans.doc(); + freq = 0.0f; + payloadScore = 0; + payloadsSeen = 0; + Similarity similarity1 = getSimilarity(); + while (more && doc == spans.doc()) { + int matchLength = spans.end() - spans.start(); + + freq += similarity1.sloppyFreq(matchLength); + processPayload(similarity1); + + more = spans.next();//this moves positions to the next match in this document + } + return more || (freq != 0); + } + + + protected void processPayload(Similarity similarity) throws IOException { + if (positions.isPayloadAvailable()) { + payload = positions.getPayload(payload, 0); + payloadScore = function.currentScore(doc, term.field(), payloadsSeen, payloadScore, + similarity.scorePayload(doc, term.field(), payload, 0, positions.getPayloadLength())); + payloadsSeen++; + + } else { + //zero out the payload? + } + } + + /** + * + * @return {@link #getSpanScore()} * {@link #getPayloadScore()} + * @throws IOException + */ + public float score() throws IOException { + + return includeSpanScore ? getSpanScore() * getPayloadScore() : getPayloadScore(); + } + + /** + * Returns the SpanScorer score only. + *

+ * Should not be overridden without good cause! + * + * @return the score for just the Span part w/o the payload + * @throws IOException + * + * @see #score() + */ + protected float getSpanScore() throws IOException{ + return super.score(); + } + + /** + * The score for the payload + * @return The score, as calculated by {@link PayloadFunction#docScore(int, String, int, float)} + */ + protected float getPayloadScore() { + return function.docScore(doc, term.field(), payloadsSeen, payloadScore); + } + + + public Explanation explain(final int doc) throws IOException { + ComplexExplanation result = new ComplexExplanation(); + Explanation nonPayloadExpl = super.explain(doc); + result.addDetail(nonPayloadExpl); + //QUESTION: Is there a way to avoid this skipTo call? We need to know whether to load the payload or not + Explanation payloadBoost = new Explanation(); + result.addDetail(payloadBoost); + + + float payloadScore = getPayloadScore(); + payloadBoost.setValue(payloadScore); + //GSI: I suppose we could toString the payload, but I don't think that would be a good idea + payloadBoost.setDescription("scorePayload(...)"); + result.setValue(nonPayloadExpl.getValue() * payloadScore); + result.setDescription("btq, product of:"); + result.setMatch(nonPayloadExpl.getValue()==0 ? 
Boolean.FALSE : Boolean.TRUE); // LUCENE-1303 + return result; + } + + } + } + + public boolean equals(Object o) { + if (!(o instanceof BoostingFunctionTermQuery)) + return false; + BoostingFunctionTermQuery other = (BoostingFunctionTermQuery) o; + return (this.getBoost() == other.getBoost()) + && this.term.equals(other.term) && this.function.equals(other.function); + } +} diff --git a/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java b/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java index 2ab70d803b7..db8ababf88a 100644 --- a/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java +++ b/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java @@ -23,39 +23,46 @@ import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Weight; -import org.apache.lucene.search.spans.SpanNearQuery; -import org.apache.lucene.search.spans.SpanQuery; -import org.apache.lucene.search.spans.SpanWeight; -import org.apache.lucene.search.spans.SpanScorer; -import org.apache.lucene.search.spans.Spans; import org.apache.lucene.search.spans.NearSpansOrdered; import org.apache.lucene.search.spans.NearSpansUnordered; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanScorer; +import org.apache.lucene.search.spans.SpanWeight; +import org.apache.lucene.search.spans.Spans; import java.io.IOException; -import java.util.Iterator; import java.util.Collection; +import java.util.Iterator; /** * The BoostingNearQuery is very similar to the {@link org.apache.lucene.search.spans.SpanNearQuery} except * that it factors in the value of the payloads located at each of the positions where the * {@link org.apache.lucene.search.spans.TermSpans} occurs. - *

+ *

* In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)} * which returns 1 by default. - *

- * Payload scores are averaged across term occurrences in the document. - * + *

+ * Payload scores are averaged across term occurrences in the document. + * * @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int) */ -public class BoostingNearQuery extends SpanNearQuery { - String fieldName; +public class BoostingNearQuery extends SpanNearQuery implements PayloadQuery { + protected String fieldName; + protected PayloadFunction function; public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) { - super(clauses, slop, inOrder); - fieldName = clauses[0].getField(); // all clauses must have same field + this(clauses, slop, inOrder, new AveragePayloadFunction()); } + public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder, PayloadFunction function) { + super(clauses, slop, inOrder); + fieldName = clauses[0].getField(); // all clauses must have same field + this.function = function; + } + + public QueryWeight createQueryWeight(Searcher searcher) throws IOException { return new BoostingSpanWeight(this, searcher); } @@ -70,18 +77,19 @@ public class BoostingNearQuery extends SpanNearQuery { similarity, reader.norms(query.getField())); } + public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { - return new BoostingSpanScorer(query.getSpans(reader), this, - similarity, - reader.norms(query.getField())); + return new BoostingSpanScorer(query.getSpans(reader), this, + similarity, + reader.norms(query.getField())); } } public class BoostingSpanScorer extends SpanScorer { - Spans spans; + Spans spans; Spans[] subSpans = null; protected float payloadScore; - private int payloadsSeen; + private int payloadsSeen; Similarity similarity = getSimilarity(); protected BoostingSpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) @@ -92,58 +100,62 @@ public class BoostingNearQuery extends SpanNearQuery { // Get the payloads associated with all underlying subspans public void getPayloads(Spans[] subSpans) throws IOException { - for (int i 
= 0; i < subSpans.length; i++) { - if (subSpans[i] instanceof NearSpansOrdered) { - if (((NearSpansOrdered)subSpans[i]).isPayloadAvailable()) { - processPayloads(((NearSpansOrdered)subSpans[i]).getPayload()); - } - getPayloads(((NearSpansOrdered) subSpans[i]).getSubSpans()); - } else if (subSpans[i] instanceof NearSpansUnordered) { - if (((NearSpansUnordered)subSpans[i]).isPayloadAvailable()) { - processPayloads(((NearSpansUnordered)subSpans[i]).getPayload()); - } - getPayloads(((NearSpansUnordered) subSpans[i]).getSubSpans()); - } - } + for (int i = 0; i < subSpans.length; i++) { + if (subSpans[i] instanceof NearSpansOrdered) { + if (((NearSpansOrdered) subSpans[i]).isPayloadAvailable()) { + processPayloads(((NearSpansOrdered) subSpans[i]).getPayload()); + } + getPayloads(((NearSpansOrdered) subSpans[i]).getSubSpans()); + } else if (subSpans[i] instanceof NearSpansUnordered) { + if (((NearSpansUnordered) subSpans[i]).isPayloadAvailable()) { + processPayloads(((NearSpansUnordered) subSpans[i]).getPayload()); + } + getPayloads(((NearSpansUnordered) subSpans[i]).getSubSpans()); + } + } } /** * By default, sums the payloads, but can be overridden to do other things. 
+ * * @param payLoads The payloads */ - protected void processPayloads(Collection payLoads) { - for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) { - byte[] thePayload = (byte[]) iterator.next(); - ++payloadsSeen; - payloadScore += similarity.scorePayload(fieldName, thePayload, 0, thePayload.length); - } - } -// - protected boolean setFreqCurrentDoc() throws IOException { - Spans[] spansArr = new Spans[1]; - spansArr[0] = spans; - payloadScore = 0; - payloadsSeen = 0; - getPayloads(spansArr); - return super.setFreqCurrentDoc(); - } + protected void processPayloads(Collection payLoads) { + for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) { + byte[] thePayload = (byte[]) iterator.next(); + payloadScore = function.currentScore(doc, fieldName, payloadsSeen, payloadScore, + similarity.scorePayload(doc, fieldName, thePayload, 0, thePayload.length)); + ++payloadsSeen; + } + } + + // + protected boolean setFreqCurrentDoc() throws IOException { + Spans[] spansArr = new Spans[1]; + spansArr[0] = spans; + payloadScore = 0; + payloadsSeen = 0; + getPayloads(spansArr); + return super.setFreqCurrentDoc(); + } public float score() throws IOException { - return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1); + return super.score() * function.docScore(doc, fieldName, payloadsSeen, payloadScore); } + public Explanation explain(int doc) throws IOException { - Explanation result = new Explanation(); - Explanation nonPayloadExpl = super.explain(doc); - result.addDetail(nonPayloadExpl); - Explanation payloadBoost = new Explanation(); - result.addDetail(payloadBoost); - float avgPayloadScore = (payloadsSeen > 0 ? 
(payloadScore / payloadsSeen) : 1); - payloadBoost.setValue(avgPayloadScore); - payloadBoost.setDescription("scorePayload(...)"); - result.setValue(nonPayloadExpl.getValue() * avgPayloadScore); - result.setDescription("bnq, product of:"); - return result; + Explanation result = new Explanation(); + Explanation nonPayloadExpl = super.explain(doc); + result.addDetail(nonPayloadExpl); + Explanation payloadBoost = new Explanation(); + result.addDetail(payloadBoost); + float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1); + payloadBoost.setValue(avgPayloadScore); + payloadBoost.setDescription("scorePayload(...)"); + result.setValue(nonPayloadExpl.getValue() * avgPayloadScore); + result.setDescription("bnq, product of:"); + return result; } } diff --git a/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java b/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java index c9a80c062aa..d9d21228e14 100644 --- a/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java +++ b/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java @@ -39,106 +39,31 @@ import java.io.IOException; * * @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int) */ -public class BoostingTermQuery extends SpanTermQuery{ +public class BoostingTermQuery extends BoostingFunctionTermQuery implements PayloadQuery{ public BoostingTermQuery(Term term) { - super(term); + this(term, true); + } + + public BoostingTermQuery(Term term, boolean includeSpanScore) { + super(term, new AveragePayloadFunction(), includeSpanScore); } public QueryWeight createQueryWeight(Searcher searcher) throws IOException { return new BoostingTermWeight(this, searcher); } - protected class BoostingTermWeight extends SpanWeight { + protected class BoostingTermWeight extends BoostingFunctionTermWeight { public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException { super(query, searcher); } public Scorer 
scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { - return new BoostingSpanScorer((TermSpans) query.getSpans(reader), this, + return new BoostingFunctionSpanScorer((TermSpans) query.getSpans(reader), this, similarity, reader.norms(query.getField())); } - protected class BoostingSpanScorer extends SpanScorer { - - //TODO: is this the best way to allocate this? - byte[] payload = new byte[256]; - private TermPositions positions; - protected float payloadScore; - private int payloadsSeen; - - public BoostingSpanScorer(TermSpans spans, QueryWeight weight, - Similarity similarity, byte[] norms) throws IOException { - super(spans, weight, similarity, norms); - positions = spans.getPositions(); - - } - - protected boolean setFreqCurrentDoc() throws IOException { - if (!more) { - return false; - } - doc = spans.doc(); - freq = 0.0f; - payloadScore = 0; - payloadsSeen = 0; - Similarity similarity1 = getSimilarity(); - while (more && doc == spans.doc()) { - int matchLength = spans.end() - spans.start(); - - freq += similarity1.sloppyFreq(matchLength); - processPayload(similarity1); - - more = spans.next();//this moves positions to the next match in this document - } - return more || (freq != 0); - } - - - protected void processPayload(Similarity similarity) throws IOException { - if (positions.isPayloadAvailable()) { - payload = positions.getPayload(payload, 0); - payloadScore += similarity.scorePayload(term.field(), payload, 0, positions.getPayloadLength()); - payloadsSeen++; - - } else { - //zero out the payload? - } - - } - - public float score() throws IOException { - - return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1); - } - - - public Explanation explain(final int doc) throws IOException { - ComplexExplanation result = new ComplexExplanation(); - Explanation nonPayloadExpl = super.explain(doc); - result.addDetail(nonPayloadExpl); - //QUESTION: Is there a wau to avoid this skipTo call? 
We need to know whether to load the payload or not - - Explanation payloadBoost = new Explanation(); - result.addDetail(payloadBoost); -/* - if (skipTo(doc) == true) { - processPayload(); - } -*/ - - float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1); - payloadBoost.setValue(avgPayloadScore); - //GSI: I suppose we could toString the payload, but I don't think that would be a good idea - payloadBoost.setDescription("scorePayload(...)"); - result.setValue(nonPayloadExpl.getValue() * avgPayloadScore); - result.setDescription("btq, product of:"); - result.setMatch(nonPayloadExpl.getValue()==0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303 - return result; - } - } - } diff --git a/src/java/org/apache/lucene/search/payloads/MaxPayloadFunction.java b/src/java/org/apache/lucene/search/payloads/MaxPayloadFunction.java new file mode 100644 index 00000000000..5565299f86b --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/MaxPayloadFunction.java @@ -0,0 +1,34 @@ +package org.apache.lucene.search.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Returns the maximum payload score seen, else 1 if there are no payloads on the doc. + *

+ * Is thread safe and completely reusable. + * + **/ +public class MaxPayloadFunction extends PayloadFunction{ + public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) { + return numPayloadsSeen == 0 ? currentPayloadScore : Math.max(currentPayloadScore, currentScore); // first payload seeds the running max (the scorer resets the accumulator to 0) + } + + public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) { + return numPayloadsSeen > 0 ? payloadScore : 1; + } +} diff --git a/src/java/org/apache/lucene/search/payloads/MinPayloadFunction.java b/src/java/org/apache/lucene/search/payloads/MinPayloadFunction.java new file mode 100644 index 00000000000..357d2d75269 --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/MinPayloadFunction.java @@ -0,0 +1,18 @@ +package org.apache.lucene.search.payloads; + + +/** + * Calculates the minimum payload score seen, else 1 if there are no payloads on the doc. + * + **/ +public class MinPayloadFunction extends PayloadFunction { + + public float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore) { + return numPayloadsSeen == 0 ? currentPayloadScore : Math.min(currentPayloadScore, currentScore); // first payload seeds the running min; min(x, 0) would otherwise never exceed 0 + } + + public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) { + return numPayloadsSeen > 0 ? payloadScore : 1; + } + +} diff --git a/src/java/org/apache/lucene/search/payloads/PayloadFunction.java b/src/java/org/apache/lucene/search/payloads/PayloadFunction.java new file mode 100644 index 00000000000..2d0c53e6872 --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/PayloadFunction.java @@ -0,0 +1,58 @@ +package org.apache.lucene.search.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; + + +/** + * An abstract class that defines a way for Boosting*Query instances + * to transform the cumulative effects of payload scores for a document. + * + * @see org.apache.lucene.search.payloads.BoostingFunctionTermQuery for more information + * + *

+ * This class and its derivations are experimental and subject to change + * + **/ +public abstract class PayloadFunction implements Serializable { + + + + + /** + * Calculate the score up to this point for this doc and field + * @param docId The current doc + * @param field The current field + * @param numPayloadsSeen The number of payloads seen so far + * @param currentScore The current score so far + * @param currentPayloadScore The score for the current payload + * @return The new current score + */ + public abstract float currentScore(int docId, String field, int numPayloadsSeen, float currentScore, float currentPayloadScore); + + /** + * Calculate the final score for all the payloads seen so far for this doc/field + * @param docId The current doc + * @param field The current field + * @param numPayloadsSeen The total number of payloads seen on this document + * @param payloadScore The raw score for those payloads + * @return The final score for the payloads + */ + public abstract float docScore(int docId, String field, int numPayloadsSeen, float payloadScore); + +} diff --git a/src/java/org/apache/lucene/search/payloads/PayloadQuery.java b/src/java/org/apache/lucene/search/payloads/PayloadQuery.java new file mode 100644 index 00000000000..e70821f3ddd --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/PayloadQuery.java @@ -0,0 +1,9 @@ +package org.apache.lucene.search.payloads; + + +/** + * Marker interface indicating this Query is Payload aware + * + **/ +public interface PayloadQuery { +} diff --git a/src/test/org/apache/lucene/search/payloads/BoostingFunctionTermQueryTest.java b/src/test/org/apache/lucene/search/payloads/BoostingFunctionTermQueryTest.java new file mode 100644 index 00000000000..f6a1e26c892 --- /dev/null +++ b/src/test/org/apache/lucene/search/payloads/BoostingFunctionTermQueryTest.java @@ -0,0 +1,305 @@ +package org.apache.lucene.search.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.English; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.CheckHits; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.search.spans.TermSpans; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.LowerCaseTokenizer; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.index.Payload; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import java.io.Reader; +import java.io.IOException; + + +/** + * + * + **/ +public class BoostingFunctionTermQueryTest extends LuceneTestCase { + private IndexSearcher searcher; + private BoostingSimilarity similarity = new 
BoostingSimilarity(); + private byte[] payloadField = new byte[]{1}; + private byte[] payloadMultiField1 = new byte[]{2}; + private byte[] payloadMultiField2 = new byte[]{4}; + protected RAMDirectory directory; + + public BoostingFunctionTermQueryTest(String s) { + super(s); + } + + private class PayloadAnalyzer extends Analyzer { + + /** Tokenizes with LowerCaseTokenizer and wraps the stream in a PayloadFilter for fieldName. */ + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new LowerCaseTokenizer(reader); + result = new PayloadFilter(result, fieldName); + return result; + } + } + /** TokenFilter that attaches a field-dependent payload to each token it passes through. */ + private class PayloadFilter extends TokenFilter { + String fieldName; + int numSeen = 0; /* tokens seen so far in "multiField"; its parity selects the payload */ + + PayloadAttribute payloadAtt; + + public PayloadFilter(TokenStream input, String fieldName) { + super(input); + this.fieldName = fieldName; + payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + } + /** Sets payload {1} on "field" tokens and alternates {2}/{4} on "multiField" tokens; no payload is set here for other fields. */ + public boolean incrementToken() throws IOException { + boolean hasNext = input.incrementToken(); + if (hasNext) { + if (fieldName.equals("field")) { + payloadAtt.setPayload(new Payload(payloadField)); + } else if (fieldName.equals("multiField")) { + if (numSeen % 2 == 0) { + payloadAtt.setPayload(new Payload(payloadMultiField1)); + } else { + payloadAtt.setPayload(new Payload(payloadMultiField2)); + } + numSeen++; + } + return true; + } else { + return false; + } + } + } + + protected void setUp() throws Exception { + super.setUp(); + directory = new RAMDirectory(); + PayloadAnalyzer analyzer = new PayloadAnalyzer(); + IndexWriter writer + = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); + writer.setSimilarity(similarity); + //writer.infoStream = System.out; + for (int i = 0; i < 1000; i++) { + Document doc = new Document(); + Field noPayloadField = new Field(PayloadHelper.NO_PAYLOAD_FIELD, English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED); + //noPayloadField.setBoost(0); + doc.add(noPayloadField); + doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES,
Field.Index.ANALYZED)); + doc.add(new Field("multiField", English.intToEnglish(i) + " " + English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + writer.optimize(); + writer.close(); + + searcher = new IndexSearcher(directory, true); + searcher.setSimilarity(similarity); + } + + public void test() throws IOException { + BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term("field", "seventy"), + new MaxPayloadFunction()); + TopDocs hits = searcher.search(query, null, 100); + assertTrue("hits is null and it shouldn't be", hits != null); + assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100); + + //they should all have the exact same score, because they all contain seventy once, and we set + //all the other similarity factors to be 1 + + assertTrue(hits.getMaxScore() + " does not equal: " + 1, hits.getMaxScore() == 1); + for (int i = 0; i < hits.scoreDocs.length; i++) { + ScoreDoc doc = hits.scoreDocs[i]; + assertTrue(doc.score + " does not equal: " + 1, doc.score == 1); + } + CheckHits.checkExplanations(query, PayloadHelper.FIELD, searcher, true); + Spans spans = query.getSpans(searcher.getIndexReader()); + assertTrue("spans is null and it shouldn't be", spans != null); + assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans); + /*float score = hits.score(0); + for (int i =1; i < hits.length(); i++) + { + assertTrue("scores are not equal and they should be", score == hits.score(i)); + }*/ + + } + + public void testMultipleMatchesPerDoc() throws Exception { + BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term(PayloadHelper.MULTI_FIELD, "seventy"), + new MaxPayloadFunction()); + TopDocs hits = searcher.search(query, null, 100); + assertTrue("hits is null and it shouldn't be", hits != null); + assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100); + + //each doc contains seventy twice in this field, so the
scores differ per doc (4 or 2, asserted below) even though we set + //all the other similarity factors to be 1 + + //System.out.println("Hash: " + seventyHash + " Twice Hash: " + 2*seventyHash); + assertTrue(hits.getMaxScore() + " does not equal: " + 4.0, hits.getMaxScore() == 4.0); + //there should be exactly 10 items that score a 4, all the rest should score a 2 + //The 10 items are: 70 + i*100 where i in [0-9] + int numTens = 0; + for (int i = 0; i < hits.scoreDocs.length; i++) { + ScoreDoc doc = hits.scoreDocs[i]; + if (doc.doc % 10 == 0) { + numTens++; + assertTrue(doc.score + " does not equal: " + 4.0, doc.score == 4.0); + } else { + assertTrue(doc.score + " does not equal: " + 2, doc.score == 2); + } + } + assertTrue(numTens + " does not equal: " + 10, numTens == 10); + CheckHits.checkExplanations(query, "field", searcher, true); + Spans spans = query.getSpans(searcher.getIndexReader()); + assertTrue("spans is null and it shouldn't be", spans != null); + assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans); + //should be two matches per document + int count = 0; + //100 hits times 2 matches per hit, we should have 200 in count + while (spans.next()) { + count++; + } + assertTrue(count + " does not equal: " + 200, count == 200); + } + + //Set includeSpanScore to false, in which case just the payload score comes through.
+ public void testIgnoreSpanScorer() throws Exception { + BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term(PayloadHelper.MULTI_FIELD, "seventy"), + new MaxPayloadFunction(), false); + + IndexSearcher theSearcher = new IndexSearcher(directory, true); + theSearcher.setSimilarity(new FullSimilarity()); + TopDocs hits = theSearcher.search(query, null, 100); + assertTrue("hits is null and it shouldn't be", hits != null); + assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100); + + //the span score is ignored, so only the payload score (4 or 2, asserted below) should come through, + //even though FullSimilarity leaves every other similarity factor at its default + + //System.out.println("Hash: " + seventyHash + " Twice Hash: " + 2*seventyHash); + assertTrue(hits.getMaxScore() + " does not equal: " + 4.0, hits.getMaxScore() == 4.0); + //there should be exactly 10 items that score a 4, all the rest should score a 2 + //The 10 items are: 70 + i*100 where i in [0-9] + int numTens = 0; + for (int i = 0; i < hits.scoreDocs.length; i++) { + ScoreDoc doc = hits.scoreDocs[i]; + if (doc.doc % 10 == 0) { + numTens++; + assertTrue(doc.score + " does not equal: " + 4.0, doc.score == 4.0); + } else { + assertTrue(doc.score + " does not equal: " + 2, doc.score == 2); + } + } + assertTrue(numTens + " does not equal: " + 10, numTens == 10); + CheckHits.checkExplanations(query, "field", searcher, true); /* NOTE(review): explanations are still checked against the shared searcher, as before; confirm they also hold under FullSimilarity */ + Spans spans = query.getSpans(searcher.getIndexReader()); + assertTrue("spans is null and it shouldn't be", spans != null); + assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans); + //two matches per document times 100 hits, so we should have 200 in count + int count = 0; + while (spans.next()) { + count++; + } + assertTrue(count + " does not equal: " + 200, count == 200); + } + + public void testNoMatch() throws Exception { + BoostingFunctionTermQuery query = new BoostingFunctionTermQuery(new Term(PayloadHelper.FIELD, "junk"), + new MaxPayloadFunction()); + TopDocs hits =
searcher.search(query, null, 100); + assertTrue("hits is null and it shouldn't be", hits != null); + assertTrue("hits Size: " + hits.totalHits + " is not: " + 0, hits.totalHits == 0); + + } + + public void testNoPayload() throws Exception { + BoostingFunctionTermQuery q1 = new BoostingFunctionTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "zero"), + new MaxPayloadFunction()); + BoostingFunctionTermQuery q2 = new BoostingFunctionTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "foo"), + new MaxPayloadFunction()); + BooleanClause c1 = new BooleanClause(q1, BooleanClause.Occur.MUST); + BooleanClause c2 = new BooleanClause(q2, BooleanClause.Occur.MUST_NOT); + BooleanQuery query = new BooleanQuery(); + query.add(c1); + query.add(c2); + TopDocs hits = searcher.search(query, null, 100); + assertTrue("hits is null and it shouldn't be", hits != null); + assertTrue("hits Size: " + hits.totalHits + " is not: " + 1, hits.totalHits == 1); + int[] results = new int[1]; + results[0] = 0;//hits.scoreDocs[0].doc; + CheckHits.checkHitCollector(query, PayloadHelper.NO_PAYLOAD_FIELD, searcher, results); + } + + // must be static for weight serialization tests + static class BoostingSimilarity extends DefaultSimilarity { + + // TODO: Remove warning after API has been finalized + public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) { + //the payloads written by this test are a single byte each, so ignore the offset/length + return payload[0]; + } + + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + //Make everything else 1 so we see the effect of the payload + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ public float lengthNorm(String fieldName, int numTerms) { + return 1; + } + + public float queryNorm(float sumOfSquaredWeights) { + return 1; + } + + public float sloppyFreq(int distance) { + return 1; + } + + public float coord(int overlap, int maxOverlap) { + return 1; + } + + public float idf(int docFreq, int numDocs) { + return 1; + } + + public float tf(float freq) { + return freq == 0 ? 0 : 1; + } + } + /** DefaultSimilarity that only overrides payload scoring; every other factor keeps its inherited behavior. */ + static class FullSimilarity extends DefaultSimilarity{ + public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) { + //the payloads written by this test are a single byte each, so ignore the offset/length + return payload[0]; + } + } + +} diff --git a/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java b/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java index 5e8e9639493..5f47f9bed4f 100644 --- a/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java +++ b/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java @@ -184,7 +184,7 @@ public class TestBoostingNearQuery extends LuceneTestCase { // must be static for weight serialization tests static class BoostingSimilarity extends DefaultSimilarity { - public float scorePayload(String fieldName, byte[] payload, int offset, int length) { + public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) { return payload[0]; } diff --git a/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java b/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java index 2bc3a831e9e..414b5901430 100644 --- a/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java +++ b/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java @@ -210,7 +210,7 @@ public class TestBoostingTermQuery extends LuceneTestCase { static class BoostingSimilarity extends DefaultSimilarity { // TODO: Remove warning after API has been finalized - public float scorePayload(String fieldName, byte[]
payload, int offset, int length) { + public float scorePayload(int docId, String fieldName, byte[] payload, int offset, int length) { //we know it is size 4 here, so ignore the offset/length return payload[0]; }