From d4eeefff81d0742da75256bd6f3767397b37a9dd Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Thu, 6 Aug 2009 15:14:18 +0000 Subject: [PATCH] LUCENE-1341: Added BoostingNearQuery git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@801667 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + .../search/payloads/BoostingNearQuery.java | 151 ++++++++++++ .../lucene/search/spans/NearSpansOrdered.java | 10 +- .../search/spans/NearSpansUnordered.java | 13 +- .../lucene/search/spans/SpanNearQuery.java | 2 +- .../payloads/TestBoostingNearQuery.java | 217 ++++++++++++++++++ 6 files changed, 392 insertions(+), 4 deletions(-) create mode 100644 src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java create mode 100644 src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java diff --git a/CHANGES.txt b/CHANGES.txt index 456f0b08442..1781fc80510 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -662,6 +662,9 @@ New features the javadocs implied. If you have payloads and want to use an ordered SpanNearQuery that does not need to use the payloads, you can disable loading them with a new constructor switch. (Mark Miller) + +34. LUCENE-1341: Added BoostingNearQuery to enable SpanNearQuery functionality + with payloads (Peter Keegan, Grant Ingersoll) Optimizations diff --git a/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java b/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java new file mode 100644 index 00000000000..2ab70d803b7 --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java @@ -0,0 +1,151 @@ +package org.apache.lucene.search.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.QueryWeight; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanWeight; +import org.apache.lucene.search.spans.SpanScorer; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.search.spans.NearSpansOrdered; +import org.apache.lucene.search.spans.NearSpansUnordered; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Collection; + +/** + * The BoostingNearQuery is very similar to the {@link org.apache.lucene.search.spans.SpanNearQuery} except + * that it factors in the value of the payloads located at each of the positions where the + * {@link org.apache.lucene.search.spans.TermSpans} occurs. + *

+ * In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)} + * which returns 1 by default. + *

+ * Payload scores are averaged across term occurrences in the document. + * + * @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int) + */ + +public class BoostingNearQuery extends SpanNearQuery { + String fieldName; + + public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) { + super(clauses, slop, inOrder); + fieldName = clauses[0].getField(); // all clauses must have same field + } + + public QueryWeight createQueryWeight(Searcher searcher) throws IOException { + return new BoostingSpanWeight(this, searcher); + } + + public class BoostingSpanWeight extends SpanWeight { + public BoostingSpanWeight(SpanQuery query, Searcher searcher) throws IOException { + super(query, searcher); + } + + public Scorer scorer(IndexReader reader) throws IOException { + return new BoostingSpanScorer(query.getSpans(reader), this, + similarity, + reader.norms(query.getField())); + } + public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { + return new BoostingSpanScorer(query.getSpans(reader), this, + similarity, + reader.norms(query.getField())); + } + } + + public class BoostingSpanScorer extends SpanScorer { + Spans spans; + Spans[] subSpans = null; + protected float payloadScore; + private int payloadsSeen; + Similarity similarity = getSimilarity(); + + protected BoostingSpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) + throws IOException { + super(spans, weight, similarity, norms); + this.spans = spans; + } + + // Get the payloads associated with all underlying subspans + public void getPayloads(Spans[] subSpans) throws IOException { + for (int i = 0; i < subSpans.length; i++) { + if (subSpans[i] instanceof NearSpansOrdered) { + if (((NearSpansOrdered)subSpans[i]).isPayloadAvailable()) { + processPayloads(((NearSpansOrdered)subSpans[i]).getPayload()); + } + getPayloads(((NearSpansOrdered) subSpans[i]).getSubSpans()); + } else if (subSpans[i] instanceof NearSpansUnordered) { + if (((NearSpansUnordered)subSpans[i]).isPayloadAvailable()) { + processPayloads(((NearSpansUnordered)subSpans[i]).getPayload()); + } + getPayloads(((NearSpansUnordered) subSpans[i]).getSubSpans()); + } + } + } + + /** + * By default, sums the payloads, but can be overridden to do other things. + * @param payLoads The payloads + */ + protected void processPayloads(Collection payLoads) { + for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) { + byte[] thePayload = (byte[]) iterator.next(); + ++payloadsSeen; + payloadScore += similarity.scorePayload(fieldName, thePayload, 0, thePayload.length); + } + } +// + protected boolean setFreqCurrentDoc() throws IOException { + Spans[] spansArr = new Spans[1]; + spansArr[0] = spans; + payloadScore = 0; + payloadsSeen = 0; + getPayloads(spansArr); + return super.setFreqCurrentDoc(); + } + + public float score() throws IOException { + + return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1); + } + public Explanation explain(int doc) throws IOException { + Explanation result = new Explanation(); + Explanation nonPayloadExpl = super.explain(doc); + result.addDetail(nonPayloadExpl); + Explanation payloadBoost = new Explanation(); + result.addDetail(payloadBoost); + float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1); + payloadBoost.setValue(avgPayloadScore); + payloadBoost.setDescription("scorePayload(...)"); + result.setValue(nonPayloadExpl.getValue() * avgPayloadScore); + result.setDescription("bnq, product of:"); + return result; + } + } + + +} diff --git a/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java b/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java index e80b1031227..9d0a760b569 100644 --- a/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java +++ b/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java @@ -46,8 +46,12 @@ import java.util.Set; * matches twice: *

t1 t2 .. t3      
*
      t1 .. t2 t3
+ * + * + * Expert: + * Only public for subclassing. Most implementations should not need this class */ -class NearSpansOrdered implements Spans { +public class NearSpansOrdered implements Spans { private final int allowedSlop; private boolean firstTime = true; private boolean more = false; @@ -104,6 +108,10 @@ class NearSpansOrdered implements Spans { // inherit javadocs public int end() { return matchEnd; } + + public Spans[] getSubSpans() { + return subSpans; + } // TODO: Remove warning after API has been finalized // TODO: Would be nice to be able to lazy load payloads diff --git a/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java b/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java index 67e2c5ade53..bef60fa3c45 100644 --- a/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java +++ b/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java @@ -27,10 +27,15 @@ import java.util.List; import java.util.Set; import java.util.HashSet; -class NearSpansUnordered implements Spans { +/** + * Expert: + * Only public for subclassing. Most implementations should not need this class + */ +public class NearSpansUnordered implements Spans { private SpanNearQuery query; private List ordered = new ArrayList(); // spans in query order + private Spans[] subSpans; private int slop; // from query private SpansCell first; // linked list of spans @@ -122,13 +127,17 @@ class NearSpansUnordered implements Spans { SpanQuery[] clauses = query.getClauses(); queue = new CellQueue(clauses.length); + subSpans = new Spans[clauses.length]; for (int i = 0; i < clauses.length; i++) { SpansCell cell = new SpansCell(clauses[i].getSpans(reader), i); ordered.add(cell); + subSpans[i] = cell.spans; } } - + public Spans[] getSubSpans() { + return subSpans; + } public boolean next() throws IOException { if (firstTime) { initList(true); diff --git a/src/java/org/apache/lucene/search/spans/SpanNearQuery.java b/src/java/org/apache/lucene/search/spans/SpanNearQuery.java index a7509970643..84830611cff 100644 --- a/src/java/org/apache/lucene/search/spans/SpanNearQuery.java +++ b/src/java/org/apache/lucene/search/spans/SpanNearQuery.java @@ -38,7 +38,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { private int slop; private boolean inOrder; - private String field; + protected String field; private boolean collectPayloads; /** Construct a SpanNearQuery. Matches spans matching a span from each diff --git a/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java b/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java new file mode 100644 index 00000000000..5e8e9639493 --- /dev/null +++ b/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java @@ -0,0 +1,217 @@ +package org.apache.lucene.search.payloads; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.io.Reader; +import java.util.Collection; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Payload; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; + + +public class TestBoostingNearQuery extends LuceneTestCase { + private IndexSearcher searcher; + private BoostingSimilarity similarity = new BoostingSimilarity(); + private byte[] payload2 = new byte[]{2}; + private byte[] payload4 = new byte[]{4}; + + public TestBoostingNearQuery(String s) { + super(s); + } + + private class PayloadAnalyzer extends Analyzer { + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new LowerCaseTokenizer(reader); + result = new PayloadFilter(result, fieldName); + return result; + } + } + + private class PayloadFilter extends TokenFilter { + String fieldName; + int numSeen = 0; + protected PayloadAttribute payAtt; + + public PayloadFilter(TokenStream input, String fieldName) { + super(input); + this.fieldName = fieldName; + payAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); + } + + public boolean incrementToken() throws IOException { + boolean result = false; + if (input.incrementToken() == true){ + if (numSeen % 2 == 0) { + payAtt.setPayload(new Payload(payload2)); + } else { + payAtt.setPayload(new Payload(payload4)); + } + numSeen++; + result = true; + } + return result; + } + } + + private BoostingNearQuery newPhraseQuery (String fieldName, String phrase, boolean inOrder) { + int n; + String[] words = phrase.split("[\\s]+"); + SpanQuery clauses[] = new SpanQuery[words.length]; + for (int i=0;i