LUCENE-1341: Added BoostingNearQuery

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@801667 13f79535-47bb-0310-9956-ffa450edef68
2009-08-06 15:14:18 +00:00 · 2009-08-06 15:14:18 +00:00 · d4eeefff81
parent fc369cdf2f
commit d4eeefff81
6 changed files with 392 additions and 4 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -662,6 +662,9 @@ New features
    the javadocs implied. If you have payloads and want to use an ordered
    SpanNearQuery that does not need to use the payloads, you can
    disable loading them with a new constructor switch.  (Mark Miller)
+
+34. LUCENE-1341: Added BoostingNearQuery to enable SpanNearQuery functionality
+  with payloads (Peter Keegan, Grant Ingersoll)    
   
 Optimizations

--- a/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java
+++ b/src/java/org/apache/lucene/search/payloads/BoostingNearQuery.java
@ -0,0 +1,151 @@
+package org.apache.lucene.search.payloads;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.QueryWeight;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanWeight;
+import org.apache.lucene.search.spans.SpanScorer;
+import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.search.spans.NearSpansOrdered;
+import org.apache.lucene.search.spans.NearSpansUnordered;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Collection;
+
+/**
+ * The BoostingNearQuery is very similar to the {@link org.apache.lucene.search.spans.SpanNearQuery} except
+ * that it factors in the value of the payloads located at each of the positions where the
+ * {@link org.apache.lucene.search.spans.TermSpans} occurs.
+ * <p>
+ * In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)}
+ * which returns 1 by default.
+ * <p>
+ * Payload scores are averaged across term occurrences in the document.  
+ * 
+ * @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int)
+ */
+
+public class BoostingNearQuery extends SpanNearQuery {
+	String fieldName;
+
+  public BoostingNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) {
+    super(clauses, slop, inOrder);
+    fieldName = clauses[0].getField(); // all clauses must have same field 
+  }
+
+  public QueryWeight createQueryWeight(Searcher searcher) throws IOException {
+    return new BoostingSpanWeight(this, searcher);
+  }
+
+  public class BoostingSpanWeight extends SpanWeight {
+    public BoostingSpanWeight(SpanQuery query, Searcher searcher) throws IOException {
+      super(query, searcher);
+    }
+
+    public Scorer scorer(IndexReader reader) throws IOException {
+      return new BoostingSpanScorer(query.getSpans(reader), this,
+              similarity,
+              reader.norms(query.getField()));
+    }
+    public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
+        return new BoostingSpanScorer(query.getSpans(reader), this,
+                similarity,
+                reader.norms(query.getField()));
+    }
+  }
+
+  public class BoostingSpanScorer extends SpanScorer {
+	Spans spans;
+    Spans[] subSpans = null;
+    protected float payloadScore;
+    private int payloadsSeen;    
+    Similarity similarity = getSimilarity();
+
+    protected BoostingSpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
+            throws IOException {
+      super(spans, weight, similarity, norms);
+      this.spans = spans;
+    }
+
+    // Get the payloads associated with all underlying subspans
+    public void getPayloads(Spans[] subSpans) throws IOException {
+    	for (int i = 0; i < subSpans.length; i++) {
+    		if (subSpans[i] instanceof NearSpansOrdered) {
+    			if (((NearSpansOrdered)subSpans[i]).isPayloadAvailable()) {
+    				processPayloads(((NearSpansOrdered)subSpans[i]).getPayload());
+    			}
+    			getPayloads(((NearSpansOrdered) subSpans[i]).getSubSpans());
+    		} else if (subSpans[i] instanceof NearSpansUnordered) {
+    			if (((NearSpansUnordered)subSpans[i]).isPayloadAvailable()) {
+    				processPayloads(((NearSpansUnordered)subSpans[i]).getPayload());
+    			}
+    			getPayloads(((NearSpansUnordered) subSpans[i]).getSubSpans());
+    		}
+    	}
+    }
+
+    /**
+     * By default, sums the payloads, but can be overridden to do other things.
+     * @param payLoads The payloads
+     */
+   protected void processPayloads(Collection payLoads) {
+       for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) {
+           byte[] thePayload = (byte[]) iterator.next();
+           ++payloadsSeen;
+           payloadScore += similarity.scorePayload(fieldName, thePayload, 0, thePayload.length);
+       }
+   }
+//
+   protected boolean setFreqCurrentDoc() throws IOException {
+       Spans[] spansArr = new Spans[1];
+       spansArr[0] = spans;
+       payloadScore = 0;
+       payloadsSeen = 0;        
+       getPayloads(spansArr);
+       return super.setFreqCurrentDoc();
+   }
+
+    public float score() throws IOException {
+
+    	return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
+    }
+    public Explanation explain(int doc) throws IOException {
+    	Explanation result = new Explanation();
+    	Explanation nonPayloadExpl = super.explain(doc);
+    	result.addDetail(nonPayloadExpl);
+    	Explanation payloadBoost = new Explanation();
+    	result.addDetail(payloadBoost);
+    	float avgPayloadScore = (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
+    	payloadBoost.setValue(avgPayloadScore);
+    	payloadBoost.setDescription("scorePayload(...)");
+    	result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
+    	result.setDescription("bnq, product of:");
+    	return result;
+    }
+  }
+
+
+}
--- a/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
+++ b/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java
@ -46,8 +46,12 @@ import java.util.Set;
 * matches twice:
 * <pre>t1 t2 .. t3      </pre>
 * <pre>      t1 .. t2 t3</pre>
+ *
+ *
+ * Expert:
+ * Only public for subclassing.  Most implementations should not need this class
 */
-class NearSpansOrdered implements Spans {
+public class NearSpansOrdered implements Spans {
  private final int allowedSlop;
  private boolean firstTime = true;
  private boolean more = false;
@ -104,6 +108,10 @@ class NearSpansOrdered implements Spans {

  // inherit javadocs
  public int end() { return matchEnd; }
+  
+  public Spans[] getSubSpans() {
+	  return subSpans;
+  }  

  // TODO: Remove warning after API has been finalized
  // TODO: Would be nice to be able to lazy load payloads
--- a/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java
+++ b/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java
@ -27,10 +27,15 @@ import java.util.List;
 import java.util.Set;
 import java.util.HashSet;

-class NearSpansUnordered implements Spans {
+/**
+ * Expert:
+ * Only public for subclassing.  Most implementations should not need this class
+ */
+public class NearSpansUnordered implements Spans {
  private SpanNearQuery query;

  private List ordered = new ArrayList();         // spans in query order
+  private Spans[] subSpans;  
  private int slop;                               // from query

  private SpansCell first;                        // linked list of spans
@ -122,13 +127,17 @@ class NearSpansUnordered implements Spans {

    SpanQuery[] clauses = query.getClauses();
    queue = new CellQueue(clauses.length);
+    subSpans = new Spans[clauses.length];    
    for (int i = 0; i < clauses.length; i++) {
      SpansCell cell =
        new SpansCell(clauses[i].getSpans(reader), i);
      ordered.add(cell);
+      subSpans[i] = cell.spans;
    }
  }
-
+  public Spans[] getSubSpans() {
+	  return subSpans;
+  }
  public boolean next() throws IOException {
    if (firstTime) {
      initList(true);
--- a/src/java/org/apache/lucene/search/spans/SpanNearQuery.java
+++ b/src/java/org/apache/lucene/search/spans/SpanNearQuery.java
@ -38,7 +38,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
  private int slop;
  private boolean inOrder;

-  private String field;
+  protected String field;
  private boolean collectPayloads;

  /** Construct a SpanNearQuery.  Matches spans matching a span from each
--- a/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java
+++ b/src/test/org/apache/lucene/search/payloads/TestBoostingNearQuery.java
@ -0,0 +1,217 @@
+package org.apache.lucene.search.payloads;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collection;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+
+
+public class TestBoostingNearQuery extends LuceneTestCase {
+	private IndexSearcher searcher;
+	private BoostingSimilarity similarity = new BoostingSimilarity();
+	private byte[] payload2 = new byte[]{2};
+	private byte[] payload4 = new byte[]{4};
+
+	public TestBoostingNearQuery(String s) {
+		super(s);
+	}
+
+	private class PayloadAnalyzer extends Analyzer {
+		public TokenStream tokenStream(String fieldName, Reader reader) {
+			TokenStream result = new LowerCaseTokenizer(reader);
+			result = new PayloadFilter(result, fieldName);
+			return result;
+		}
+	}
+
+	private class PayloadFilter extends TokenFilter {
+		String fieldName;
+		int numSeen = 0;
+    protected PayloadAttribute payAtt;
+
+		public PayloadFilter(TokenStream input, String fieldName) {
+			super(input);
+			this.fieldName = fieldName;
+      payAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+		}
+
+    public boolean incrementToken() throws IOException {
+      boolean result = false;
+      if (input.incrementToken() == true){
+        if (numSeen % 2 == 0) {
+					payAtt.setPayload(new Payload(payload2));
+				} else {
+					payAtt.setPayload(new Payload(payload4));
+				}
+				numSeen++;
+        result = true;
+      }
+      return result;
+    }
+  }
+  
+	private BoostingNearQuery newPhraseQuery (String fieldName, String phrase, boolean inOrder) {
+		int n;
+		String[] words = phrase.split("[\\s]+");
+		SpanQuery clauses[] = new SpanQuery[words.length];
+		for (int i=0;i<clauses.length;i++) {
+			clauses[i] = new BoostingTermQuery(new Term(fieldName, words[i]));  
+		} 
+		return new BoostingNearQuery(clauses, 0, inOrder);
+	}
+
+	protected void setUp() throws Exception {
+		super.setUp();
+		RAMDirectory directory = new RAMDirectory();
+		PayloadAnalyzer analyzer = new PayloadAnalyzer();
+		IndexWriter writer
+		= new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+		writer.setSimilarity(similarity);
+		//writer.infoStream = System.out;
+		for (int i = 0; i < 1000; i++) {
+			Document doc = new Document();
+			doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
+			writer.addDocument(doc);
+		}
+		writer.optimize();
+		writer.close();
+
+		searcher = new IndexSearcher(directory, true);
+		searcher.setSimilarity(similarity);
+	}
+
+	public void test() throws IOException {
+		BoostingNearQuery query;
+		TopDocs hits;
+
+		query = newPhraseQuery("field", "twenty two", true);
+		// all 10 hits should have score = 3 because adjacent terms have payloads of 2,4
+		// and all the similarity factors are set to 1
+		hits = searcher.search(query, null, 100);
+		assertTrue("hits is null and it shouldn't be", hits != null);
+		assertTrue("should be 10 hits", hits.totalHits == 10);
+		for (int j = 0; j < hits.scoreDocs.length; j++) {
+			ScoreDoc doc = hits.scoreDocs[j];
+			assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
+		}
+		for (int i=1;i<10;i++) {
+			query = newPhraseQuery("field", English.intToEnglish(i)+" hundred", true);
+			// all should have score = 3 because adjacent terms have payloads of 2,4
+			// and all the similarity factors are set to 1
+			hits = searcher.search(query, null, 100);
+			assertTrue("hits is null and it shouldn't be", hits != null);
+			assertTrue("should be 100 hits", hits.totalHits == 100);
+			for (int j = 0; j < hits.scoreDocs.length; j++) {
+				ScoreDoc doc = hits.scoreDocs[j];
+//				System.out.println("Doc: " + doc.toString());
+//				System.out.println("Explain: " + searcher.explain(query, doc.doc));
+				assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
+			}
+		}
+	}
+
+	public void testLongerSpan() throws IOException {
+		BoostingNearQuery query;
+		TopDocs hits;
+		query = newPhraseQuery("field", "nine hundred ninety nine", true);
+		hits = searcher.search(query, null, 100);
+		ScoreDoc doc = hits.scoreDocs[0];
+//		System.out.println("Doc: " + doc.toString());
+//		System.out.println("Explain: " + searcher.explain(query, doc.doc));
+		assertTrue("hits is null and it shouldn't be", hits != null);
+		assertTrue("there should only be one hit", hits.totalHits == 1);
+		// should have score = 3 because adjacent terms have payloads of 2,4
+		assertTrue(doc.score + " does not equal: " + 3, doc.score == 3); 
+	}
+
+	public void testComplexNested() throws IOException {
+		BoostingNearQuery query;
+		TopDocs hits;
+
+		// combine ordered and unordered spans with some nesting to make sure all payloads are counted
+
+		SpanQuery q1 = newPhraseQuery("field", "nine hundred", true);
+		SpanQuery q2 = newPhraseQuery("field", "ninety nine", true);
+		SpanQuery q3 = newPhraseQuery("field", "nine ninety", false);
+		SpanQuery q4 = newPhraseQuery("field", "hundred nine", false);
+		SpanQuery[]clauses = new SpanQuery[] {new BoostingNearQuery(new SpanQuery[] {q1,q2}, 0, true), new BoostingNearQuery(new SpanQuery[] {q3,q4}, 0, false)};
+		query = new BoostingNearQuery(clauses, 0, false);
+		hits = searcher.search(query, null, 100);
+		assertTrue("hits is null and it shouldn't be", hits != null);
+		// should be only 1 hit - doc 999
+		assertTrue("should only be one hit", hits.scoreDocs.length == 1);
+		// the score should be 3 - the average of all the underlying payloads
+		ScoreDoc doc = hits.scoreDocs[0];
+//		System.out.println("Doc: " + doc.toString());
+//		System.out.println("Explain: " + searcher.explain(query, doc.doc));
+		assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);  
+	}
+	// must be static for weight serialization tests 
+	static class BoostingSimilarity extends DefaultSimilarity {
+
+		public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
+			return payload[0];
+		}
+
+		//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+		//Make everything else 1 so we see the effect of the payload
+		//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+		public float lengthNorm(String fieldName, int numTerms) {
+			return 1;
+		}
+
+		public float queryNorm(float sumOfSquaredWeights) {
+			return 1;
+		}
+
+		public float sloppyFreq(int distance) {
+			return 1;
+		}
+
+		public float coord(int overlap, int maxOverlap) {
+			return 1;
+		}
+		public float tf(float freq) {
+			return 1;
+		}
+		// idf used for phrase queries
+		public float idf(Collection terms, Searcher searcher) {
+			return 1;
+		}
+	}
+}