LUCENE-834:

Added the payloads search package, with one Query implementation: BoostingTermQuery.

Added isPayloadAvailable() method to TermPositions and implementations.

Modified access rights to some of the spans classes so that they could be accessed from the payloads package.

All tests pass.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@523302 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-03-28 12:58:15 +00:00
parent 4f1d6582d7
commit b591bd8efb
15 changed files with 609 additions and 121 deletions

View File

@ -34,6 +34,9 @@ API Changes
throw AlreadyClosedException if they are accessed after being
closed. (Mike McCandless)
5. LUCENE-834: Changed some access levels for certain Span classes to allow them to be overridden. They have
been marked expert only and not for public consumption. (Grant Ingersoll)
Bug fixes
1. LUCENE-804: Fixed build.xml to pack a fully compilable src dist. (Doron Cohen)
@ -101,6 +104,12 @@ New features
contain appropriate warnings in the javadocs.
(Michael Busch)
4. LUCENE-834: Added BoostingTermQuery which can boost scores based on the values of a payload (see #3 above.) (Grant Ingersoll)
5. LUCENE-834: Similarity has a new method for scoring payloads called scorePayload that can be overridden to take advantage
of payload storage (see #3 above)
6. LUCENE-834: Added isPayloadAvailable() onto TermPositions interface and implemented it in the appropriate places (Grant Ingersoll)
Optimizations
1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions

View File

@ -20,7 +20,6 @@ package org.apache.lucene.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import java.io.IOException;
import java.util.Collection;
@ -70,6 +69,12 @@ public class FilterIndexReader extends IndexReader {
/**
 * Forwards the payload request unchanged to the wrapped <code>in</code> instance,
 * which is cast to {@link TermPositions}.
 */
public byte[] getPayload(byte[] data, int offset) throws IOException {
return ((TermPositions) this.in).getPayload(data, offset);
}
/**
 * Forwards the availability check to the wrapped <code>in</code> instance,
 * which is cast to {@link TermPositions}.
 *
 * @return whatever the wrapped instance reports
 */
// TODO: Remove warning after API has been finalized
public boolean isPayloadAvailable() {
return ((TermPositions)this.in).isPayloadAvailable();
}
}
/** Base class for filtering {@link TermEnum} implementations. */

View File

@ -463,4 +463,10 @@ class MultiTermPositions extends MultiTermDocs implements TermPositions {
/**
 * Forwards the payload request unchanged to <code>current</code>,
 * which is cast to {@link TermPositions}.
 */
public byte[] getPayload(byte[] data, int offset) throws IOException {
return ((TermPositions)current).getPayload(data, offset);
}
/**
 * Forwards the availability check to <code>current</code>,
 * which is cast to {@link TermPositions}.
 *
 * @return whatever <code>current</code> reports
 */
// TODO: Remove warning after API has been finalized
public boolean isPayloadAvailable() {
return ((TermPositions) current).isPayloadAvailable();
}
}

View File

@ -17,14 +17,14 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import org.apache.lucene.util.PriorityQueue;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.util.PriorityQueue;
/**
* Describe class <code>MultipleTermPositions</code> here.
*
@ -209,5 +209,12 @@ public class MultipleTermPositions implements TermPositions {
throw new UnsupportedOperationException();
}
/**
 * This implementation never reports a payload as available.
 *
 * @return always <code>false</code>
 */
// TODO: Remove warning after API has been finalized
public boolean isPayloadAvailable() {
return false;
}
}

View File

@ -18,22 +18,12 @@ package org.apache.lucene.index;
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.document.Fieldable;
import java.io.IOException;
import java.util.SortedMap;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.Collection;
import java.util.Iterator;
import java.util.Enumeration;
import java.util.Set;
import java.util.HashSet;
import java.util.*;
/** An IndexReader which reads multiple, parallel indexes. Each index added
@ -426,6 +416,12 @@ public class ParallelReader extends IndexReader {
/**
 * Forwards the payload request unchanged to <code>termDocs</code>,
 * which is cast to {@link TermPositions}.
 */
public byte[] getPayload(byte[] data, int offset) throws IOException {
return ((TermPositions)termDocs).getPayload(data, offset);
}
/**
 * Forwards the availability check to <code>termDocs</code>,
 * which is cast to {@link TermPositions}.
 *
 * @return whatever <code>termDocs</code> reports
 */
// TODO: Remove warning after API has been finalized
public boolean isPayloadAvailable() {
return ((TermPositions) termDocs).isPayloadAvailable();
}
}
}

View File

@ -17,10 +17,10 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import java.io.IOException;
final class SegmentTermPositions
extends SegmentTermDocs implements TermPositions {
private IndexInput proxStream;
@ -189,4 +189,9 @@ extends SegmentTermDocs implements TermPositions {
return retArray;
}
/**
 * Reports a payload as available only while the <code>needToLoadPayload</code> flag is
 * set and <code>payloadLength</code> is greater than zero.
 *
 * @return <code>true</code> if both conditions hold, <code>false</code> otherwise
 */
// TODO: Remove warning after API has been finalized
public boolean isPayloadAvailable() {
return needToLoadPayload && payloadLength > 0;
}
}

View File

@ -81,4 +81,20 @@ public interface TermPositions
*/
// TODO: Remove warning after API has been finalized
byte[] getPayload(byte[] data, int offset) throws IOException;
/**
 * Can we load the payload at this position? Payloads can only be loaded once per call
 * to {@link #nextPosition()}.
 * <p>
 * <b>
 * Warning: The status of the Payloads feature is experimental. The APIs
 * introduced here might change in the future and will not be supported anymore
 * in such a case. If you want to use this feature in a production environment
 * you should wait for an official release.
 * </b>
 *
 * @return true if there is a payload available at this position that can be loaded
 */
// TODO: Remove warning after API has been finalized
public boolean isPayloadAvailable();
}

View File

@ -17,16 +17,16 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.SmallFloat;
import java.io.IOException;
import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;
/** Expert: Scoring API.
* <p>Subclasses implement search scoring.
*
@ -503,4 +503,28 @@ public abstract class Similarity implements Serializable {
* @return a score factor based on term overlap with the query
*/
public abstract float coord(int overlap, int maxOverlap);
/**
 * Calculate a scoring factor based on the data in the payload. Overriding implementations
 * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
 * what is in the byte array.
 * <p>
 * The default implementation returns 1.
 * <p>
 * <b>
 * Warning: The status of the Payloads feature is experimental. The APIs
 * introduced here might change in the future and will not be supported anymore
 * in such a case. If you want to use this feature in a production environment
 * you should wait for an official release.
 * </b>
 *
 * @param payload The payload byte array to be scored
 * @param offset The offset into the payload array (ignored by the default implementation)
 * @param length The length in the array (ignored by the default implementation)
 * @return An implementation dependent float to be used as a scoring factor
 */
// TODO: Remove warning after API has been finalized
public float scorePayload(byte [] payload, int offset, int length)
{
// Default: neutral factor; the payload bytes are not inspected
return 1;
}
}

View File

@ -0,0 +1,153 @@
package org.apache.lucene.search.payloads;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.SpanScorer;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.TermSpans;
import java.io.IOException;
/**
* Copyright 2004 The Apache Software Foundation
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* The BoostingTermQuery is very similar to the {@link org.apache.lucene.search.spans.SpanTermQuery} except
* that it factors in the value of the payload located at each of the positions where the
* {@link org.apache.lucene.index.Term} occurs.
* <p>
* In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
* which returns 1 by default.
*
*
* @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
*/
public class BoostingTermQuery extends SpanTermQuery {

  /**
   * Creates a query over the positions of the given term.
   *
   * @param term the term whose occurrences (and their payloads) are scored
   */
  public BoostingTermQuery(Term term) {
    super(term);
  }

  /**
   * Builds the weight that produces payload-aware scorers for this query.
   *
   * @param searcher the searcher the weight is computed against
   * @return a new {@link BoostingTermWeight}
   * @throws IOException if the underlying weight construction fails
   */
  protected Weight createWeight(Searcher searcher) throws IOException {
    return new BoostingTermWeight(this, searcher);
  }

  /**
   * A {@link SpanWeight} whose scorer multiplies the span score by a payload factor.
   */
  private class BoostingTermWeight extends SpanWeight implements Weight {

    public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException {
      super(query, searcher);
    }

    /**
     * Creates the scorer over the term's spans for the given reader.
     * The cast relies on {@link SpanTermQuery#getSpans(IndexReader)} returning a {@link TermSpans}.
     */
    public Scorer scorer(IndexReader reader) throws IOException {
      return new BoostingSpanScorer((TermSpans) query.getSpans(reader), this, similarity,
              reader.norms(query.getField()));
    }

    /**
     * Span scorer that loads the payload after every successful advance and folds
     * {@link Similarity#scorePayload(byte[], int, int)} into the score.
     */
    class BoostingSpanScorer extends SpanScorer {

      //TODO: is this the best way to allocate this?
      byte[] payload = new byte[256];
      private TermPositions positions;

      public BoostingSpanScorer(TermSpans spans, Weight weight,
                                Similarity similarity, byte[] norms) throws IOException {
        super(spans, weight, similarity, norms);
        positions = spans.getPositions();
      }

      /** Advances the scorer and, on success, loads the payload at the new position. */
      public boolean next() throws IOException {
        boolean result = super.next();
        //set the payload. super.next() properly increments the term positions
        if (result) {
          loadPayload();
        }
        return result;
      }

      /** Skips to <code>target</code> and, on success, loads the payload at the new position. */
      public boolean skipTo(int target) throws IOException {
        boolean result = super.skipTo(target);
        if (result) {
          loadPayload();
        }
        return result;
      }

      /**
       * Reads the payload into the buffer when one is available. The reference returned by
       * getPayload is kept, so the buffer field always points at the array that was filled.
       * When no payload is available the buffer is left untouched; payloadFactor() only
       * consults it when the reported payload length is positive.
       */
      private void loadPayload() throws IOException {
        if (positions.isPayloadAvailable()) {
          payload = positions.getPayload(payload, 0);
        }
      }

      /**
       * Computes the payload contribution used by both {@link #score()} and
       * {@link #explain(int)} so that the two can never disagree.
       *
       * @return the similarity's payload score, or 1 when the current position has no payload
       */
      private float payloadFactor() throws IOException {
        int payLength = positions.getPayloadLength();
        return payLength > 0 ? getSimilarity().scorePayload(payload, 0, payLength) : 1;
      }

      /** Returns the span score multiplied by the payload factor. */
      public float score() throws IOException {
        return super.score() * payloadFactor();
      }

      /**
       * Explains the score as the product of the plain span explanation and the payload factor.
       * The payload factor is computed exactly as in {@link #score()}: positions without a
       * payload contribute 1 instead of calling scorePayload with a zero length.
       */
      public Explanation explain(final int doc) throws IOException {
        Explanation result = new Explanation();
        Explanation nonPayloadExpl = super.explain(doc);
        result.addDetail(nonPayloadExpl);
        //QUESTION: Is there a way to avoid an explicit skipTo call here? We need to know whether
        //the payload for this doc has been loaded.
        //NOTE(review): presumably super.explain(doc) positions the scorer through skipTo, which
        //loads the payload -- confirm against SpanScorer.
        Explanation payloadBoost = new Explanation();
        result.addDetail(payloadBoost);
        float payloadScore = payloadFactor();
        payloadBoost.setValue(payloadScore);
        //GSI: I suppose we could toString the payload, but I don't think that would be a good idea
        payloadBoost.setDescription("scorePayload(...)");
        result.setValue(nonPayloadExpl.getValue() * payloadScore);
        result.setDescription("btq");
        return result;
      }
    }
  }

  /**
   * Two BoostingTermQuery instances are equal when they have the same boost and the same term.
   * NOTE(review): no hashCode is defined here; this assumes the inherited hashCode is derived
   * from the boost and the term -- confirm against SpanTermQuery.
   */
  public boolean equals(Object o) {
    if (!(o instanceof BoostingTermQuery))
      return false;
    BoostingTermQuery other = (BoostingTermQuery) o;
    return (this.getBoost() == other.getBoost())
            && this.term.equals(other.term);
  }
}

View File

@ -0,0 +1,36 @@
<HTML>
<!--
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
--><HEAD>
<TITLE>org.apache.lucene.search.payloads</TITLE>
</HEAD>
<BODY>
<DIV>The payloads package provides Query mechanisms for finding and using payloads.
The following Query implementations are provided:
</DIV>
<div>
<ol>
<li><a href="./BoostingTermQuery.html">BoostingTermQuery</a> -- Boost a term's score based on the value of the payload located at that term</li>
</ol>
</div>
<DIV>&nbsp;</DIV>
<DIV align="center">
</DIV>
</BODY>
</HTML>

View File

@ -17,27 +17,29 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;
import java.io.IOException;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Similarity;
/**
* Public for extension only.
*/
public class SpanScorer extends Scorer {
protected Spans spans;
protected Weight weight;
protected byte[] norms;
protected float value;
protected boolean firstTime = true;
protected boolean more = true;
class SpanScorer extends Scorer {
private Spans spans;
private Weight weight;
private byte[] norms;
private float value;
protected int doc;
protected float freq;
private boolean firstTime = true;
private boolean more = true;
private int doc;
private float freq;
SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
throws IOException {
super(similarity);
this.spans = spans;

View File

@ -17,20 +17,18 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Set;
/** Matches spans containing a term. */
public class SpanTermQuery extends SpanQuery {
private Term term;
protected Term term;
/** Construct a SpanTermQuery matching the named term's spans. */
public SpanTermQuery(Term term) { this.term = term; }
@ -78,60 +76,7 @@ public class SpanTermQuery extends SpanQuery {
}
public Spans getSpans(final IndexReader reader) throws IOException {
return new Spans() {
private TermPositions positions = reader.termPositions(term);
private int doc = -1;
private int freq;
private int count;
private int position;
public boolean next() throws IOException {
if (count == freq) {
if (!positions.next()) {
doc = Integer.MAX_VALUE;
return false;
}
doc = positions.doc();
freq = positions.freq();
count = 0;
}
position = positions.nextPosition();
count++;
return true;
}
public boolean skipTo(int target) throws IOException {
// are we already at the correct position?
if (doc >= target) {
return true;
}
if (!positions.skipTo(target)) {
doc = Integer.MAX_VALUE;
return false;
}
doc = positions.doc();
freq = positions.freq();
count = 0;
position = positions.nextPosition();
count++;
return true;
}
public int doc() { return doc; }
public int start() { return position; }
public int end() { return position + 1; }
public String toString() {
return "spans(" + SpanTermQuery.this.toString() + ")@"+
(doc==-1?"START":(doc==Integer.MAX_VALUE)?"END":doc+"-"+position);
}
};
return new TermSpans(reader.termPositions(term), term);
}
}

View File

@ -17,32 +17,27 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
/**
* Expert-only. Public for use by other weight implementations
*/
public class SpanWeight implements Weight {
protected Similarity similarity;
protected float value;
protected float idf;
protected float queryNorm;
protected float queryWeight;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Similarity;
class SpanWeight implements Weight {
private Similarity similarity;
private float value;
private float idf;
private float queryNorm;
private float queryWeight;
private Set terms;
private SpanQuery query;
protected Set terms;
protected SpanQuery query;
public SpanWeight(SpanQuery query, Searcher searcher)
throws IOException {

View File

@ -0,0 +1,101 @@
package org.apache.lucene.search.spans;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import java.io.IOException;
/**
* Expert:
* Public for extension only
*/
public class TermSpans implements Spans {

  /** Source of documents and positions for {@link #term}. */
  protected TermPositions positions;
  /** The term being enumerated; used only for {@link #toString()}. */
  protected Term term;
  /** Current document: -1 before the first advance, Integer.MAX_VALUE once exhausted. */
  protected int doc;
  /** Number of occurrences of the term in the current document. */
  protected int freq;
  /** Number of positions already consumed in the current document. */
  protected int count;
  /** Most recently read position within the current document. */
  protected int position;

  public TermSpans(TermPositions positions, Term term) throws IOException {
    this.positions = positions;
    this.term = term;
    doc = -1;
  }

  /**
   * Moves to the next occurrence of the term, stepping to the next document once all
   * positions of the current one have been consumed.
   *
   * @return false when the underlying enumeration has no further document
   */
  public boolean next() throws IOException {
    boolean currentDocConsumed = (count == freq);
    if (currentDocConsumed) {
      if (!positions.next()) {
        return markExhausted();
      }
      enterCurrentDocument();
    }
    readNextPosition();
    return true;
  }

  /**
   * Moves to the first occurrence in a document whose number is at least <code>target</code>.
   * NOTE(review): when the current document already satisfies the target, this returns true
   * without advancing -- including after exhaustion, when doc is Integer.MAX_VALUE. Confirm
   * that callers rely on this before changing it.
   *
   * @return false when the underlying enumeration cannot reach such a document
   */
  public boolean skipTo(int target) throws IOException {
    // already on (or past) the requested document: nothing to do
    if (target <= doc) {
      return true;
    }
    if (!positions.skipTo(target)) {
      return markExhausted();
    }
    enterCurrentDocument();
    readNextPosition();
    return true;
  }

  /** Records the end of the enumeration; always returns false for use in return statements. */
  private boolean markExhausted() {
    doc = Integer.MAX_VALUE;
    return false;
  }

  /** Captures document number and frequency of the document the enumeration now points at. */
  private void enterCurrentDocument() throws IOException {
    doc = positions.doc();
    freq = positions.freq();
    count = 0;
  }

  /** Reads one position from the enumeration and counts it as consumed. */
  private void readNextPosition() throws IOException {
    position = positions.nextPosition();
    count++;
  }

  public int doc() {
    return doc;
  }

  public int start() {
    return position;
  }

  /** A term span covers exactly one position, so the end is one past the start. */
  public int end() {
    return position + 1;
  }

  public String toString() {
    String where;
    if (doc == -1) {
      where = "START";
    } else if (doc == Integer.MAX_VALUE) {
      where = "END";
    } else {
      where = doc + "-" + position;
    }
    return "spans(" + term.toString() + ")@" + where;
  }

  /** Exposes the underlying enumeration, e.g. so that callers can read payloads. */
  public TermPositions getPositions() {
    return positions;
  }
}

View File

@ -0,0 +1,188 @@
package org.apache.lucene.search.payloads;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.English;
import java.io.IOException;
import java.io.Reader;
/**
 * Tests {@link BoostingTermQuery} against an index whose tokens each carry a 4-byte payload
 * holding the hash code of the token text. All other similarity factors are forced to 1, so
 * a document's score is exactly the decoded payload.
 */
public class TestBoostingTermQuery extends TestCase {
  private IndexSearcher searcher;
  private BoostingSimilarity similarity = new BoostingSimilarity();

  public TestBoostingTermQuery(String s) {
    super(s);
  }

  /** Lower-cases the input and attaches a payload to every token. */
  private class PayloadAnalyzer extends Analyzer {
    public TokenStream tokenStream(String fieldName, Reader reader) {
      TokenStream result = new LowerCaseTokenizer(reader);
      result = new PayloadFilter(result);
      return result;
    }
  }

  /** Sets each token's payload to the big-endian encoding of its term text's hash code. */
  private class PayloadFilter extends TokenFilter {
    public PayloadFilter(TokenStream input) {
      super(input);
    }

    public Token next() throws IOException {
      Token result = input.next();
      if (result != null) {
        result.setPayload(new Payload(encodePayload(result.termText()), 0, 4));
      }
      return result;
    }
  }

  /** Indexes the English spellings of 0..999, one document each, into a RAMDirectory. */
  protected void setUp() throws IOException {
    RAMDirectory directory = new RAMDirectory();
    PayloadAnalyzer analyzer = new PayloadAnalyzer();
    IndexWriter writer
            = new IndexWriter(directory, analyzer, true);
    writer.setSimilarity(similarity);
    //writer.infoStream = System.out;
    for (int i = 0; i < 1000; i++) {
      Document doc = new Document();
      doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
      writer.addDocument(doc);
    }
    //writer.optimize();
    writer.close();

    searcher = new IndexSearcher(directory);
    searcher.setSimilarity(similarity);
  }

  /** Encodes the hash code of the given string as 4 big-endian bytes. */
  private byte[] encodePayload(String englishInt)
  {
    int i = englishInt.hashCode();
    byte[] bytes = new byte[4];
    bytes[0] = (byte) (i >>> 24);
    bytes[1] = (byte) (i >>> 16);
    bytes[2] = (byte) (i >>> 8);
    bytes[3] = (byte) i;
    return bytes;
  }

  /**
   * Decodes the first 4 bytes of the payload as a big-endian int; the inverse of
   * {@link #encodePayload(String)}.
   *
   * @param payload the payload bytes; at least 4 bytes are read
   * @param size    unused; kept so existing callers compile unchanged
   * @return the hash code of the String representing the English int from English.intToEnglish
   */
  private int decodePayload(byte[] payload, int size)
  {
    // Each byte must be masked: a Java byte is signed, so without "& 0xFF" any byte >= 0x80
    // is sign-extended when promoted to int and overwrites the higher-order bits.
    int result = ((payload[0] & 0xFF) << 24)
            | ((payload[1] & 0xFF) << 16)
            | ((payload[2] & 0xFF) << 8)
            | (payload[3] & 0xFF);
    return result;
  }

  protected void tearDown() {
  }

  public void test() throws IOException {
    BoostingTermQuery query = new BoostingTermQuery(new Term("field", "seventy"));
    TopDocs hits = searcher.search(query, null, 100);
    assertTrue("hits is null and it shouldn't be", hits != null);
    assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);

    //they should all have the exact same score, because they all contain seventy once, and we set
    //all the other similarity factors to be 1.
    //That score is the decoded payload, i.e. the hash code of "seventy".
    int seventyHash = "seventy".hashCode();
    assertTrue("score " + hits.getMaxScore() + " does not equal 'seventy' hashcode: " + seventyHash, hits.getMaxScore() == seventyHash);
    for (int i = 0; i < hits.scoreDocs.length; i++) {
      ScoreDoc doc = hits.scoreDocs[i];
      assertTrue("score " + doc.score + " does not equal 'seventy' hashcode: " + seventyHash, doc.score == seventyHash);
    }
    CheckHits.checkExplanations(query, "field", searcher);
    Spans spans = query.getSpans(searcher.getIndexReader());
    assertTrue("spans is null and it shouldn't be", spans != null);
    assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
  }

  public void testNoMatch() throws Exception {
    BoostingTermQuery query = new BoostingTermQuery(new Term("field", "junk"));
    TopDocs hits = searcher.search(query, null, 100);
    assertTrue("hits is null and it shouldn't be", hits != null);
    assertTrue("hits Size: " + hits.totalHits + " is not: " + 0, hits.totalHits == 0);
  }

  /** Similarity that scores only the payload; every other factor is pinned to 1. */
  class BoostingSimilarity extends DefaultSimilarity
  {
    // TODO: Remove warning after API has been finalized
    public float scorePayload(byte[] payload, int offset, int length) {
      //we know it is size 4 here, so ignore the offset/length
      return decodePayload(payload,4);
    }

    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    //Make everything else 1 so we see the effect of the payload
    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    public float lengthNorm(String fieldName, int numTerms) {
      return 1;
    }

    public float queryNorm(float sumOfSquaredWeights) {
      return 1;
    }

    public float sloppyFreq(int distance) {
      return 1;
    }

    public float coord(int overlap, int maxOverlap) {
      return 1;
    }

    public float idf(int docFreq, int numDocs) {
      return 1;
    }

    public float tf(float freq) {
      return 1;
    }
  }
}