LUCENE-834:

Added documentation to Similarity class

Changed BoostingTermQuery to score all occurrences of the term in a document, by making the underlying setFreqCurrentDoc() method protected so that it can be overridden in BTQ.  Updated the tests to use multiple terms per document with different payloads, so that the payload scores can be averaged.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@531913 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2007-04-24 12:49:42 +00:00
parent e06188c0d6
commit a181157bfd
4 changed files with 120 additions and 43 deletions
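
To illustrate the change, here is a minimal usage sketch (not part of the commit): a Similarity that overrides scorePayload(byte[], int, int) and a BoostingTermQuery whose per-document score is multiplied by the average payload score across all occurrences of the term. The class, field name, and index setup are illustrative; the sketch assumes the Lucene 2.x-era APIs that appear in the diff below (RAMDirectory, IndexWriter(Directory, Analyzer, boolean), Payload, DefaultSimilarity) and assumes BoostingTermQuery lives in org.apache.lucene.search.payloads.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.payloads.BoostingTermQuery;
import org.apache.lucene.store.RAMDirectory;

/** Sketch: each token carries a one-byte payload; BoostingTermQuery multiplies the
 *  span score by the average scorePayload(...) value over the term's occurrences. */
public class AveragedPayloadSketch {

  public static void main(String[] args) throws IOException {
    RAMDirectory dir = new RAMDirectory();

    // Analyzer that attaches alternating payload bytes 2 and 4, like the updated test below.
    Analyzer analyzer = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(new LowerCaseTokenizer(reader)) {
          private int numSeen = 0;
          public Token next() throws IOException {
            Token token = input.next();
            if (token != null) {
              byte b = (byte) (numSeen++ % 2 == 0 ? 2 : 4);
              token.setPayload(new Payload(new byte[]{b}));
            }
            return token;
          }
        };
      }
    };

    IndexWriter writer = new IndexWriter(dir, analyzer, true);
    Document doc = new Document();
    doc.add(new Field("body", "seventy seventy", Field.Store.YES, Field.Index.TOKENIZED));
    writer.addDocument(doc);
    writer.close();

    IndexSearcher searcher = new IndexSearcher(dir);
    // Read the single payload byte as the per-occurrence scoring factor.
    searcher.setSimilarity(new DefaultSimilarity() {
      public float scorePayload(byte[] payload, int offset, int length) {
        return payload[offset];
      }
    });

    // The two occurrences carry payloads 2 and 4, so the payload factor is their
    // average (3), folded into the normal span score for the document.
    TopDocs hits = searcher.search(new BoostingTermQuery(new Term("body", "seventy")), null, 10);
    System.out.println("top score: " + hits.scoreDocs[0].score);
  }
}
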

View File

@@ -513,6 +513,8 @@ public abstract class Similarity implements Serializable {
* The default implementation returns 1.
*
* @param payload The payload byte array to be scored
* @param offset The offset into the payload array
* @param length The length (in bytes) of the payload data in the array
* @return An implementation dependent float to be used as a scoring factor
* <b>
* Warning: The status of the Payloads feature is experimental. The APIs

View File

@@ -34,6 +34,8 @@ import java.io.IOException;
* <p>
* In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
* which returns 1 by default.
* <p>
* Payload scores are averaged across term occurrences in the document.
*
*
* @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
@@ -50,7 +52,7 @@ public class BoostingTermQuery extends SpanTermQuery{
return new BoostingTermWeight(this, searcher);
}
private class BoostingTermWeight extends SpanWeight implements Weight {
protected class BoostingTermWeight extends SpanWeight implements Weight {
public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException {
@@ -70,7 +72,8 @@ public class BoostingTermQuery extends SpanTermQuery{
//TODO: is this the best way to allocate this?
byte[] payload = new byte[256];
private TermPositions positions;
protected float payloadScore;
private int payloadsSeen;
public BoostingSpanScorer(TermSpans spans, Weight weight,
Similarity similarity, byte[] norms) throws IOException {
@@ -79,12 +82,17 @@ public class BoostingTermQuery extends SpanTermQuery{
}
public boolean next() throws IOException {
/**
* Go to the next document
*
*/
/*public boolean next() throws IOException {
boolean result = super.next();
//set the payload. super.next() properly increments the term positions
if (result) {
loadPayload();
//Load the payloads for all
processPayload();
}
return result;
@@ -94,15 +102,38 @@ public class BoostingTermQuery extends SpanTermQuery{
boolean result = super.skipTo(target);
if (result) {
loadPayload();
processPayload();
}
return result;
}*/
protected boolean setFreqCurrentDoc() throws IOException {
if (!more) {
return false;
}
doc = spans.doc();
freq = 0.0f;
payloadScore = 0;
payloadsSeen = 0;
Similarity similarity1 = getSimilarity();
while (more && doc == spans.doc()) {
int matchLength = spans.end() - spans.start();
freq += similarity1.sloppyFreq(matchLength);
processPayload(similarity1);
more = spans.next();//this moves positions to the next match in this document
}
return more || (freq != 0);
}
private void loadPayload() throws IOException {
protected void processPayload(Similarity similarity) throws IOException {
if (positions.isPayloadAvailable()) {
payload = positions.getPayload(payload, 0);
payloadScore += similarity.scorePayload(payload, 0, positions.getPayloadLength());
payloadsSeen++;
} else {
//zero out the payload?
@@ -112,8 +143,7 @@ public class BoostingTermQuery extends SpanTermQuery{
public float score() throws IOException {
int payLength = positions.getPayloadLength();
return super.score() * (payLength > 0 ? getSimilarity().scorePayload(payload, 0, payLength) : 1);
return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
}
@@ -127,14 +157,15 @@ public class BoostingTermQuery extends SpanTermQuery{
result.addDetail(payloadBoost);
/*
if (skipTo(doc) == true) {
loadPayload();
processPayload();
}
*/
float payloadScore = getSimilarity().scorePayload(payload, 0, positions.getPayloadLength());
payloadBoost.setValue(payloadScore);
float avgPayloadScore = payloadScore / payloadsSeen;
payloadBoost.setValue(avgPayloadScore);
//GSI: I suppose we could toString the payload, but I don't think that would be a good idea
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * payloadScore);
result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
result.setDescription("btq");
return result;
}
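
For concreteness, this is the arithmetic the new setFreqCurrentDoc()/score() pair performs, using the payload values from the updated test (scorePayload returns the payload byte, and all other similarity factors are set to 1 by the test's BoostingSimilarity):

    // one occurrence, payload byte 2:    payloadScore = 2, payloadsSeen = 1  ->  score() = super.score() * 2
    // two occurrences, payloads 2 and 4: payloadScore = 6, payloadsSeen = 2  ->  score() = super.score() * 3
    // no payloads seen:                  payloadsSeen = 0  ->  score() = super.score() * 1 (unchanged)

Before this change only a single occurrence's payload contributed to a document's score, so additional matches of the term were not reflected in the payload factor.
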

View File

@@ -71,7 +71,7 @@ public class SpanScorer extends Scorer {
return setFreqCurrentDoc();
}
private boolean setFreqCurrentDoc() throws IOException {
protected boolean setFreqCurrentDoc() throws IOException {
if (! more) {
return false;
}

View File

@@ -35,6 +35,9 @@ import java.io.Reader;
public class TestBoostingTermQuery extends TestCase {
private IndexSearcher searcher;
private BoostingSimilarity similarity = new BoostingSimilarity();
private byte[] payloadField = new byte[]{1};
private byte[] payloadMultiField1 = new byte[]{2};
private byte[] payloadMultiField2 = new byte[]{4};
public TestBoostingTermQuery(String s) {
super(s);
@@ -43,24 +46,43 @@ public class TestBoostingTermQuery extends TestCase {
private class PayloadAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new LowerCaseTokenizer(reader);
result = new PayloadFilter(result);
result = new PayloadFilter(result, fieldName);
return result;
}
}
private class PayloadFilter extends TokenFilter {
String fieldName;
int numSeen = 0;
public PayloadFilter(TokenStream input) {
public PayloadFilter(TokenStream input, String fieldName) {
super(input);
this.fieldName = fieldName;
}
public Token next() throws IOException {
Token result = input.next();
if (result != null) {
result.setPayload(new Payload(encodePayload(result.termText()), 0, 4));
if (fieldName.equals("field"))
{
result.setPayload(new Payload(payloadField));
}
else if (fieldName.equals("multiField"))
{
if (numSeen % 2 == 0)
{
result.setPayload(new Payload(payloadMultiField1));
}
else
{
result.setPayload(new Payload(payloadMultiField2));
}
numSeen++;
}
}
return result;
}
@@ -76,6 +98,7 @@ public class TestBoostingTermQuery extends TestCase {
for (int i = 0; i < 1000; i++) {
Document doc = new Document();
doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
doc.add(new Field("multiField", English.intToEnglish(i) + " " + English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
writer.addDocument(doc);
}
//writer.optimize();
@@ -85,29 +108,9 @@ public class TestBoostingTermQuery extends TestCase {
searcher.setSimilarity(similarity);
}
private byte[] encodePayload(String englishInt)
{
int i = englishInt.hashCode();
byte[] bytes = new byte[4];
bytes[0] = (byte) (i >>> 24);
bytes[1] = (byte) (i >>> 16);
bytes[2] = (byte) (i >>> 8);
bytes[3] = (byte) i;
return bytes;
}
private int decodePayload(byte[] payload, int size)
{
//This should be equal to the hash code of the String representing the English int from English.intToEnglish
int result = (payload[0] << 24) | (payload[1] << 16) | (payload[2] << 8) | (payload[3]);
/*assertEquals((byte) (size >>> 24), payload[0]);
assertEquals((byte) (size >>> 16), payload[1]);
assertEquals((byte) (size >>> 8), payload[2]);
assertEquals((byte) size, payload[3]);*/
return result;
}
protected void tearDown() {
@@ -121,12 +124,11 @@ public class TestBoostingTermQuery extends TestCase {
//they should all have the exact same score, because they all contain seventy once, and we set
//all the other similarity factors to be 1
//This score should be 1, since we normalize scores
int seventyHash = "seventy".hashCode();
assertTrue("score " + hits.getMaxScore() + " does not equal 'seventy' hashcode: " + seventyHash, hits.getMaxScore() == seventyHash);
assertTrue(hits.getMaxScore() + " does not equal: " + 1, hits.getMaxScore() == 1);
for (int i = 0; i < hits.scoreDocs.length; i++) {
ScoreDoc doc = hits.scoreDocs[i];
assertTrue("score " + doc.score + " does not equal 'seventy' hashcode: " + seventyHash, doc.score == seventyHash);
assertTrue(doc.score + " does not equal: " + 1, doc.score == 1);
}
CheckHits.checkExplanations(query, "field", searcher);
Spans spans = query.getSpans(searcher.getIndexReader());
@@ -140,6 +142,48 @@ public class TestBoostingTermQuery extends TestCase {
}
public void testMultipleMatchesPerDoc() throws Exception {
BoostingTermQuery query = new BoostingTermQuery(new Term("multiField", "seventy"));
TopDocs hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
//every hit contains seventy twice in multiField, and all the other similarity factors are set to 1,
//so each score is the average of the two payload bytes seen for seventy (2 and 4, or 2 and 2)
//System.out.println("Hash: " + seventyHash + " Twice Hash: " + 2*seventyHash);
assertTrue(hits.getMaxScore() + " does not equal: " + 3, hits.getMaxScore() == 3);
//there should be exactly 10 items that score a 3, all the rest should score a 2
//The 10 items are: 70 + i*100 where i in [0-9]
int numTens = 0;
for (int i = 0; i < hits.scoreDocs.length; i++) {
ScoreDoc doc = hits.scoreDocs[i];
if (doc.doc % 10 == 0)
{
numTens++;
assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
}
else
{
assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
}
}
assertTrue(numTens + " does not equal: " + 10, numTens == 10);
CheckHits.checkExplanations(query, "field", searcher);
Spans spans = query.getSpans(searcher.getIndexReader());
assertTrue("spans is null and it shouldn't be", spans != null);
assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
//should be two matches per document
int count = 0;
//100 hits times 2 matches per hit, we should have 200 in count
while (spans.next())
{
count++;
}
assertTrue(count + " does not equal: " + 200, count == 200);
}
public void testNoMatch() throws Exception {
BoostingTermQuery query = new BoostingTermQuery(new Term("field", "junk"));
TopDocs hits = searcher.search(query, null, 100);
@@ -155,7 +199,7 @@ public class TestBoostingTermQuery extends TestCase {
// TODO: Remove warning after API has been finalized
public float scorePayload(byte[] payload, int offset, int length) {
//the test payloads are a single byte here, so ignore the offset/length
return decodePayload(payload,4);
return payload[0];
}
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
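
As a follow-up to the simplified BoostingSimilarity above: because scorePayload(byte[], int, int) receives the raw array plus an offset and length, an implementation can decode richer payloads than a single byte. A hedged sketch (not part of the commit) that reads a 4-byte big-endian int, the same layout the removed encodePayload()/decodePayload() test helpers used; DefaultSimilarity is assumed as the base class:

import org.apache.lucene.search.DefaultSimilarity;

// Sketch: treat the payload as a 4-byte big-endian int and use it as the scoring factor.
public class IntPayloadSimilarity extends DefaultSimilarity {
  public float scorePayload(byte[] payload, int offset, int length) {
    if (payload == null || length < 4) {
      return 1; // fall back to the documented default when no usable payload is present
    }
    // Mask each byte to avoid sign extension while reassembling the int.
    int value = ((payload[offset] & 0xFF) << 24)
              | ((payload[offset + 1] & 0xFF) << 16)
              | ((payload[offset + 2] & 0xFF) << 8)
              |  (payload[offset + 3] & 0xFF);
    return value;
  }
}

With this commit, whatever value such an implementation returns is averaged over all occurrences of the term in the document before being multiplied into the span score.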