mirror of https://github.com/apache/lucene.git
LUCENE-834:
Added documentation to Similarity class Changed BoostingTermQuery to score all occurrences of the term in a document by making the underlying setFreqCurrentDoc() method protected so that it can be overridden in BTQ. Updated the tests to have multiple terms per document with different payloads, so that it can be averaged. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@531913 13f79535-47bb-0310-9956-ffa450edef68
parent e06188c0d6
commit a181157bfd
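At a glance, the reworked scoring multiplies the span score by the average of scorePayload() over every occurrence of the term in the document. A minimal usage sketch in the spirit of TestBoostingTermQuery further down; it assumes `searcher` is an IndexSearcher over a payload-bearing index and is not code from this commit:

    // Usage sketch, mirroring TestBoostingTermQuery; not part of the commit.
    // Assumes `searcher` wraps an index whose tokens carry payloads.
    BoostingTermQuery query = new BoostingTermQuery(new Term("multiField", "seventy"));
    searcher.setSimilarity(new BoostingSimilarity()); // scorePayload() supplies the factor
    TopDocs hits = searcher.search(query, null, 100);
    // per document: score = spanScore * (payloadScore / payloadsSeen)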
Similarity.java
@@ -513,6 +513,8 @@ public abstract class Similarity implements Serializable {
    * The default implementation returns 1.
    *
    * @param payload The payload byte array to be scored
+   * @param offset The offset into the payload array
+   * @param length The length in the array
    * @return An implementation dependent float to be used as a scoring factor
    * <b>
    * Warning: The status of the Payloads feature is experimental. The APIs
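Since scorePayload() returns 1 unless overridden, enabling payload scoring is a one-method override. A minimal sketch, assuming the first payload byte encodes the boost; the class name is hypothetical, and DefaultSimilarity supplies the remaining scoring factors (this follows the BoostingSimilarity pattern in the test changes below):

    import org.apache.lucene.search.DefaultSimilarity;

    // Hypothetical example; not part of this commit.
    public class FirstBytePayloadSimilarity extends DefaultSimilarity {
      public float scorePayload(byte[] payload, int offset, int length) {
        // Treat the first payload byte as the scoring factor; returning 1 is neutral.
        return payload[offset];
      }
    }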
BoostingTermQuery.java
@@ -34,6 +34,8 @@ import java.io.IOException;
  * <p>
  * In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
  * which returns 1 by default.
+ * <p>
+ * Payload scores are averaged across term occurrences in the document.
  *
  *
  * @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
@@ -50,7 +52,7 @@ public class BoostingTermQuery extends SpanTermQuery{
     return new BoostingTermWeight(this, searcher);
   }

-  private class BoostingTermWeight extends SpanWeight implements Weight {
+  protected class BoostingTermWeight extends SpanWeight implements Weight {


     public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException {
@@ -70,7 +72,8 @@ public class BoostingTermQuery extends SpanTermQuery{
      //TODO: is this the best way to allocate this?
      byte[] payload = new byte[256];
      private TermPositions positions;
+     protected float payloadScore;
+     private int payloadsSeen;

      public BoostingSpanScorer(TermSpans spans, Weight weight,
                                Similarity similarity, byte[] norms) throws IOException {
@@ -79,12 +82,17 @@ public class BoostingTermQuery extends SpanTermQuery{

      }

-     public boolean next() throws IOException {
+     /**
+      * Go to the next document
+      *
+      */
+     /*public boolean next() throws IOException {

        boolean result = super.next();
        //set the payload. super.next() properly increments the term positions
        if (result) {
-         loadPayload();
+         //Load the payloads for all
+         processPayload();
        }

        return result;
@@ -94,15 +102,38 @@ public class BoostingTermQuery extends SpanTermQuery{
        boolean result = super.skipTo(target);

        if (result) {
-         loadPayload();
+         processPayload();
        }

        return result;
+     }*/
+
+     protected boolean setFreqCurrentDoc() throws IOException {
+       if (!more) {
+         return false;
+       }
+       doc = spans.doc();
+       freq = 0.0f;
+       payloadScore = 0;
+       payloadsSeen = 0;
+       Similarity similarity1 = getSimilarity();
+       while (more && doc == spans.doc()) {
+         int matchLength = spans.end() - spans.start();
+
+         freq += similarity1.sloppyFreq(matchLength);
+         processPayload(similarity1);
+
+         more = spans.next();//this moves positions to the next match in this document
+       }
+       return more || (freq != 0);
      }

-     private void loadPayload() throws IOException {
+
+     protected void processPayload(Similarity similarity) throws IOException {
        if (positions.isPayloadAvailable()) {
          payload = positions.getPayload(payload, 0);
+         payloadScore += similarity.scorePayload(payload, 0, positions.getPayloadLength());
+         payloadsSeen++;
+
        } else {
          //zero out the payload?
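To make the new loop concrete: for a document where the term matches twice with single-byte payloads 2 and 4, and a similarity whose sloppyFreq() returns 1 (as the test similarity below arranges), the accumulators evolve as follows. The numbers come from the test data, not from the commit itself:

    // Worked trace of setFreqCurrentDoc() for one document, two matches,
    // payloads {2} and {4}, sloppyFreq() == 1:
    //   match 1: freq = 1.0, payloadScore = 2, payloadsSeen = 1
    //   match 2: freq = 2.0, payloadScore = 6, payloadsSeen = 2
    // score() then returns super.score() * (6 / 2) = spanScore * 3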
@@ -112,8 +143,7 @@ public class BoostingTermQuery extends SpanTermQuery{

      public float score() throws IOException {

-       int payLength = positions.getPayloadLength();
-       return super.score() * (payLength > 0 ? getSimilarity().scorePayload(payload, 0, payLength) : 1);
+       return super.score() * (payloadsSeen > 0 ? (payloadScore / payloadsSeen) : 1);
      }

@@ -127,14 +157,15 @@ public class BoostingTermQuery extends SpanTermQuery{
        result.addDetail(payloadBoost);
        /*
        if (skipTo(doc) == true) {
-         loadPayload();
+         processPayload();
        }
        */
-       float payloadScore = getSimilarity().scorePayload(payload, 0, positions.getPayloadLength());
-       payloadBoost.setValue(payloadScore);
+       float avgPayloadScore = payloadScore / payloadsSeen;
+       payloadBoost.setValue(avgPayloadScore);
        //GSI: I suppose we could toString the payload, but I don't think that would be a good idea
        payloadBoost.setDescription("scorePayload(...)");
-       result.setValue(nonPayloadExpl.getValue() * payloadScore);
+       result.setValue(nonPayloadExpl.getValue() * avgPayloadScore);
        result.setDescription("btq");
        return result;
      }
SpanScorer.java
@@ -71,7 +71,7 @@ public class SpanScorer extends Scorer {
     return setFreqCurrentDoc();
   }

-  private boolean setFreqCurrentDoc() throws IOException {
+  protected boolean setFreqCurrentDoc() throws IOException {
     if (! more) {
       return false;
     }
TestBoostingTermQuery.java
@@ -35,6 +35,9 @@ import java.io.Reader;
 public class TestBoostingTermQuery extends TestCase {
   private IndexSearcher searcher;
   private BoostingSimilarity similarity = new BoostingSimilarity();
+  private byte[] payloadField = new byte[]{1};
+  private byte[] payloadMultiField1 = new byte[]{2};
+  private byte[] payloadMultiField2 = new byte[]{4};

   public TestBoostingTermQuery(String s) {
     super(s);
@@ -43,24 +46,43 @@ public class TestBoostingTermQuery extends TestCase {
   private class PayloadAnalyzer extends Analyzer {



     public TokenStream tokenStream(String fieldName, Reader reader) {
       TokenStream result = new LowerCaseTokenizer(reader);
-      result = new PayloadFilter(result);
+      result = new PayloadFilter(result, fieldName);
       return result;
     }
   }

   private class PayloadFilter extends TokenFilter {
+    String fieldName;
+    int numSeen = 0;
+
-    public PayloadFilter(TokenStream input) {
+    public PayloadFilter(TokenStream input, String fieldName) {
       super(input);
+      this.fieldName = fieldName;
     }

     public Token next() throws IOException {
       Token result = input.next();
       if (result != null) {
-        result.setPayload(new Payload(encodePayload(result.termText()), 0, 4));
+        if (fieldName.equals("field"))
+        {
+          result.setPayload(new Payload(payloadField));
+        }
+        else if (fieldName.equals("multiField"))
+        {
+          if (numSeen % 2 == 0)
+          {
+            result.setPayload(new Payload(payloadMultiField1));
+          }
+          else
+          {
+            result.setPayload(new Payload(payloadMultiField2));
+          }
+          numSeen++;
+        }

       }
       return result;
     }
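The filter's effect is easiest to see on concrete input. For document 70 the multiField text is "seventy seventy", so the two tokens receive alternating payloads (a sketch of the resulting stream, not code from the commit):

    // multiField text for doc 70: "seventy seventy"
    //   token 0: "seventy", numSeen == 0 (even) -> payload {2}
    //   token 1: "seventy", numSeen == 1 (odd)  -> payload {4}
    // BoostingSimilarity.scorePayload() reads payload[0], so this doc averages (2 + 4) / 2 = 3.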
@@ -76,6 +98,7 @@ public class TestBoostingTermQuery extends TestCase {
     for (int i = 0; i < 1000; i++) {
       Document doc = new Document();
       doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
+      doc.add(new Field("multiField", English.intToEnglish(i) + " " + English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
       writer.addDocument(doc);
     }
     //writer.optimize();
@@ -85,29 +108,9 @@ public class TestBoostingTermQuery extends TestCase {
     searcher.setSimilarity(similarity);
   }

-  private byte[] encodePayload(String englishInt)
-  {
-    int i = englishInt.hashCode();
-    byte[] bytes = new byte[4];
-    bytes[0] = (byte) (i >>> 24);
-    bytes[1] = (byte) (i >>> 16);
-    bytes[2] = (byte) (i >>> 8);
-    bytes[3] = (byte) i;
-    return bytes;
-  }
-
-  private int decodePayload(byte[] payload, int size)
-  {
-    //This should be equal to the hash code of the String representing the English int from English.intToEnglish
-    int result = (payload[0] << 24) | (payload[1] << 16) | (payload[2] << 8) | (payload[3]);
-
-    /*assertEquals((byte) (size >>> 24), payload[0]);
-    assertEquals((byte) (size >>> 16), payload[1]);
-    assertEquals((byte) (size >>> 8), payload[2]);
-    assertEquals((byte) size, payload[3]);*/
-
-    return result;
-  }
-
   protected void tearDown() {
@@ -121,12 +124,11 @@ public class TestBoostingTermQuery extends TestCase {

     //they should all have the exact same score, because they all contain seventy once, and we set
     //all the other similarity factors to be 1
-    int seventyHash = "seventy".hashCode();
-    assertTrue("score " + hits.getMaxScore() + " does not equal 'seventy' hashcode: " + seventyHash, hits.getMaxScore() == seventyHash);
+    //This score should be 1, since we normalize scores
+    assertTrue(hits.getMaxScore() + " does not equal: " + 1, hits.getMaxScore() == 1);
     for (int i = 0; i < hits.scoreDocs.length; i++) {
       ScoreDoc doc = hits.scoreDocs[i];
-      assertTrue("score " + doc.score + " does not equal 'seventy' hashcode: " + seventyHash, doc.score == seventyHash);
+      assertTrue(doc.score + " does not equal: " + 1, doc.score == 1);
     }
     CheckHits.checkExplanations(query, "field", searcher);
     Spans spans = query.getSpans(searcher.getIndexReader());
@@ -140,6 +142,48 @@ public class TestBoostingTermQuery extends TestCase {

   }

+  public void testMultipleMatchesPerDoc() throws Exception {
+    BoostingTermQuery query = new BoostingTermQuery(new Term("multiField", "seventy"));
+    TopDocs hits = searcher.search(query, null, 100);
+    assertTrue("hits is null and it shouldn't be", hits != null);
+    assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
+
+
+    //they should all have the exact same score, because they all contain seventy once, and we set
+    //all the other similarity factors to be 1
+
+    //System.out.println("Hash: " + seventyHash + " Twice Hash: " + 2*seventyHash);
+    assertTrue(hits.getMaxScore() + " does not equal: " + 3, hits.getMaxScore() == 3);
+    //there should be exactly 10 items that score a 3, all the rest should score a 2
+    //The 10 items are: 70 + i*100 where i in [0-9]
+    int numTens = 0;
+    for (int i = 0; i < hits.scoreDocs.length; i++) {
+      ScoreDoc doc = hits.scoreDocs[i];
+      if (doc.doc % 10 == 0)
+      {
+        numTens++;
+        assertTrue(doc.score + " does not equal: " + 3, doc.score == 3);
+      }
+      else
+      {
+        assertTrue(doc.score + " does not equal: " + 2, doc.score == 2);
+      }
+    }
+    assertTrue(numTens + " does not equal: " + 10, numTens == 10);
+    CheckHits.checkExplanations(query, "field", searcher);
+    Spans spans = query.getSpans(searcher.getIndexReader());
+    assertTrue("spans is null and it shouldn't be", spans != null);
+    assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
+    //should be two matches per document
+    int count = 0;
+    //100 hits times 2 matches per hit, we should have 200 in count
+    while (spans.next())
+    {
+      count++;
+    }
+    assertTrue(count + " does not equal: " + 200, count == 200);
+  }
+
   public void testNoMatch() throws Exception {
     BoostingTermQuery query = new BoostingTermQuery(new Term("field", "junk"));
     TopDocs hits = searcher.search(query, null, 100);
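The 2-versus-3 split follows from the averaging arithmetic; the parity reading below is a gloss on the test data rather than anything stated in the commit. Each multiField document repeats English.intToEnglish(i), so "seventy" occurs twice, and the token positions of the two occurrences decide which payloads they receive:

    // doc 71: "seventy one seventy one" -> "seventy" at positions 0 and 2 (both even)
    //         payloads {2} and {2}      -> average (2 + 2) / 2 = 2
    // doc 70: "seventy seventy"         -> positions 0 and 1 (even, odd)
    //         payloads {2} and {4}      -> average (2 + 4) / 2 = 3
    // Only docs 70, 170, ..., 970 split parity this way, hence numTens == 10.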
@@ -155,7 +199,7 @@ public class TestBoostingTermQuery extends TestCase {
     // TODO: Remove warning after API has been finalized
     public float scorePayload(byte[] payload, int offset, int length) {
       //we know it is size 4 here, so ignore the offset/length
-      return decodePayload(payload,4);
+      return payload[0];
     }

     //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!