mirror of https://github.com/apache/lucene.git
LUCENE-834:
Added in payloads search package, with one Query implementation: BoostingTermQuery. Added isPayloadAvailable() method to TermPositions and implementations. Modified access rights to some of the spans classes so that they could be accessed from the payloads package. All tests pass. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@523302 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4f1d6582d7
commit
b591bd8efb
|
@ -34,6 +34,9 @@ API Changes
|
|||
throw AlreadyClosedException if they are accessed after being
|
||||
closed. (Mike McCandless)
|
||||
|
||||
5. LUCENE-834: Changed some access levels for certain Span classes to allow them to be overridden. They have
|
||||
been marked expert only and not for public consumption. (Grant Ingersoll)
|
||||
|
||||
Bug fixes
|
||||
|
||||
1. LUCENE-804: Fixed build.xml to pack a fully compilable src dist. (Doron Cohen)
|
||||
|
@ -101,6 +104,12 @@ New features
|
|||
contain appropriate warnings in the javadocs.
|
||||
(Michael Busch)
|
||||
|
||||
4. LUCENE-834: Added BoostingTermQuery which can boost scores based on the values of a payload (see #3 above.) (Grant Ingersoll)
|
||||
5. LUCENE-834: Similarity has a new method for scoring payloads called scorePayloads that can be overridden to take advantage
|
||||
of payload storage (see #3 above)
|
||||
6. LUCENE-834: Added isPayloadAvailable() onto TermPositions interface and implemented it in the appropriate places (Grant Ingersoll)
|
||||
|
||||
|
||||
Optimizations
|
||||
|
||||
1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.index;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.FieldSelector;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
|
||||
|
@ -70,6 +69,12 @@ public class FilterIndexReader extends IndexReader {
|
|||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||
return ((TermPositions) this.in).getPayload(data, offset);
|
||||
}
|
||||
|
||||
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public boolean isPayloadAvailable() {
|
||||
return ((TermPositions)this.in).isPayloadAvailable();
|
||||
}
|
||||
}
|
||||
|
||||
/** Base class for filtering {@link TermEnum} implementations. */
|
||||
|
|
|
@ -463,4 +463,10 @@ class MultiTermPositions extends MultiTermDocs implements TermPositions {
|
|||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||
return ((TermPositions)current).getPayload(data, offset);
|
||||
}
|
||||
|
||||
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public boolean isPayloadAvailable() {
|
||||
return ((TermPositions) current).isPayloadAvailable();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,14 +17,14 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* Describe class <code>MultipleTermPositions</code> here.
|
||||
*
|
||||
|
@ -209,5 +209,12 @@ public class MultipleTermPositions implements TermPositions {
|
|||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @return false
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public boolean isPayloadAvailable() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,22 +18,12 @@ package org.apache.lucene.index;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.document.FieldSelector;
|
||||
import org.apache.lucene.document.FieldSelectorResult;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.SortedMap;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/** An IndexReader which reads multiple, parallel indexes. Each index added
|
||||
|
@ -426,6 +416,12 @@ public class ParallelReader extends IndexReader {
|
|||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||
return ((TermPositions)termDocs).getPayload(data, offset);
|
||||
}
|
||||
|
||||
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public boolean isPayloadAvailable() {
|
||||
return ((TermPositions) termDocs).isPayloadAvailable();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,10 +17,10 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
final class SegmentTermPositions
|
||||
extends SegmentTermDocs implements TermPositions {
|
||||
private IndexInput proxStream;
|
||||
|
@ -189,4 +189,9 @@ extends SegmentTermDocs implements TermPositions {
|
|||
return retArray;
|
||||
}
|
||||
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public boolean isPayloadAvailable() {
|
||||
return needToLoadPayload && payloadLength > 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -81,4 +81,20 @@ public interface TermPositions
|
|||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
byte[] getPayload(byte[] data, int offset) throws IOException;
|
||||
|
||||
/**
|
||||
* Can we load the payload at this position? Payloads can only be loaded once per call
|
||||
* to {@link #nextPosition()}
|
||||
* @return true if there is a payload available at this position that can be loaded
|
||||
*
|
||||
* * <b>
|
||||
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||
* introduced here might change in the future and will not be supported anymore
|
||||
* in such a case. If you want to use this feature in a production environment
|
||||
* you should wait for an official release.
|
||||
* </b>
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public boolean isPayloadAvailable();
|
||||
|
||||
}
|
||||
|
|
|
@ -17,16 +17,16 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
|
||||
/** Expert: Scoring API.
|
||||
* <p>Subclasses implement search scoring.
|
||||
*
|
||||
|
@ -503,4 +503,28 @@ public abstract class Similarity implements Serializable {
|
|||
* @return a score factor based on term overlap with the query
|
||||
*/
|
||||
public abstract float coord(int overlap, int maxOverlap);
|
||||
|
||||
|
||||
/**
|
||||
* Calculate a scoring factor based on the data in the payload. Overriding implementations
|
||||
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
|
||||
* what is in the byte array.
|
||||
* <p>
|
||||
* The default implementation returns 1.
|
||||
*
|
||||
* @param payload The payload byte array to be scored
|
||||
* @return An implementation dependent float to be used as a scoring factor
|
||||
* <b>
|
||||
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||
* introduced here might change in the future and will not be supported anymore
|
||||
* in such a case. If you want to use this feature in a production environment
|
||||
* you should wait for an official release.
|
||||
* </b>
|
||||
*/
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public float scorePayload(byte [] payload, int offset, int length)
|
||||
{
|
||||
//Do nothing
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,153 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.spans.SpanScorer;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.SpanWeight;
|
||||
import org.apache.lucene.search.spans.TermSpans;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
* <p/>
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* <p/>
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* <p/>
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* The BoostingTermQuery is very similar to the {@link org.apache.lucene.search.spans.SpanTermQuery} except
|
||||
* that it factors in the value of the payload located at each of the positions where the
|
||||
* {@link org.apache.lucene.index.Term} occurs.
|
||||
* <p>
|
||||
* In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
|
||||
* which returns 1 by default.
|
||||
*
|
||||
*
|
||||
* @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
|
||||
*/
|
||||
public class BoostingTermQuery extends SpanTermQuery{
|
||||
|
||||
|
||||
public BoostingTermQuery(Term term) {
|
||||
super(term);
|
||||
}
|
||||
|
||||
|
||||
protected Weight createWeight(Searcher searcher) throws IOException {
|
||||
return new BoostingTermWeight(this, searcher);
|
||||
}
|
||||
|
||||
private class BoostingTermWeight extends SpanWeight implements Weight {
|
||||
|
||||
|
||||
public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException {
|
||||
super(query, searcher);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public Scorer scorer(IndexReader reader) throws IOException {
|
||||
return new BoostingSpanScorer((TermSpans)query.getSpans(reader), this, similarity,
|
||||
reader.norms(query.getField()));
|
||||
}
|
||||
|
||||
class BoostingSpanScorer extends SpanScorer {
|
||||
|
||||
//TODO: is this the best way to allocate this?
|
||||
byte[] payload = new byte[256];
|
||||
private TermPositions positions;
|
||||
|
||||
|
||||
public BoostingSpanScorer(TermSpans spans, Weight weight,
|
||||
Similarity similarity, byte[] norms) throws IOException {
|
||||
super(spans, weight, similarity, norms);
|
||||
positions = spans.getPositions();
|
||||
|
||||
}
|
||||
|
||||
public boolean next() throws IOException {
|
||||
|
||||
boolean result = super.next();
|
||||
//set the payload. super.next() properly increments the term positions
|
||||
if (result) {
|
||||
loadPayload();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
boolean result = super.skipTo(target);
|
||||
|
||||
if (result) {
|
||||
loadPayload();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private void loadPayload() throws IOException {
|
||||
if (positions.isPayloadAvailable()) {
|
||||
payload = positions.getPayload(payload, 0);
|
||||
|
||||
} else {
|
||||
//zero out the payload?
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public float score() throws IOException {
|
||||
|
||||
int payLength = positions.getPayloadLength();
|
||||
return super.score() * (payLength > 0 ? getSimilarity().scorePayload(payload, 0, payLength) : 1);
|
||||
}
|
||||
|
||||
|
||||
public Explanation explain(final int doc) throws IOException {
|
||||
Explanation result = new Explanation();
|
||||
Explanation nonPayloadExpl = super.explain(doc);
|
||||
result.addDetail(nonPayloadExpl);
|
||||
//QUESTION: Is there a wau to avoid this skipTo call? We need to know whether to load the payload or not
|
||||
|
||||
Explanation payloadBoost = new Explanation();
|
||||
result.addDetail(payloadBoost);
|
||||
/*
|
||||
if (skipTo(doc) == true) {
|
||||
loadPayload();
|
||||
}
|
||||
*/
|
||||
float payloadScore = getSimilarity().scorePayload(payload, 0, positions.getPayloadLength());
|
||||
payloadBoost.setValue(payloadScore);
|
||||
//GSI: I suppose we could toString the payload, but I don't think that would be a good idea
|
||||
payloadBoost.setDescription("scorePayload(...)");
|
||||
result.setValue(nonPayloadExpl.getValue() * payloadScore);
|
||||
result.setDescription("btq");
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof BoostingTermQuery))
|
||||
return false;
|
||||
BoostingTermQuery other = (BoostingTermQuery) o;
|
||||
return (this.getBoost() == other.getBoost())
|
||||
&& this.term.equals(other.term);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
<HTML>
|
||||
<!--
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
--><HEAD>
|
||||
<TITLE>org.apache.lucene.search.payloads</TITLE>
|
||||
</HEAD>
|
||||
<BODY>
|
||||
<DIV>The payloads package provides Query mechanisms for finding and using payloads.
|
||||
|
||||
The following Query implementations are provided:
|
||||
</DIV>
|
||||
<div>
|
||||
<ol>
|
||||
<li><a href="./BoostingTermQuery.html">BoostingTermQuery</a> -- Boost a term's score based on the value of the payload located at that term</li>
|
||||
</ol>
|
||||
</div>
|
||||
<DIV> </DIV>
|
||||
<DIV align="center">
|
||||
</DIV>
|
||||
</BODY>
|
||||
</HTML>
|
|
@ -17,27 +17,29 @@ package org.apache.lucene.search.spans;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.Weight;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
/**
|
||||
* Public for extension only.
|
||||
*/
|
||||
public class SpanScorer extends Scorer {
|
||||
protected Spans spans;
|
||||
protected Weight weight;
|
||||
protected byte[] norms;
|
||||
protected float value;
|
||||
|
||||
protected boolean firstTime = true;
|
||||
protected boolean more = true;
|
||||
|
||||
class SpanScorer extends Scorer {
|
||||
private Spans spans;
|
||||
private Weight weight;
|
||||
private byte[] norms;
|
||||
private float value;
|
||||
protected int doc;
|
||||
protected float freq;
|
||||
|
||||
private boolean firstTime = true;
|
||||
private boolean more = true;
|
||||
|
||||
private int doc;
|
||||
private float freq;
|
||||
|
||||
SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
|
||||
protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
|
||||
throws IOException {
|
||||
super(similarity);
|
||||
this.spans = spans;
|
||||
|
|
|
@ -17,20 +17,18 @@ package org.apache.lucene.search.spans;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Set;
|
||||
|
||||
/** Matches spans containing a term. */
|
||||
public class SpanTermQuery extends SpanQuery {
|
||||
private Term term;
|
||||
protected Term term;
|
||||
|
||||
/** Construct a SpanTermQuery matching the named term's spans. */
|
||||
public SpanTermQuery(Term term) { this.term = term; }
|
||||
|
@ -78,60 +76,7 @@ public class SpanTermQuery extends SpanQuery {
|
|||
}
|
||||
|
||||
public Spans getSpans(final IndexReader reader) throws IOException {
|
||||
return new Spans() {
|
||||
private TermPositions positions = reader.termPositions(term);
|
||||
|
||||
private int doc = -1;
|
||||
private int freq;
|
||||
private int count;
|
||||
private int position;
|
||||
|
||||
public boolean next() throws IOException {
|
||||
if (count == freq) {
|
||||
if (!positions.next()) {
|
||||
doc = Integer.MAX_VALUE;
|
||||
return false;
|
||||
}
|
||||
doc = positions.doc();
|
||||
freq = positions.freq();
|
||||
count = 0;
|
||||
}
|
||||
position = positions.nextPosition();
|
||||
count++;
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
// are we already at the correct position?
|
||||
if (doc >= target) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!positions.skipTo(target)) {
|
||||
doc = Integer.MAX_VALUE;
|
||||
return false;
|
||||
}
|
||||
|
||||
doc = positions.doc();
|
||||
freq = positions.freq();
|
||||
count = 0;
|
||||
|
||||
position = positions.nextPosition();
|
||||
count++;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public int doc() { return doc; }
|
||||
public int start() { return position; }
|
||||
public int end() { return position + 1; }
|
||||
|
||||
public String toString() {
|
||||
return "spans(" + SpanTermQuery.this.toString() + ")@"+
|
||||
(doc==-1?"START":(doc==Integer.MAX_VALUE)?"END":doc+"-"+position);
|
||||
}
|
||||
|
||||
};
|
||||
return new TermSpans(reader.termPositions(term), term);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,32 +17,27 @@ package org.apache.lucene.search.spans;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
/**
|
||||
* Expert-only. Public for use by other weight implementations
|
||||
*/
|
||||
public class SpanWeight implements Weight {
|
||||
protected Similarity similarity;
|
||||
protected float value;
|
||||
protected float idf;
|
||||
protected float queryNorm;
|
||||
protected float queryWeight;
|
||||
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.ComplexExplanation;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
|
||||
class SpanWeight implements Weight {
|
||||
private Similarity similarity;
|
||||
private float value;
|
||||
private float idf;
|
||||
private float queryNorm;
|
||||
private float queryWeight;
|
||||
|
||||
private Set terms;
|
||||
private SpanQuery query;
|
||||
protected Set terms;
|
||||
protected SpanQuery query;
|
||||
|
||||
public SpanWeight(SpanQuery query, Searcher searcher)
|
||||
throws IOException {
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
package org.apache.lucene.search.spans;
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Expert:
|
||||
* Public for extension only
|
||||
*/
|
||||
public class TermSpans implements Spans {
|
||||
protected TermPositions positions;
|
||||
protected Term term;
|
||||
protected int doc;
|
||||
protected int freq;
|
||||
protected int count;
|
||||
protected int position;
|
||||
|
||||
|
||||
public TermSpans(TermPositions positions, Term term) throws IOException {
|
||||
|
||||
this.positions = positions;
|
||||
this.term = term;
|
||||
doc = -1;
|
||||
}
|
||||
|
||||
public boolean next() throws IOException {
|
||||
if (count == freq) {
|
||||
if (!positions.next()) {
|
||||
doc = Integer.MAX_VALUE;
|
||||
return false;
|
||||
}
|
||||
doc = positions.doc();
|
||||
freq = positions.freq();
|
||||
count = 0;
|
||||
}
|
||||
position = positions.nextPosition();
|
||||
count++;
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
// are we already at the correct position?
|
||||
if (doc >= target) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!positions.skipTo(target)) {
|
||||
doc = Integer.MAX_VALUE;
|
||||
return false;
|
||||
}
|
||||
|
||||
doc = positions.doc();
|
||||
freq = positions.freq();
|
||||
count = 0;
|
||||
|
||||
position = positions.nextPosition();
|
||||
count++;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public int doc() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
public int start() {
|
||||
return position;
|
||||
}
|
||||
|
||||
public int end() {
|
||||
return position + 1;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "spans(" + term.toString() + ")@" +
|
||||
(doc == -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position);
|
||||
}
|
||||
|
||||
|
||||
public TermPositions getPositions() {
|
||||
return positions;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,188 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
import org.apache.lucene.search.spans.TermSpans;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.English;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
public class TestBoostingTermQuery extends TestCase {
|
||||
private IndexSearcher searcher;
|
||||
private BoostingSimilarity similarity = new BoostingSimilarity();
|
||||
|
||||
public TestBoostingTermQuery(String s) {
|
||||
super(s);
|
||||
}
|
||||
|
||||
private class PayloadAnalyzer extends Analyzer {
|
||||
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new LowerCaseTokenizer(reader);
|
||||
result = new PayloadFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
private class PayloadFilter extends TokenFilter {
|
||||
|
||||
|
||||
public PayloadFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token result = input.next();
|
||||
if (result != null) {
|
||||
result.setPayload(new Payload(encodePayload(result.termText()), 0, 4));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
protected void setUp() throws IOException {
|
||||
RAMDirectory directory = new RAMDirectory();
|
||||
PayloadAnalyzer analyzer = new PayloadAnalyzer();
|
||||
IndexWriter writer
|
||||
= new IndexWriter(directory, analyzer, true);
|
||||
writer.setSimilarity(similarity);
|
||||
//writer.infoStream = System.out;
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
//writer.optimize();
|
||||
writer.close();
|
||||
|
||||
searcher = new IndexSearcher(directory);
|
||||
searcher.setSimilarity(similarity);
|
||||
}
|
||||
|
||||
private byte[] encodePayload(String englishInt)
|
||||
{
|
||||
int i = englishInt.hashCode();
|
||||
byte[] bytes = new byte[4];
|
||||
bytes[0] = (byte) (i >>> 24);
|
||||
bytes[1] = (byte) (i >>> 16);
|
||||
bytes[2] = (byte) (i >>> 8);
|
||||
bytes[3] = (byte) i;
|
||||
return bytes;
|
||||
}
|
||||
|
||||
private int decodePayload(byte[] payload, int size)
|
||||
{
|
||||
//This should be equal to the hash code of the String representing the English int from English.intToEnglish
|
||||
int result = (payload[0] << 24) | (payload[1] << 16) | (payload[2] << 8) | (payload[3]);
|
||||
|
||||
/*assertEquals((byte) (size >>> 24), payload[0]);
|
||||
assertEquals((byte) (size >>> 16), payload[1]);
|
||||
assertEquals((byte) (size >>> 8), payload[2]);
|
||||
assertEquals((byte) size, payload[3]);*/
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
protected void tearDown() {
|
||||
|
||||
}
|
||||
|
||||
public void test() throws IOException {
|
||||
BoostingTermQuery query = new BoostingTermQuery(new Term("field", "seventy"));
|
||||
TopDocs hits = searcher.search(query, null, 100);
|
||||
assertTrue("hits is null and it shouldn't be", hits != null);
|
||||
assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
|
||||
|
||||
//they should all have the exact same score, because they all contain seventy once, and we set
|
||||
//all the other similarity factors to be 1
|
||||
//This score should be 1, since we normalize scores
|
||||
int seventyHash = "seventy".hashCode();
|
||||
assertTrue("score " + hits.getMaxScore() + " does not equal 'seventy' hashcode: " + seventyHash, hits.getMaxScore() == seventyHash);
|
||||
for (int i = 0; i < hits.scoreDocs.length; i++) {
|
||||
ScoreDoc doc = hits.scoreDocs[i];
|
||||
assertTrue("score " + doc.score + " does not equal 'seventy' hashcode: " + seventyHash, doc.score == seventyHash);
|
||||
}
|
||||
CheckHits.checkExplanations(query, "field", searcher);
|
||||
Spans spans = query.getSpans(searcher.getIndexReader());
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
|
||||
/*float score = hits.score(0);
|
||||
for (int i =1; i < hits.length(); i++)
|
||||
{
|
||||
assertTrue("scores are not equal and they should be", score == hits.score(i));
|
||||
}*/
|
||||
|
||||
}
|
||||
|
||||
public void testNoMatch() throws Exception {
|
||||
BoostingTermQuery query = new BoostingTermQuery(new Term("field", "junk"));
|
||||
TopDocs hits = searcher.search(query, null, 100);
|
||||
assertTrue("hits is null and it shouldn't be", hits != null);
|
||||
assertTrue("hits Size: " + hits.totalHits + " is not: " + 0, hits.totalHits == 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
class BoostingSimilarity extends DefaultSimilarity
|
||||
{
|
||||
|
||||
// TODO: Remove warning after API has been finalized
|
||||
public float scorePayload(byte[] payload, int offset, int length) {
|
||||
//we know it is size 4 here, so ignore the offset/length
|
||||
return decodePayload(payload,4);
|
||||
}
|
||||
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
//Make everything else 1 so we see the effect of the payload
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
public float lengthNorm(String fieldName, int numTerms) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float queryNorm(float sumOfSquaredWeights) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float sloppyFreq(int distance) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float coord(int overlap, int maxOverlap) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float idf(int docFreq, int numDocs) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public float tf(float freq) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue