mirror of https://github.com/apache/lucene.git
LUCENE-834:
Added in payloads search package, with one Query implementation: BoostingTermQuery. Added isPayloadAvailable() method to TermPositions and implementations. Modified access rights to some of the spans classes so that they could be accessed from the payloads package. All tests pass. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@523302 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4f1d6582d7
commit
b591bd8efb
|
@ -34,6 +34,9 @@ API Changes
|
||||||
throw AlreadyClosedException if they are accessed after being
|
throw AlreadyClosedException if they are accessed after being
|
||||||
closed. (Mike McCandless)
|
closed. (Mike McCandless)
|
||||||
|
|
||||||
|
5. LUCENE-834: Changed some access levels for certain Span classes to allow them to be overridden. They have
|
||||||
|
been marked expert only and not for public consumption. (Grant Ingersoll)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
1. LUCENE-804: Fixed build.xml to pack a fully compilable src dist. (Doron Cohen)
|
1. LUCENE-804: Fixed build.xml to pack a fully compilable src dist. (Doron Cohen)
|
||||||
|
@ -101,6 +104,12 @@ New features
|
||||||
contain appropriate warnings in the javadocs.
|
contain appropriate warnings in the javadocs.
|
||||||
(Michael Busch)
|
(Michael Busch)
|
||||||
|
|
||||||
|
4. LUCENE-834: Added BoostingTermQuery which can boost scores based on the values of a payload (see #3 above.) (Grant Ingersoll)
|
||||||
|
5. LUCENE-834: Similarity has a new method for scoring payloads called scorePayload that can be overridden to take advantage
|
||||||
|
of payload storage (see #3 above)
|
||||||
|
6. LUCENE-834: Added isPayloadAvailable() onto TermPositions interface and implemented it in the appropriate places (Grant Ingersoll)
|
||||||
|
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
|
1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions
|
||||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.index;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.FieldSelector;
|
import org.apache.lucene.document.FieldSelector;
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
|
||||||
|
@ -70,6 +69,12 @@ public class FilterIndexReader extends IndexReader {
|
||||||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||||
return ((TermPositions) this.in).getPayload(data, offset);
|
return ((TermPositions) this.in).getPayload(data, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public boolean isPayloadAvailable() {
|
||||||
|
return ((TermPositions)this.in).isPayloadAvailable();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Base class for filtering {@link TermEnum} implementations. */
|
/** Base class for filtering {@link TermEnum} implementations. */
|
||||||
|
|
|
@ -463,4 +463,10 @@ class MultiTermPositions extends MultiTermDocs implements TermPositions {
|
||||||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||||
return ((TermPositions)current).getPayload(data, offset);
|
return ((TermPositions)current).getPayload(data, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public boolean isPayloadAvailable() {
|
||||||
|
return ((TermPositions) current).isPayloadAvailable();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,14 +17,14 @@ package org.apache.lucene.index;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.util.PriorityQueue;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Describe class <code>MultipleTermPositions</code> here.
|
* Describe class <code>MultipleTermPositions</code> here.
|
||||||
*
|
*
|
||||||
|
@ -209,5 +209,12 @@ public class MultipleTermPositions implements TermPositions {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This implementation never reports a payload as available.
|
||||||
|
* @return false
|
||||||
|
*/
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public boolean isPayloadAvailable() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,22 +18,12 @@ package org.apache.lucene.index;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Fieldable;
|
|
||||||
import org.apache.lucene.document.FieldSelector;
|
import org.apache.lucene.document.FieldSelector;
|
||||||
import org.apache.lucene.document.FieldSelectorResult;
|
import org.apache.lucene.document.FieldSelectorResult;
|
||||||
|
import org.apache.lucene.document.Fieldable;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.SortedMap;
|
import java.util.*;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.TreeMap;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Enumeration;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
|
|
||||||
/** An IndexReader which reads multiple, parallel indexes. Each index added
|
/** An IndexReader which reads multiple, parallel indexes. Each index added
|
||||||
|
@ -426,6 +416,12 @@ public class ParallelReader extends IndexReader {
|
||||||
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
public byte[] getPayload(byte[] data, int offset) throws IOException {
|
||||||
return ((TermPositions)termDocs).getPayload(data, offset);
|
return ((TermPositions)termDocs).getPayload(data, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public boolean isPayloadAvailable() {
|
||||||
|
return ((TermPositions) termDocs).isPayloadAvailable();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,10 +17,10 @@ package org.apache.lucene.index;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
final class SegmentTermPositions
|
final class SegmentTermPositions
|
||||||
extends SegmentTermDocs implements TermPositions {
|
extends SegmentTermDocs implements TermPositions {
|
||||||
private IndexInput proxStream;
|
private IndexInput proxStream;
|
||||||
|
@ -189,4 +189,9 @@ extends SegmentTermDocs implements TermPositions {
|
||||||
return retArray;
|
return retArray;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public boolean isPayloadAvailable() {
|
||||||
|
return needToLoadPayload && payloadLength > 0;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,4 +81,20 @@ public interface TermPositions
|
||||||
*/
|
*/
|
||||||
// TODO: Remove warning after API has been finalized
|
// TODO: Remove warning after API has been finalized
|
||||||
byte[] getPayload(byte[] data, int offset) throws IOException;
|
byte[] getPayload(byte[] data, int offset) throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Can we load the payload at this position? Payloads can only be loaded once per call
|
||||||
|
* to {@link #nextPosition()}
|
||||||
|
* @return true if there is a payload available at this position that can be loaded
|
||||||
|
*
|
||||||
|
* <b>
|
||||||
|
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||||
|
* introduced here might change in the future and will not be supported anymore
|
||||||
|
* in such a case. If you want to use this feature in a production environment
|
||||||
|
* you should wait for an official release.
|
||||||
|
* </b>
|
||||||
|
*/
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public boolean isPayloadAvailable();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,16 +17,16 @@ package org.apache.lucene.search;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.util.SmallFloat;
|
import org.apache.lucene.util.SmallFloat;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
/** Expert: Scoring API.
|
/** Expert: Scoring API.
|
||||||
* <p>Subclasses implement search scoring.
|
* <p>Subclasses implement search scoring.
|
||||||
*
|
*
|
||||||
|
@ -503,4 +503,28 @@ public abstract class Similarity implements Serializable {
|
||||||
* @return a score factor based on term overlap with the query
|
* @return a score factor based on term overlap with the query
|
||||||
*/
|
*/
|
||||||
public abstract float coord(int overlap, int maxOverlap);
|
public abstract float coord(int overlap, int maxOverlap);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate a scoring factor based on the data in the payload. Overriding implementations
|
||||||
|
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
|
||||||
|
* what is in the byte array.
|
||||||
|
* <p>
|
||||||
|
* The default implementation returns 1.
|
||||||
|
*
|
||||||
|
* @param payload The payload byte array to be scored
|
||||||
|
* @return An implementation dependent float to be used as a scoring factor
|
||||||
|
* <b>
|
||||||
|
* Warning: The status of the Payloads feature is experimental. The APIs
|
||||||
|
* introduced here might change in the future and will not be supported anymore
|
||||||
|
* in such a case. If you want to use this feature in a production environment
|
||||||
|
* you should wait for an official release.
|
||||||
|
* </b>
|
||||||
|
*/
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public float scorePayload(byte [] payload, int offset, int length)
|
||||||
|
{
|
||||||
|
//Do nothing
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
package org.apache.lucene.search.payloads;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.TermPositions;
|
||||||
|
import org.apache.lucene.search.*;
|
||||||
|
import org.apache.lucene.search.spans.SpanScorer;
|
||||||
|
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||||
|
import org.apache.lucene.search.spans.SpanWeight;
|
||||||
|
import org.apache.lucene.search.spans.TermSpans;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
* <p/>
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p/>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p/>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The BoostingTermQuery is very similar to the {@link org.apache.lucene.search.spans.SpanTermQuery} except
|
||||||
|
* that it factors in the value of the payload located at each of the positions where the
|
||||||
|
* {@link org.apache.lucene.index.Term} occurs.
|
||||||
|
* <p>
|
||||||
|
* In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)}
|
||||||
|
* which returns 1 by default.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int)
|
||||||
|
*/
|
||||||
|
public class BoostingTermQuery extends SpanTermQuery{
|
||||||
|
|
||||||
|
|
||||||
|
public BoostingTermQuery(Term term) {
|
||||||
|
super(term);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected Weight createWeight(Searcher searcher) throws IOException {
|
||||||
|
return new BoostingTermWeight(this, searcher);
|
||||||
|
}
|
||||||
|
|
||||||
|
private class BoostingTermWeight extends SpanWeight implements Weight {
|
||||||
|
|
||||||
|
|
||||||
|
public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException {
|
||||||
|
super(query, searcher);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public Scorer scorer(IndexReader reader) throws IOException {
|
||||||
|
return new BoostingSpanScorer((TermSpans)query.getSpans(reader), this, similarity,
|
||||||
|
reader.norms(query.getField()));
|
||||||
|
}
|
||||||
|
|
||||||
|
class BoostingSpanScorer extends SpanScorer {
|
||||||
|
|
||||||
|
//TODO: is this the best way to allocate this?
|
||||||
|
byte[] payload = new byte[256];
|
||||||
|
private TermPositions positions;
|
||||||
|
|
||||||
|
|
||||||
|
public BoostingSpanScorer(TermSpans spans, Weight weight,
|
||||||
|
Similarity similarity, byte[] norms) throws IOException {
|
||||||
|
super(spans, weight, similarity, norms);
|
||||||
|
positions = spans.getPositions();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean next() throws IOException {
|
||||||
|
|
||||||
|
boolean result = super.next();
|
||||||
|
//set the payload. super.next() properly increments the term positions
|
||||||
|
if (result) {
|
||||||
|
loadPayload();
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean skipTo(int target) throws IOException {
|
||||||
|
boolean result = super.skipTo(target);
|
||||||
|
|
||||||
|
if (result) {
|
||||||
|
loadPayload();
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadPayload() throws IOException {
|
||||||
|
if (positions.isPayloadAvailable()) {
|
||||||
|
payload = positions.getPayload(payload, 0);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
//zero out the payload?
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public float score() throws IOException {
|
||||||
|
|
||||||
|
int payLength = positions.getPayloadLength();
|
||||||
|
return super.score() * (payLength > 0 ? getSimilarity().scorePayload(payload, 0, payLength) : 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Explanation explain(final int doc) throws IOException {
|
||||||
|
Explanation result = new Explanation();
|
||||||
|
Explanation nonPayloadExpl = super.explain(doc);
|
||||||
|
result.addDetail(nonPayloadExpl);
|
||||||
|
//QUESTION: Is there a way to avoid this skipTo call? We need to know whether to load the payload or not
|
||||||
|
|
||||||
|
Explanation payloadBoost = new Explanation();
|
||||||
|
result.addDetail(payloadBoost);
|
||||||
|
/*
|
||||||
|
if (skipTo(doc) == true) {
|
||||||
|
loadPayload();
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
float payloadScore = getSimilarity().scorePayload(payload, 0, positions.getPayloadLength());
|
||||||
|
payloadBoost.setValue(payloadScore);
|
||||||
|
//GSI: I suppose we could toString the payload, but I don't think that would be a good idea
|
||||||
|
payloadBoost.setDescription("scorePayload(...)");
|
||||||
|
result.setValue(nonPayloadExpl.getValue() * payloadScore);
|
||||||
|
result.setDescription("btq");
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (!(o instanceof BoostingTermQuery))
|
||||||
|
return false;
|
||||||
|
BoostingTermQuery other = (BoostingTermQuery) o;
|
||||||
|
return (this.getBoost() == other.getBoost())
|
||||||
|
&& this.term.equals(other.term);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
<HTML>
|
||||||
|
<!--
|
||||||
|
/**
|
||||||
|
* Copyright 2005 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
--><HEAD>
|
||||||
|
<TITLE>org.apache.lucene.search.payloads</TITLE>
|
||||||
|
</HEAD>
|
||||||
|
<BODY>
|
||||||
|
<DIV>The payloads package provides Query mechanisms for finding and using payloads.
|
||||||
|
|
||||||
|
The following Query implementations are provided:
|
||||||
|
</DIV>
|
||||||
|
<div>
|
||||||
|
<ol>
|
||||||
|
<li><a href="./BoostingTermQuery.html">BoostingTermQuery</a> -- Boost a term's score based on the value of the payload located at that term</li>
|
||||||
|
</ol>
|
||||||
|
</div>
|
||||||
|
<DIV> </DIV>
|
||||||
|
<DIV align="center">
|
||||||
|
</DIV>
|
||||||
|
</BODY>
|
||||||
|
</HTML>
|
|
@ -17,27 +17,29 @@ package org.apache.lucene.search.spans;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.search.Explanation;
|
||||||
|
import org.apache.lucene.search.Scorer;
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
|
import org.apache.lucene.search.Weight;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.search.Weight;
|
/**
|
||||||
import org.apache.lucene.search.Scorer;
|
* Public for extension only.
|
||||||
import org.apache.lucene.search.Explanation;
|
*/
|
||||||
import org.apache.lucene.search.Similarity;
|
public class SpanScorer extends Scorer {
|
||||||
|
protected Spans spans;
|
||||||
|
protected Weight weight;
|
||||||
|
protected byte[] norms;
|
||||||
|
protected float value;
|
||||||
|
|
||||||
|
protected boolean firstTime = true;
|
||||||
|
protected boolean more = true;
|
||||||
|
|
||||||
class SpanScorer extends Scorer {
|
protected int doc;
|
||||||
private Spans spans;
|
protected float freq;
|
||||||
private Weight weight;
|
|
||||||
private byte[] norms;
|
|
||||||
private float value;
|
|
||||||
|
|
||||||
private boolean firstTime = true;
|
protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
|
||||||
private boolean more = true;
|
|
||||||
|
|
||||||
private int doc;
|
|
||||||
private float freq;
|
|
||||||
|
|
||||||
SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
|
|
||||||
throws IOException {
|
throws IOException {
|
||||||
super(similarity);
|
super(similarity);
|
||||||
this.spans = spans;
|
this.spans = spans;
|
||||||
|
|
|
@ -17,20 +17,18 @@ package org.apache.lucene.search.spans;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermPositions;
|
|
||||||
import org.apache.lucene.util.ToStringUtils;
|
import org.apache.lucene.util.ToStringUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
/** Matches spans containing a term. */
|
/** Matches spans containing a term. */
|
||||||
public class SpanTermQuery extends SpanQuery {
|
public class SpanTermQuery extends SpanQuery {
|
||||||
private Term term;
|
protected Term term;
|
||||||
|
|
||||||
/** Construct a SpanTermQuery matching the named term's spans. */
|
/** Construct a SpanTermQuery matching the named term's spans. */
|
||||||
public SpanTermQuery(Term term) { this.term = term; }
|
public SpanTermQuery(Term term) { this.term = term; }
|
||||||
|
@ -78,60 +76,7 @@ public class SpanTermQuery extends SpanQuery {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Spans getSpans(final IndexReader reader) throws IOException {
|
public Spans getSpans(final IndexReader reader) throws IOException {
|
||||||
return new Spans() {
|
return new TermSpans(reader.termPositions(term), term);
|
||||||
private TermPositions positions = reader.termPositions(term);
|
|
||||||
|
|
||||||
private int doc = -1;
|
|
||||||
private int freq;
|
|
||||||
private int count;
|
|
||||||
private int position;
|
|
||||||
|
|
||||||
public boolean next() throws IOException {
|
|
||||||
if (count == freq) {
|
|
||||||
if (!positions.next()) {
|
|
||||||
doc = Integer.MAX_VALUE;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
doc = positions.doc();
|
|
||||||
freq = positions.freq();
|
|
||||||
count = 0;
|
|
||||||
}
|
|
||||||
position = positions.nextPosition();
|
|
||||||
count++;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean skipTo(int target) throws IOException {
|
|
||||||
// are we already at the correct position?
|
|
||||||
if (doc >= target) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!positions.skipTo(target)) {
|
|
||||||
doc = Integer.MAX_VALUE;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
doc = positions.doc();
|
|
||||||
freq = positions.freq();
|
|
||||||
count = 0;
|
|
||||||
|
|
||||||
position = positions.nextPosition();
|
|
||||||
count++;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int doc() { return doc; }
|
|
||||||
public int start() { return position; }
|
|
||||||
public int end() { return position + 1; }
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return "spans(" + SpanTermQuery.this.toString() + ")@"+
|
|
||||||
(doc==-1?"START":(doc==Integer.MAX_VALUE)?"END":doc+"-"+position);
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,32 +17,27 @@ package org.apache.lucene.search.spans;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
/**
|
||||||
import org.apache.lucene.index.Term;
|
* Expert-only. Public for use by other weight implementations
|
||||||
|
*/
|
||||||
|
public class SpanWeight implements Weight {
|
||||||
|
protected Similarity similarity;
|
||||||
|
protected float value;
|
||||||
|
protected float idf;
|
||||||
|
protected float queryNorm;
|
||||||
|
protected float queryWeight;
|
||||||
|
|
||||||
import org.apache.lucene.search.Query;
|
protected Set terms;
|
||||||
import org.apache.lucene.search.Weight;
|
protected SpanQuery query;
|
||||||
import org.apache.lucene.search.Searcher;
|
|
||||||
import org.apache.lucene.search.Scorer;
|
|
||||||
import org.apache.lucene.search.Explanation;
|
|
||||||
import org.apache.lucene.search.ComplexExplanation;
|
|
||||||
import org.apache.lucene.search.Similarity;
|
|
||||||
|
|
||||||
class SpanWeight implements Weight {
|
|
||||||
private Similarity similarity;
|
|
||||||
private float value;
|
|
||||||
private float idf;
|
|
||||||
private float queryNorm;
|
|
||||||
private float queryWeight;
|
|
||||||
|
|
||||||
private Set terms;
|
|
||||||
private SpanQuery query;
|
|
||||||
|
|
||||||
public SpanWeight(SpanQuery query, Searcher searcher)
|
public SpanWeight(SpanQuery query, Searcher searcher)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
|
@ -0,0 +1,101 @@
|
||||||
|
package org.apache.lucene.search.spans;
|
||||||
|
/**
|
||||||
|
* Copyright 2005 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.TermPositions;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expert:
|
||||||
|
* Public for extension only
|
||||||
|
*/
|
||||||
|
public class TermSpans implements Spans {
|
||||||
|
protected TermPositions positions;
|
||||||
|
protected Term term;
|
||||||
|
protected int doc;
|
||||||
|
protected int freq;
|
||||||
|
protected int count;
|
||||||
|
protected int position;
|
||||||
|
|
||||||
|
|
||||||
|
public TermSpans(TermPositions positions, Term term) throws IOException {
|
||||||
|
|
||||||
|
this.positions = positions;
|
||||||
|
this.term = term;
|
||||||
|
doc = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean next() throws IOException {
|
||||||
|
if (count == freq) {
|
||||||
|
if (!positions.next()) {
|
||||||
|
doc = Integer.MAX_VALUE;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
doc = positions.doc();
|
||||||
|
freq = positions.freq();
|
||||||
|
count = 0;
|
||||||
|
}
|
||||||
|
position = positions.nextPosition();
|
||||||
|
count++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean skipTo(int target) throws IOException {
|
||||||
|
// are we already at the correct position?
|
||||||
|
if (doc >= target) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!positions.skipTo(target)) {
|
||||||
|
doc = Integer.MAX_VALUE;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
doc = positions.doc();
|
||||||
|
freq = positions.freq();
|
||||||
|
count = 0;
|
||||||
|
|
||||||
|
position = positions.nextPosition();
|
||||||
|
count++;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int doc() {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int start() {
|
||||||
|
return position;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int end() {
|
||||||
|
return position + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "spans(" + term.toString() + ")@" +
|
||||||
|
(doc == -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public TermPositions getPositions() {
|
||||||
|
return positions;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,188 @@
|
||||||
|
package org.apache.lucene.search.payloads;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.lucene.analysis.*;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.Payload;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.*;
|
||||||
|
import org.apache.lucene.search.spans.Spans;
|
||||||
|
import org.apache.lucene.search.spans.TermSpans;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
|
import org.apache.lucene.util.English;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
public class TestBoostingTermQuery extends TestCase {
|
||||||
|
private IndexSearcher searcher;
|
||||||
|
private BoostingSimilarity similarity = new BoostingSimilarity();
|
||||||
|
|
||||||
|
public TestBoostingTermQuery(String s) {
|
||||||
|
super(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
private class PayloadAnalyzer extends Analyzer {
|
||||||
|
|
||||||
|
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
TokenStream result = new LowerCaseTokenizer(reader);
|
||||||
|
result = new PayloadFilter(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class PayloadFilter extends TokenFilter {
|
||||||
|
|
||||||
|
|
||||||
|
public PayloadFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Token next() throws IOException {
|
||||||
|
Token result = input.next();
|
||||||
|
if (result != null) {
|
||||||
|
result.setPayload(new Payload(encodePayload(result.termText()), 0, 4));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void setUp() throws IOException {
|
||||||
|
RAMDirectory directory = new RAMDirectory();
|
||||||
|
PayloadAnalyzer analyzer = new PayloadAnalyzer();
|
||||||
|
IndexWriter writer
|
||||||
|
= new IndexWriter(directory, analyzer, true);
|
||||||
|
writer.setSimilarity(similarity);
|
||||||
|
//writer.infoStream = System.out;
|
||||||
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
//writer.optimize();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
searcher = new IndexSearcher(directory);
|
||||||
|
searcher.setSimilarity(similarity);
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] encodePayload(String englishInt)
|
||||||
|
{
|
||||||
|
int i = englishInt.hashCode();
|
||||||
|
byte[] bytes = new byte[4];
|
||||||
|
bytes[0] = (byte) (i >>> 24);
|
||||||
|
bytes[1] = (byte) (i >>> 16);
|
||||||
|
bytes[2] = (byte) (i >>> 8);
|
||||||
|
bytes[3] = (byte) i;
|
||||||
|
return bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int decodePayload(byte[] payload, int size)
|
||||||
|
{
|
||||||
|
//This should be equal to the hash code of the String representing the English int from English.intToEnglish
|
||||||
|
int result = (payload[0] << 24) | (payload[1] << 16) | (payload[2] << 8) | (payload[3]);
|
||||||
|
|
||||||
|
/*assertEquals((byte) (size >>> 24), payload[0]);
|
||||||
|
assertEquals((byte) (size >>> 16), payload[1]);
|
||||||
|
assertEquals((byte) (size >>> 8), payload[2]);
|
||||||
|
assertEquals((byte) size, payload[3]);*/
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void tearDown() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test() throws IOException {
|
||||||
|
BoostingTermQuery query = new BoostingTermQuery(new Term("field", "seventy"));
|
||||||
|
TopDocs hits = searcher.search(query, null, 100);
|
||||||
|
assertTrue("hits is null and it shouldn't be", hits != null);
|
||||||
|
assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
|
||||||
|
|
||||||
|
//they should all have the exact same score, because they all contain seventy once, and we set
|
||||||
|
//all the other similarity factors to be 1
|
||||||
|
//This score should be 1, since we normalize scores
|
||||||
|
int seventyHash = "seventy".hashCode();
|
||||||
|
assertTrue("score " + hits.getMaxScore() + " does not equal 'seventy' hashcode: " + seventyHash, hits.getMaxScore() == seventyHash);
|
||||||
|
for (int i = 0; i < hits.scoreDocs.length; i++) {
|
||||||
|
ScoreDoc doc = hits.scoreDocs[i];
|
||||||
|
assertTrue("score " + doc.score + " does not equal 'seventy' hashcode: " + seventyHash, doc.score == seventyHash);
|
||||||
|
}
|
||||||
|
CheckHits.checkExplanations(query, "field", searcher);
|
||||||
|
Spans spans = query.getSpans(searcher.getIndexReader());
|
||||||
|
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||||
|
assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans);
|
||||||
|
/*float score = hits.score(0);
|
||||||
|
for (int i =1; i < hits.length(); i++)
|
||||||
|
{
|
||||||
|
assertTrue("scores are not equal and they should be", score == hits.score(i));
|
||||||
|
}*/
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNoMatch() throws Exception {
|
||||||
|
BoostingTermQuery query = new BoostingTermQuery(new Term("field", "junk"));
|
||||||
|
TopDocs hits = searcher.search(query, null, 100);
|
||||||
|
assertTrue("hits is null and it shouldn't be", hits != null);
|
||||||
|
assertTrue("hits Size: " + hits.totalHits + " is not: " + 0, hits.totalHits == 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class BoostingSimilarity extends DefaultSimilarity
|
||||||
|
{
|
||||||
|
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public float scorePayload(byte[] payload, int offset, int length) {
|
||||||
|
//we know it is size 4 here, so ignore the offset/length
|
||||||
|
return decodePayload(payload,4);
|
||||||
|
}
|
||||||
|
|
||||||
|
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
//Make everything else 1 so we see the effect of the payload
|
||||||
|
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||||
|
public float lengthNorm(String fieldName, int numTerms) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public float queryNorm(float sumOfSquaredWeights) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public float sloppyFreq(int distance) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public float coord(int overlap, int maxOverlap) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public float idf(int docFreq, int numDocs) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public float tf(float freq) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue