Added new span-based query API.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150185 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doug Cutting 2004-01-30 22:10:00 +00:00
parent 1df2ba0dec
commit 93ff39de13
16 changed files with 1224 additions and 18 deletions

View File

@ -42,6 +42,9 @@ $Id$
6. Changed FSDirectory to auto-create a full directory tree that it 6. Changed FSDirectory to auto-create a full directory tree that it
needs by using mkdirs() instead of mkdir(). (Mladen Turk via Otis) needs by using mkdirs() instead of mkdir(). (Mladen Turk via Otis)
7. Added a new span-based query API. This implements, among other
things, nested phrases. See javadocs for details. (Doug Cutting)
1.3 final 1.3 final

View File

@ -55,7 +55,9 @@ package org.apache.lucene.search;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Vector;
import java.util.Collection;
import java.util.Iterator;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
@ -296,14 +298,15 @@ public abstract class Similarity {
* <p>The default implementation sums the {@link #idf(Term,Searcher)} factor * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor
* for each term in the phrase. * for each term in the phrase.
* *
* @param terms the vector of terms in the phrase * @param terms the terms in the phrase
* @param searcher the document collection being searched * @param searcher the document collection being searched
* @return a score factor for the phrase * @return a score factor for the phrase
*/ */
public float idf(Vector terms, Searcher searcher) throws IOException { public float idf(Collection terms, Searcher searcher) throws IOException {
float idf = 0.0f; float idf = 0.0f;
for (int i = 0; i < terms.size(); i++) { Iterator i = terms.iterator();
idf += idf((Term)terms.elementAt(i), searcher); while (i.hasNext()) {
idf += idf((Term)i.next(), searcher);
} }
return idf; return idf;
} }

View File

@ -0,0 +1,236 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
/**
 * Enumerates the matches of a {@link SpanNearQuery}: positions where one span
 * from every clause occurs in the same document, with at most <code>slop</code>
 * unmatched positions between them and, optionally, in clause order.
 *
 * <p>The clause spans are kept in two structures at once: a linked list
 * (<code>first</code>..<code>last</code>) ordered by document only, used to
 * bring all clauses onto the same document, and a {@link SpanQueue} used to
 * find the minimum span once they are.  The <code>listStale</code> and
 * <code>queueStale</code> flags record which structure must be rebuilt from
 * the other before it is used again.
 */
class NearSpans implements Spans {
  private SpanNearQuery query;

  private List ordered = new ArrayList();         // spans in query order
  private int slop;                               // from query
  private boolean inOrder;                        // from query

  private SpansCell first;                        // linked list of spans
  private SpansCell last;                         // sorted by doc only

  private int totalLength;                        // sum of current lengths

  private SpanQueue queue;                        // sorted queue of spans
  private SpansCell max;                          // max element in queue

  private boolean more = true;                    // true iff not done
  private boolean firstTime = true;               // true before first next()/skipTo()

  private boolean queueStale = false;             // true if queue not sorted
  private boolean listStale = true;               // true if list not sorted

  /** Wraps a Spans, and can be used to form a linked list.  Every move also
   * keeps the enclosing <code>totalLength</code> and <code>max</code> up to
   * date. */
  private class SpansCell implements Spans {
    private Spans spans;
    private SpansCell next;
    private int length = -1;                      // -1 until first positioned

    public SpansCell(Spans spans) { this.spans = spans; }

    public boolean next() throws IOException {
      if (length != -1)                           // subtract old length
        totalLength -= length;

      boolean more = spans.next();                // move to next

      if (more) {
        length = end() - start();                 // compute new length
        totalLength += length;                    // add new length to total

        if (max == null || doc() > max.doc() ||   // maintain max
            (doc() == max.doc() && end() > max.end()))
          max = this;
      }

      return more;
    }

    public boolean skipTo(int target) throws IOException {
      if (length != -1)                           // subtract old length
        totalLength -= length;

      boolean more = spans.skipTo(target);        // skip

      if (more) {
        length = end() - start();                 // compute new length
        totalLength += length;                    // add new length to total

        if (max == null || doc() > max.doc() ||   // maintain max
            (doc() == max.doc() && end() > max.end()))
          max = this;
      }

      return more;
    }

    public int doc() { return spans.doc(); }
    public int start() { return spans.start(); }
    public int end() { return spans.end(); }

    public String toString() { return spans.toString(); }
  }

  /** Creates the spans of every clause of <code>query</code> against
   * <code>reader</code>; no clause is advanced until the first call to
   * {@link #next()} or {@link #skipTo(int)}. */
  public NearSpans(SpanNearQuery query, IndexReader reader)
    throws IOException {
    this.query = query;
    this.slop = query.getSlop();
    this.inOrder = query.isInOrder();

    SpanQuery[] clauses = query.getClauses();     // initialize spans & list
    queue = new SpanQueue(clauses.length);
    for (int i = 0; i < clauses.length; i++) {
      SpansCell cell =                            // construct clause spans
        new SpansCell(clauses[i].getSpans(reader));
      ordered.add(cell);                          // add to ordered
    }
  }

  /** Moves to the next match, returning true iff one exists. */
  public boolean next() throws IOException {
    if (firstTime) {
      initList(true);
      listToQueue();                              // initialize queue
      firstTime = false;
    } else {
      more = last.next();                         // trigger scan
      queueStale = true;
    }

    while (more) {

      if (listStale) {                            // maintain list
        queueToList();
        listStale = false;
      }

      // skip to doc w/ all clauses

      while (more && first.doc() < last.doc()) {
        more = first.skipTo(last.doc());          // skip first upto last
        firstToLast();                            // and move it to the end
        queueStale = true;
      }

      if (!more) return false;

      // found doc w/ all clauses

      if (queueStale) {                           // maintain the queue
        listToQueue();
        queueStale = false;
      }

      int matchLength = max.end() - min().start();
      if (((matchLength - totalLength) <= slop)   // check slop
          && (!inOrder || matchIsOrdered())) {    // check order
        return true;
      }

      more = min().next();                        // trigger further scanning

      if (more) {
        queue.adjustTop();                        // maintain queue
        if (min().doc() != max.doc()) {
          listStale = true;                       // maintain list
        }
      }
    }
    return false;                                 // no more matches
  }

  /** Skips every clause to <code>target</code> and returns true iff a match
   * is found there or later.  A candidate is accepted under the same slop
   * and order conditions that {@link #next()} applies. */
  public boolean skipTo(int target) throws IOException {
    if (firstTime) {
      initList(false);
      firstTime = false;
    }

    for (SpansCell cell = first; more && cell!=null; cell=cell.next) {
      more = cell.skipTo(target);
    }

    if (more) {
      listToQueue();
      listStale = true;

      if (min().doc() == max.doc()) {             // at a match?
        int matchLength = max.end() - min().start();
        // The order check was previously missing here, so an in-order query
        // could report an out-of-order match that next() would have rejected.
        if (((matchLength - totalLength) <= slop)   // check slop
            && (!inOrder || matchIsOrdered())) {    // check order
          return true;
        }
      }
      return next();                              // no, scan
    }

    return false;
  }

  /** The smallest span in the queue, per {@link SpanQueue} ordering. */
  private SpansCell min() { return (SpansCell)queue.top(); }

  public int doc() { return min().doc(); }
  public int start() { return min().start(); }
  public int end() { return max.end(); }

  public String toString() { return "spans(" + query.toString() + ")"; }

  /** Links the clause cells into the list in query order; when
   * <code>next</code> is true each cell is first moved to its first entry.
   * Stops at the first exhausted clause. */
  private void initList(boolean next) throws IOException {
    for (int i = 0; more && i < ordered.size(); i++) {
      SpansCell cell = (SpansCell)ordered.get(i);
      if (next)
        more = cell.next();                       // move to first entry
      if (more) {
        addToList(cell);                          // add to list
      }
    }
  }

  /** Appends <code>cell</code> to the end of the linked list. */
  private void addToList(SpansCell cell) {
    if (last != null) {                           // add next to end of list
      last.next = cell;
    } else
      first = cell;
    last = cell;
    cell.next = null;
  }

  /** Moves the head of the linked list to its tail. */
  private void firstToLast() {
    last.next = first;                            // move first to end of list
    last = first;
    first = first.next;
    last.next = null;
  }

  /** Rebuilds the linked list by draining the queue in queue order. */
  private void queueToList() {
    last = first = null;
    while (queue.top() != null) {
      addToList((SpansCell)queue.pop());
    }
  }

  /** Rebuilds the queue from the cells currently in the linked list. */
  private void listToQueue() {
    queue.clear();
    for (SpansCell cell = first; cell != null; cell = cell.next) {
      queue.put(cell);                            // build queue from list
    }
  }

  /** Returns true iff the current start positions strictly increase when the
   * clauses are visited in query order. */
  private boolean matchIsOrdered() {
    int lastStart = -1;
    for (int i = 0; i < ordered.size(); i++) {
      int start = ((SpansCell)ordered.get(i)).start();
      if (!(start > lastStart))
        return false;
      lastStart = start;
    }
    return true;
  }
}

View File

@ -0,0 +1,74 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.index.IndexReader;
/** Matches spans near the beginning of a field: only those spans of the
 * wrapped query that end at or before a given position are kept. */
public class SpanFirstQuery extends SpanQuery {
  private SpanQuery match;
  private int end;

  /** Construct a SpanFirstQuery matching spans in <code>match</code> whose end
   * position is less than or equal to <code>end</code>. */
  public SpanFirstQuery(SpanQuery match, int end) {
    this.match = match;
    this.end = end;
  }

  /** Return the SpanQuery whose matches are filtered. */
  public SpanQuery getMatch() { return match; }

  /** Return the maximum end position permitted in a match. */
  public int getEnd() { return end; }

  /** The field is that of the wrapped query. */
  public String getField() { return match.getField(); }

  /** The terms are those of the wrapped query. */
  public Collection getTerms() { return match.getTerms(); }

  /** Renders as <code>spanFirst(&lt;match&gt;, &lt;end&gt;)</code>. */
  public String toString(String field) {
    return "spanFirst(" + match.toString(field) + ", " + end + ")";
  }

  /** Returns the wrapped query's spans, filtered by end position. */
  public Spans getSpans(final IndexReader reader) throws IOException {
    return new Spans() {
        private Spans spans = match.getSpans(reader);

        public boolean next() throws IOException {
          boolean found = false;
          while (!found && spans.next()) {        // scan to next match
            found = spans.end() <= end;
          }
          return found;
        }

        public boolean skipTo(int target) throws IOException {
          if (spans.skipTo(target)) {
            // accept the landing span if it qualifies, otherwise keep scanning
            return spans.end() <= end || next();
          }
          return false;
        }

        public int doc() { return spans.doc(); }
        public int start() { return spans.start(); }
        public int end() { return spans.end(); }

        public String toString() {
          return "spans(" + SpanFirstQuery.this.toString() + ")";
        }

      };
  }

}

View File

@ -0,0 +1,97 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
/** Matches spans which are near one another.  One can specify <i>slop</i>, the
 * maximum number of intervening unmatched positions, as well as whether
 * matches are required to be in-order. */
public class SpanNearQuery extends SpanQuery {
  private List clauses;
  private int slop;
  private boolean inOrder;

  private String field;

  /** Construct a SpanNearQuery.  Matches spans matching a span from each
   * clause, with up to <code>slop</code> total unmatched positions between
   * them.  When <code>inOrder</code> is true, the spans from each clause
   * must be ordered as in <code>clauses</code>.
   *
   * @throws IllegalArgumentException if the clauses do not all report the
   *         same field as the first clause
   */
  public SpanNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) {
    this.slop = slop;
    this.inOrder = inOrder;

    // copy clauses array into an ArrayList, checking the field as we go
    this.clauses = new ArrayList(clauses.length);
    for (int i = 0; i < clauses.length; i++) {
      SpanQuery clause = clauses[i];
      String clauseField = clause.getField();
      if (i == 0) {                               // first clause sets the field
        field = clauseField;
      } else if (!clauseField.equals(field)) {
        throw new IllegalArgumentException("Clauses must have same field.");
      }
      this.clauses.add(clause);
    }
  }

  /** Return the clauses whose spans are matched, as a new array. */
  public SpanQuery[] getClauses() {
    SpanQuery[] result = new SpanQuery[clauses.size()];
    clauses.toArray(result);
    return result;
  }

  /** Return the maximum number of intervening unmatched positions permitted.*/
  public int getSlop() { return slop; }

  /** Return true if matches are required to be in-order.*/
  public boolean isInOrder() { return inOrder; }

  public String getField() { return field; }

  /** Returns the terms of all clauses, concatenated in clause order. */
  public Collection getTerms() {
    Collection terms = new ArrayList();
    for (int i = 0; i < clauses.size(); i++) {
      terms.addAll(((SpanQuery)clauses.get(i)).getTerms());
    }
    return terms;
  }

  /** Renders as <code>spanNear([c1, c2, ...], slop, inOrder)</code>. */
  public String toString(String field) {
    StringBuffer buffer = new StringBuffer("spanNear([");
    for (int i = 0; i < clauses.size(); i++) {
      if (i > 0) {
        buffer.append(", ");
      }
      buffer.append(((SpanQuery)clauses.get(i)).toString(field));
    }
    buffer.append("], ").append(slop);
    buffer.append(", ").append(inOrder);
    buffer.append(")");
    return buffer.toString();
  }

  /** Returns the spans of this query; the zero- and one-clause cases are
   * delegated rather than handled by {@link NearSpans}. */
  public Spans getSpans(final IndexReader reader) throws IOException {
    switch (clauses.size()) {
      case 0:                                     // optimize 0-clause case
        return new SpanOrQuery(getClauses()).getSpans(reader);
      case 1:                                     // optimize 1-clause case
        return ((SpanQuery)clauses.get(0)).getSpans(reader);
      default:
        return new NearSpans(this, reader);
    }
  }

}

View File

@ -0,0 +1,114 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.index.IndexReader;
/** Removes matches which overlap with another SpanQuery. */
public class SpanNotQuery extends SpanQuery {
  private SpanQuery include;
  private SpanQuery exclude;

  /** Construct a SpanNotQuery matching spans from <code>include</code> which
   * have no overlap with spans from <code>exclude</code>.
   *
   * @throws IllegalArgumentException if the two queries report different
   *         fields
   */
  public SpanNotQuery(SpanQuery include, SpanQuery exclude) {
    this.include = include;
    this.exclude = exclude;

    if (!include.getField().equals(exclude.getField()))
      throw new IllegalArgumentException("Clauses must have same field.");
  }

  /** Return the SpanQuery whose matches are filtered. */
  public SpanQuery getInclude() { return include; }

  /** Return the SpanQuery whose matches must not overlap those returned. */
  public SpanQuery getExclude() { return exclude; }

  /** The field is that of the include query. */
  public String getField() { return include.getField(); }

  /** Only the include query's terms are reported. */
  public Collection getTerms() { return include.getTerms(); }

  /** Renders as <code>spanNot(&lt;include&gt;, &lt;exclude&gt;)</code>. */
  public String toString(String field) {
    StringBuffer buffer = new StringBuffer();
    buffer.append("spanNot(");
    buffer.append(include.toString(field));
    buffer.append(", ");
    buffer.append(exclude.toString(field));
    buffer.append(")");
    return buffer.toString();
  }

  /** Returns the include query's spans with every span that overlaps an
   * exclude span in the same document removed. */
  public Spans getSpans(final IndexReader reader) throws IOException {
    return new Spans() {
        private Spans includeSpans = include.getSpans(reader);
        private boolean moreInclude = true;

        private Spans excludeSpans = exclude.getSpans(reader);
        // Position the exclude spans on their first match up front.  Their
        // doc()/start()/end() are invalid until advanced, and the comparisons
        // below read them before anything else would advance them; starting
        // with "true" and an unpositioned enumeration could wrongly reject
        // the first include span.
        private boolean moreExclude = excludeSpans.next();

        public boolean next() throws IOException {
          if (moreInclude)                        // move to next include
            moreInclude = includeSpans.next();

          while (moreInclude && moreExclude) {

            if (includeSpans.doc() > excludeSpans.doc()) // skip exclude
              moreExclude = excludeSpans.skipTo(includeSpans.doc());

            while (moreExclude                    // while exclude is before
                   && includeSpans.doc() == excludeSpans.doc()
                   && excludeSpans.end() <= includeSpans.start()) {
              moreExclude = excludeSpans.next();  // increment exclude
            }

            if (!moreExclude                      // if no intersection
                || includeSpans.doc() != excludeSpans.doc()
                || includeSpans.end() <= excludeSpans.start())
              break;                              // we found a match

            moreInclude = includeSpans.next();    // intersected: keep scanning
          }
          return moreInclude;
        }

        public boolean skipTo(int target) throws IOException {
          if (moreInclude)                        // skip include
            moreInclude = includeSpans.skipTo(target);

          if (!moreInclude)
            return false;

          if (moreExclude                         // skip exclude
              && includeSpans.doc() > excludeSpans.doc())
            moreExclude = excludeSpans.skipTo(includeSpans.doc());

          while (moreExclude                      // while exclude is before
                 && includeSpans.doc() == excludeSpans.doc()
                 && excludeSpans.end() <= includeSpans.start()) {
            moreExclude = excludeSpans.next();    // increment exclude
          }

          if (!moreExclude                        // if no intersection
              || includeSpans.doc() != excludeSpans.doc()
              || includeSpans.end() <= excludeSpans.start())
            return true;                          // we found a match

          return next();                          // scan to next match
        }

        public int doc() { return includeSpans.doc(); }
        public int start() { return includeSpans.start(); }
        public int end() { return includeSpans.end(); }

        public String toString() {
          return "spans(" + SpanNotQuery.this.toString() + ")";
        }

      };
  }

}

View File

@ -0,0 +1,132 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.List;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
/** Matches the union of its clauses.*/
public class SpanOrQuery extends SpanQuery {
  private List clauses;
  private String field;

  /** Construct a SpanOrQuery merging the provided clauses.
   *
   * @throws IllegalArgumentException if the clauses do not all report the
   *         same field as the first clause
   */
  public SpanOrQuery(SpanQuery[] clauses) {

    // copy clauses array into an ArrayList
    this.clauses = new ArrayList(clauses.length);
    for (int i = 0; i < clauses.length; i++) {
      SpanQuery clause = clauses[i];
      if (i == 0) {                               // check field
        field = clause.getField();
      } else if (!clause.getField().equals(field)) {
        throw new IllegalArgumentException("Clauses must have same field.");
      }
      this.clauses.add(clause);
    }
  }

  /** Return the clauses whose spans are matched. */
  public SpanQuery[] getClauses() {
    return (SpanQuery[])clauses.toArray(new SpanQuery[clauses.size()]);
  }

  public String getField() { return field; }

  /** Returns the terms of all clauses, concatenated in clause order. */
  public Collection getTerms() {
    Collection terms = new ArrayList();
    Iterator i = clauses.iterator();
    while (i.hasNext()) {
      SpanQuery clause = (SpanQuery)i.next();
      terms.addAll(clause.getTerms());
    }
    return terms;
  }

  /** Renders as <code>spanOr([c1, c2, ...])</code>. */
  public String toString(String field) {
    StringBuffer buffer = new StringBuffer();
    buffer.append("spanOr([");
    Iterator i = clauses.iterator();
    while (i.hasNext()) {
      SpanQuery clause = (SpanQuery)i.next();
      buffer.append(clause.toString(field));
      if (i.hasNext()) {
        buffer.append(", ");
      }
    }
    buffer.append("])");
    return buffer.toString();
  }

  /** Returns the merged spans of all clauses.  The queue holds exactly the
   * clause spans that are positioned on a match not yet passed; its top is
   * the current match. */
  public Spans getSpans(final IndexReader reader) throws IOException {
    if (clauses.size() == 1)                      // optimize 1-clause case
      return ((SpanQuery)clauses.get(0)).getSpans(reader);

    return new Spans() {
        private List all = new ArrayList(clauses.size());
        private SpanQueue queue = new SpanQueue(clauses.size());

        {
          Iterator i = clauses.iterator();
          while (i.hasNext()) {                   // initialize all
            all.add(((SpanQuery)i.next()).getSpans(reader));
          }
        }

        private boolean firstTime = true;

        public boolean next() throws IOException {
          if (firstTime) {                        // first time -- initialize
            for (int i = 0; i < all.size(); i++) {
              Spans spans = (Spans)all.get(i);
              if (spans.next()) {                 // move to first entry
                queue.put(spans);                 // build queue
              }
            }
            firstTime = false;
            return queue.size() != 0;
          }

          if (queue.size() == 0) {                // all done
            return false;
          }

          if (top().next()) {                     // move to next
            queue.adjustTop();
            return true;
          }

          queue.pop();                            // exhausted a clause

          return queue.size() != 0;
        }

        private Spans top() { return (Spans)queue.top(); }

        public boolean skipTo(int target) throws IOException {
          if (firstTime) {                        // nothing positioned yet:
            for (int i = 0; i < all.size(); i++) { // skip every clause
              Spans spans = (Spans)all.get(i);
              if (spans.skipTo(target)) {
                queue.put(spans);                 // build queue
              }
            }
            firstTime = false;
            return queue.size() != 0;
          }

          // Step beyond the current match, then advance only the clauses that
          // still lag behind the target.  Skipping every clause, as was done
          // before, threw away pending matches of clauses already at or past
          // the target and called skipTo on clauses that were exhausted.
          if (!next()) {
            return false;
          }
          while (queue.size() != 0 && top().doc() < target) {
            if (top().skipTo(target)) {
              queue.adjustTop();
            } else {
              queue.pop();                        // exhausted a clause
            }
          }
          return queue.size() != 0;
        }

        public int doc() { return top().doc(); }
        public int start() { return top().start(); }
        public int end() { return top().end(); }

        public String toString() {
          return "spans(" + SpanOrQuery.this.toString() + ")";
        }

      };
  }

}

View File

@ -0,0 +1,29 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Searcher;
/** Base class for span-based queries.  Subclasses supply the span
 * enumeration, the field and the terms; weighting is shared. */
public abstract class SpanQuery extends Query {

  /** Expert: Returns the matches for this query in an index.  Used internally
   * to search for spans. */
  public abstract Spans getSpans(IndexReader reader) throws IOException;

  /** Returns the name of the field matched by this query.*/
  public abstract String getField();

  /** Returns a collection of all terms matched by this query.*/
  public abstract Collection getTerms();

  /** Creates the {@link SpanWeight} for this query and the given searcher. */
  protected Weight createWeight(Searcher searcher) {
    Weight spanWeight = new SpanWeight(this, searcher);
    return spanWeight;
  }

}

View File

@ -0,0 +1,23 @@
package org.apache.lucene.search.spans;
import org.apache.lucene.util.PriorityQueue;
/** A priority queue of {@link Spans}, ordered by document, then start
 * position, then end position of each entry's current match. */
class SpanQueue extends PriorityQueue {

  /** Creates a queue sized for <code>size</code> entries. */
  public SpanQueue(int size) {
    initialize(size);
  }

  /** Compares the current matches of two Spans: by doc first, then by start,
   * then by end. */
  protected final boolean lessThan(Object o1, Object o2) {
    Spans left = (Spans)o1;
    Spans right = (Spans)o2;

    if (left.doc() != right.doc()) {
      return left.doc() < right.doc();
    }
    if (left.start() != right.start()) {
      return left.start() < right.start();
    }
    return left.end() < right.end();
  }
}

View File

@ -0,0 +1,89 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Similarity;
/**
 * Scores documents from a {@link Spans} enumeration.  For each document the
 * sloppy frequencies of all its spans are summed into <code>freq</code>.
 *
 * <p>Invariant: after a successful {@link #next()} or {@link #skipTo(int)},
 * <code>doc</code>/<code>freq</code> describe the current document, while
 * <code>spans</code> has already been moved to the first span of the
 * following document (or is exhausted, in which case <code>more</code> is
 * false).
 */
class SpanScorer extends Scorer {
  private Spans spans;
  private Weight weight;
  private byte[] norms;
  private float value;

  private boolean firstTime = true;               // true until spans is first moved
  private boolean more = true;                    // true iff spans is on a span

  private int doc;
  private float freq;

  SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
    throws IOException {
    super(similarity);
    this.spans = spans;
    this.norms = norms;
    this.weight = weight;
    this.value = weight.getValue();
  }

  /** Moves to the next matching document, returning true iff one exists. */
  public boolean next() throws IOException {
    if (firstTime) {
      more = spans.next();
      firstTime = false;
    }
    return setFreqCurrentDoc();
  }

  /** Makes the document <code>spans</code> is positioned on the current
   * document and sums the sloppy frequency of all of its spans, leaving
   * <code>spans</code> on the following document.  Returns false iff
   * <code>spans</code> was already exhausted. */
  private boolean setFreqCurrentDoc() throws IOException {
    if (!more) return false;

    freq = 0.0f;
    doc = spans.doc();

    while (more && doc == spans.doc()) {
      int matchLength = spans.end() - spans.start();
      freq += getSimilarity().sloppyFreq(matchLength);
      more = spans.next();
    }

    return more || freq != 0.0f;
  }

  public int doc() { return doc; }

  /** Scores the current document: tf(freq) times the weight value, scaled by
   * the document's decoded field norm. */
  public float score() throws IOException {
    float raw = getSimilarity().tf(freq) * value; // raw score
    return raw * Similarity.decodeNorm(norms[doc]); // normalize
  }

  /** Moves to the first matching document beyond the current one whose number
   * is greater than or equal to <code>target</code>.
   *
   * <p>Earlier this method always called <code>spans.skipTo(target)</code>
   * and only accumulated spans whose document equalled <code>target</code>.
   * That (a) reported a later landing document with a frequency of zero,
   * (b) dropped the first span of a document that <code>spans</code> was
   * already positioned on, and (c) left <code>firstTime</code> set so that a
   * following {@link #next()} skipped a span. */
  public boolean skipTo(int target) throws IOException {
    if (firstTime) {
      more = spans.skipTo(target);
      firstTime = false;
    } else if (more && spans.doc() < target) {
      // spans already sits beyond the current document; only move it when it
      // has not yet reached the target
      more = spans.skipTo(target);
    }
    return setFreqCurrentDoc();
  }

  /** Explains the term-frequency factor for <code>doc</code>; the frequency
   * is taken as zero when skipping does not land on that document. */
  public Explanation explain(final int doc) throws IOException {
    Explanation tfExplanation = new Explanation();

    skipTo(doc);

    float phraseFreq = (doc() == doc) ? freq : 0.0f;
    tfExplanation.setValue(getSimilarity().tf(phraseFreq));
    tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");

    return tfExplanation;
  }

}

View File

@ -0,0 +1,84 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Collection;
import java.util.ArrayList;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
/** Matches spans containing a term.  Each occurrence of the term yields one
 * span covering exactly that position. */
public class SpanTermQuery extends SpanQuery {
  private Term term;

  /** Construct a SpanTermQuery matching the named term's spans. */
  public SpanTermQuery(Term term) { this.term = term; }

  /** Return the term whose spans are matched. */
  public Term getTerm() { return term; }

  public String getField() { return term.field(); }

  /** Returns a new collection holding just this query's term. */
  public Collection getTerms() {
    Collection single = new ArrayList();
    single.add(term);
    return single;
  }

  /** Returns only the term text when the term's field equals
   * <code>field</code>, otherwise the term's full string form. */
  public String toString(String field) {
    return term.field().equals(field) ? term.text() : term.toString();
  }

  /** Returns one span per position of the term, read from the reader's
   * term positions. */
  public Spans getSpans(final IndexReader reader) throws IOException {
    return new Spans() {
        private TermPositions positions = reader.termPositions(term);

        private int doc;
        private int freq;                         // positions in current doc
        private int count;                        // positions consumed so far
        private int position;

        public boolean next() throws IOException {
          if (count == freq) {                    // current doc used up
            if (!positions.next())
              return false;
            enterDoc();
          }
          readPosition();
          return true;
        }

        public boolean skipTo(int target) throws IOException {
          if (!positions.skipTo(target))
            return false;
          enterDoc();
          readPosition();
          return true;
        }

        /** Records the document just reached and resets the position count. */
        private void enterDoc() {
          doc = positions.doc();
          freq = positions.freq();
          count = 0;
        }

        /** Consumes the next position of the current document. */
        private void readPosition() throws IOException {
          position = positions.nextPosition();
          count++;
        }

        public int doc() { return doc; }
        public int start() { return position; }
        public int end() { return position + 1; }

        public String toString() {
          return "spans(" + SpanTermQuery.this.toString() + ")";
        }

      };
  }

}

View File

@ -0,0 +1,127 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Iterator;
import java.util.Collection;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Similarity;
/** The {@link Weight} of a {@link SpanQuery}: computes the query weight from
 * the summed idf of the query's terms and the query boost, creates
 * {@link SpanScorer}s, and explains scores. */
class SpanWeight implements Weight {
  private Searcher searcher;
  private float value;
  private float idf;
  private float queryNorm;
  private float queryWeight;

  private Collection terms;
  private SpanQuery query;

  /** Captures the query, its terms and the searcher. */
  public SpanWeight(SpanQuery query, Searcher searcher) {
    this.searcher = searcher;
    this.query = query;
    this.terms = query.getTerms();
  }

  public Query getQuery() { return query; }
  public float getValue() { return value; }

  /** Computes idf and the un-normalized query weight (idf times boost) and
   * returns the square of that weight. */
  public float sumOfSquaredWeights() throws IOException {
    Similarity similarity = searcher.getSimilarity();
    idf = similarity.idf(terms, searcher);
    queryWeight = idf * query.getBoost();         // compute query weight
    return queryWeight * queryWeight;             // square it
  }

  /** Applies the query norm to the query weight and derives the value used
   * for scoring (normalized query weight times idf). */
  public void normalize(float queryNorm) {
    this.queryNorm = queryNorm;
    queryWeight *= queryNorm;                     // normalize query weight
    value = queryWeight * idf;                    // idf for document
  }

  /** Creates a scorer over the query's spans and the norms of its field. */
  public Scorer scorer(IndexReader reader) throws IOException {
    Spans spans = query.getSpans(reader);
    Similarity similarity = searcher.getSimilarity();
    byte[] norms = reader.norms(query.getField());
    return new SpanScorer(spans, this, similarity, norms);
  }

  /** Explains the score of <code>doc</code> as the product of a query weight
   * and a field weight; when the query weight is exactly 1 only the field
   * weight explanation is returned. */
  public Explanation explain(IndexReader reader, int doc)
    throws IOException {

    Explanation total = new Explanation();
    total.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
    String fieldName = ((SpanQuery)getQuery()).getField();

    Explanation idfExpl =
      new Explanation(idf, "idf(" + fieldName + ": " + docFreqSummary() + ")");

    // explain query weight
    Explanation queryExpl = new Explanation();
    queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");

    float boost = getQuery().getBoost();
    Explanation boostExpl = new Explanation(boost, "boost");
    if (boost != 1.0f) {                          // only show a non-trivial boost
      queryExpl.addDetail(boostExpl);
    }
    queryExpl.addDetail(idfExpl);

    Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
    queryExpl.addDetail(queryNormExpl);

    queryExpl.setValue(boostExpl.getValue() *
                       idfExpl.getValue() *
                       queryNormExpl.getValue());

    total.addDetail(queryExpl);

    // explain field weight
    Explanation fieldExpl = new Explanation();
    fieldExpl.setDescription("fieldWeight("+fieldName+":"+query.toString(fieldName)+
                             " in "+doc+"), product of:");

    Explanation tfExpl = scorer(reader).explain(doc);
    fieldExpl.addDetail(tfExpl);
    fieldExpl.addDetail(idfExpl);

    byte[] fieldNorms = reader.norms(fieldName);
    float fieldNorm = 0.0f;                       // used when the field has no norms
    if (fieldNorms != null) {
      fieldNorm = Similarity.decodeNorm(fieldNorms[doc]);
    }
    Explanation fieldNormExpl = new Explanation();
    fieldNormExpl.setValue(fieldNorm);
    fieldNormExpl.setDescription("fieldNorm(field="+fieldName+", doc="+doc+")");
    fieldExpl.addDetail(fieldNormExpl);

    fieldExpl.setValue(tfExpl.getValue() *
                       idfExpl.getValue() *
                       fieldNormExpl.getValue());

    total.addDetail(fieldExpl);

    // combine them
    total.setValue(queryExpl.getValue() * fieldExpl.getValue());

    return queryExpl.getValue() == 1.0f ? fieldExpl : total;
  }

  /** Builds the space-separated <code>text=docFreq</code> list of the
   * query's terms shown in the idf explanation. */
  private String docFreqSummary() throws IOException {
    StringBuffer summary = new StringBuffer();
    for (Iterator it = terms.iterator(); it.hasNext(); ) {
      Term term = (Term)it.next();
      summary.append(term.text());
      summary.append("=");
      summary.append(searcher.docFreq(term));
      if (it.hasNext()) {
        summary.append(" ");
      }
    }
    return summary.toString();
  }
}

View File

@ -0,0 +1,37 @@
package org.apache.lucene.search.spans;
import java.io.IOException;
/** Expert: an enumeration of span matches.  Used to implement span searching.
 * Each span represents a range of term positions within a document.  Matches
 * are enumerated in order, by increasing document number, within that by
 * increasing start position and finally by increasing end position.
 *
 * <p>An enumeration starts out unpositioned: {@link #doc()}, {@link #start()}
 * and {@link #end()} are only meaningful after {@link #next()} or
 * {@link #skipTo(int)} has returned true. */
public interface Spans {
/** Move to the next match, returning true iff any such exists.
 *
 * @return true iff the enumeration is now positioned on a match
 * @throws IOException declared for implementations that read from an index
 */
boolean next() throws IOException;
/** Skips to the first match beyond the current whose document number is
 * greater than or equal to <i>target</i>. <p>Returns true iff there is such
 * a match.  <p>Behaves as if written: <pre>
 *   boolean skipTo(int target) {
 *     do {
 *       if (!next())
 *         return false;
 *     } while (target > doc());
 *     return true;
 *   }
 * </pre>
 * Most implementations are considerably more efficient than that.
 *
 * <p>Note that, as the equivalent code shows, a call always moves past the
 * current match, even when the current document already satisfies
 * <i>target</i>.
 *
 * @param target the smallest acceptable document number
 * @return true iff the enumeration is now positioned on a match
 * @throws IOException declared for implementations that read from an index
 */
boolean skipTo(int target) throws IOException;
/** Returns the document number of the current match.  Initially invalid. */
int doc();
/** Returns the start position of the current match.  Initially invalid. */
int start();
/** Returns the end position of the current match.  Initially invalid. */
int end();
}

View File

@ -0,0 +1,7 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
<head></head>
<body>
The calculus of spans.
</body>
</html>

View File

@ -55,6 +55,12 @@ package org.apache.lucene.search;
*/ */
import junit.framework.TestCase; import junit.framework.TestCase;
import java.io.IOException;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.util.English; import org.apache.lucene.util.English;
import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
@ -63,9 +69,19 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.search.spans.*;
/** /**
* Tests basic search capabilities. * Tests basic search capabilities.
* *
* <p>Uses a collection of 1000 documents, each the english rendition of their
* document number. For example, the document numbered 333 has text "three
* hundred thirty three".
*
* <p>Tests are each a single query, and its hits are checked to ensure that
* all and only the correct documents are returned, thus providing end-to-end
* testing of the indexing and search code.
*
* @author Doug Cutting * @author Doug Cutting
*/ */
public class TestBasics extends TestCase { public class TestBasics extends TestCase {
@ -90,46 +106,181 @@ public class TestBasics extends TestCase {
public void testTerm() throws Exception { public void testTerm() throws Exception {
Query query = new TermQuery(new Term("field", "seventy")); Query query = new TermQuery(new Term("field", "seventy"));
Hits hits = searcher.search(query); checkHits(query, new int[]
assertEquals(100, hits.length()); {70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 170, 171, 172, 173, 174, 175,
176, 177, 178, 179, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 470, 471, 472, 473,
474, 475, 476, 477, 478, 479, 570, 571, 572, 573, 574, 575, 576, 577,
578, 579, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 770, 771,
772, 773, 774, 775, 776, 777, 778, 779, 870, 871, 872, 873, 874, 875,
876, 877, 878, 879, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979});
} }
public void testTerm2() throws Exception { public void testTerm2() throws Exception {
Query query = new TermQuery(new Term("field", "seventish")); Query query = new TermQuery(new Term("field", "seventish"));
Hits hits = searcher.search(query); checkHits(query, new int[] {});
assertEquals(0, hits.length());
} }
public void testPhrase() throws Exception { public void testPhrase() throws Exception {
PhraseQuery query = new PhraseQuery(); PhraseQuery query = new PhraseQuery();
query.add(new Term("field", "seventy")); query.add(new Term("field", "seventy"));
query.add(new Term("field", "seven")); query.add(new Term("field", "seven"));
Hits hits = searcher.search(query); checkHits(query, new int[]
assertEquals(10, hits.length()); {77, 177, 277, 377, 477, 577, 677, 777, 877, 977});
} }
public void testPhrase2() throws Exception { public void testPhrase2() throws Exception {
PhraseQuery query = new PhraseQuery(); PhraseQuery query = new PhraseQuery();
query.add(new Term("field", "seventish")); query.add(new Term("field", "seventish"));
query.add(new Term("field", "sevenon")); query.add(new Term("field", "sevenon"));
Hits hits = searcher.search(query); checkHits(query, new int[] {});
assertEquals(0, hits.length());
} }
public void testBoolean() throws Exception { public void testBoolean() throws Exception {
BooleanQuery query = new BooleanQuery(); BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("field", "seventy")), true, false); query.add(new TermQuery(new Term("field", "seventy")), true, false);
query.add(new TermQuery(new Term("field", "seven")), true, false); query.add(new TermQuery(new Term("field", "seven")), true, false);
Hits hits = searcher.search(query); checkHits(query, new int[]
assertEquals(19, hits.length()); {77, 777, 177, 277, 377, 477, 577, 677, 770, 771, 772, 773, 774, 775,
776, 778, 779, 877, 977});
} }
public void testBoolean2() throws Exception { public void testBoolean2() throws Exception {
BooleanQuery query = new BooleanQuery(); BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("field", "sevento")), true, false); query.add(new TermQuery(new Term("field", "sevento")), true, false);
query.add(new TermQuery(new Term("field", "sevenly")), true, false); query.add(new TermQuery(new Term("field", "sevenly")), true, false);
checkHits(query, new int[] {});
}
/**
 * Span-near query over "seventy" and "seven" constructed with (0, true);
 * expects the same ten documents as the equivalent phrase test.
 *
 * NOTE(review): 0 is presumably the allowed slop and true the in-order
 * flag -- confirm against the SpanNearQuery constructor.
 */
public void testSpanNearExact() throws Exception {
  SpanQuery[] clauses = {
    new SpanTermQuery(new Term("field", "seventy")),
    new SpanTermQuery(new Term("field", "seven"))
  };
  SpanNearQuery query = new SpanNearQuery(clauses, 0, true);

  int[] expected = {77, 177, 277, 377, 477, 577, 677, 777, 877, 977};
  checkHits(query, expected);

  // Debugging aid: System.out.println(searcher.explain(query, 77));
  // Debugging aid: System.out.println(searcher.explain(query, 977));
}
/**
 * Span-near query over "nine" and "six" constructed with (4, false);
 * expects documents from both the 6xx and the 9xx ranges.
 *
 * NOTE(review): 4 is presumably the allowed slop and false means the
 * clauses may match in either order -- confirm against SpanNearQuery.
 */
public void testSpanNearUnordered() throws Exception {
  SpanQuery[] clauses = {
    new SpanTermQuery(new Term("field", "nine")),
    new SpanTermQuery(new Term("field", "six"))
  };
  SpanNearQuery query = new SpanNearQuery(clauses, 4, false);

  int[] expected = {
    609, 629, 639, 649, 659, 669, 679, 689, 699,
    906, 926, 936, 946, 956, 966, 976, 986, 996
  };
  checkHits(query, expected);
}
/**
 * Span-near query over "nine" and "six" constructed with (4, true);
 * expects only the listed 9xx documents.
 *
 * NOTE(review): 4 is presumably the allowed slop and true requires the
 * clauses to match in the given order -- confirm against SpanNearQuery.
 */
public void testSpanNearOrdered() throws Exception {
  SpanQuery[] clauses = {
    new SpanTermQuery(new Term("field", "nine")),
    new SpanTermQuery(new Term("field", "six"))
  };
  SpanNearQuery query = new SpanNearQuery(clauses, 4, true);

  int[] expected = {906, 926, 936, 946, 956, 966, 976, 986, 996};
  checkHits(query, expected);
}
/**
 * Wraps a span-near query over "eight" and "one" (constructed with 4, true)
 * in a SpanNotQuery whose second argument is the term "forty"; expects the
 * eight listed 8x1 documents.
 *
 * NOTE(review): SpanNotQuery presumably drops spans of the first query that
 * overlap spans of the second -- confirm against its documentation.
 */
public void testSpanNot() throws Exception {
  SpanQuery[] included = {
    new SpanTermQuery(new Term("field", "eight")),
    new SpanTermQuery(new Term("field", "one"))
  };
  SpanNearQuery eightNearOne = new SpanNearQuery(included, 4, true);
  SpanTermQuery excluded = new SpanTermQuery(new Term("field", "forty"));
  SpanNotQuery query = new SpanNotQuery(eightNearOne, excluded);

  int[] expected = {801, 821, 831, 851, 861, 871, 881, 891};
  checkHits(query, expected);

  // Debugging aid: System.out.println(searcher.explain(query, 801));
  // Debugging aid: System.out.println(searcher.explain(query, 891));
}
/**
 * SpanFirstQuery over the term "five" constructed with the bound 1; expects
 * document 5 plus every document from 500 through 599.
 *
 * NOTE(review): the bound presumably limits matches to spans ending within
 * the first position of the field -- confirm against SpanFirstQuery.
 */
public void testSpanFirst() throws Exception {
  SpanTermQuery five = new SpanTermQuery(new Term("field", "five"));
  SpanFirstQuery query = new SpanFirstQuery(five, 1);

  // Expected ids: 5 followed by the contiguous run 500..599 (101 entries).
  int[] expected = new int[101];
  expected[0] = 5;
  for (int offset = 0; offset < 100; offset++) {
    expected[offset + 1] = 500 + offset;
  }
  checkHits(query, expected);

  // Debugging aid: System.out.println(searcher.explain(query, 5));
  // Debugging aid: System.out.println(searcher.explain(query, 599));
}
/**
 * SpanOrQuery combining two span-near queries, "thirty"/"three" and
 * "forty"/"seven" (each constructed with 0, true); expects every listed
 * x33 and x47 document.
 */
public void testSpanOr() throws Exception {
  SpanQuery[] thirtyThree = {
    new SpanTermQuery(new Term("field", "thirty")),
    new SpanTermQuery(new Term("field", "three"))
  };
  SpanNearQuery firstAlternative = new SpanNearQuery(thirtyThree, 0, true);

  SpanQuery[] fortySeven = {
    new SpanTermQuery(new Term("field", "forty")),
    new SpanTermQuery(new Term("field", "seven"))
  };
  SpanNearQuery secondAlternative = new SpanNearQuery(fortySeven, 0, true);

  SpanQuery[] alternatives = {firstAlternative, secondAlternative};
  SpanOrQuery query = new SpanOrQuery(alternatives);

  int[] expected = {
    33, 47, 133, 147, 233, 247, 333, 347, 433, 447,
    533, 547, 633, 647, 733, 747, 833, 847, 933, 947
  };
  checkHits(query, expected);

  // Debugging aid: System.out.println(searcher.explain(query, 33));
  // Debugging aid: System.out.println(searcher.explain(query, 947));
}
/**
 * Nests two span-near queries ("three"/"hundred" and "thirty"/"three")
 * inside an outer span-near query, all constructed with (0, true); expects
 * document 333 as the only hit.
 */
public void testSpanExactNested() throws Exception {
  SpanQuery[] threeHundred = {
    new SpanTermQuery(new Term("field", "three")),
    new SpanTermQuery(new Term("field", "hundred"))
  };
  SpanNearQuery leading = new SpanNearQuery(threeHundred, 0, true);

  SpanQuery[] thirtyThree = {
    new SpanTermQuery(new Term("field", "thirty")),
    new SpanTermQuery(new Term("field", "three"))
  };
  SpanNearQuery trailing = new SpanNearQuery(thirtyThree, 0, true);

  SpanQuery[] nested = {leading, trailing};
  SpanNearQuery query = new SpanNearQuery(nested, 0, true);

  checkHits(query, new int[] {333});

  // Debugging aid: System.out.println(searcher.explain(query, 333));
}
/**
 * Runs <code>query</code> through the test's <code>searcher</code> and
 * asserts that the set of matching document ids equals the set of ids in
 * <code>results</code>.
 *
 * <p>Both sides are collected into sorted sets before comparison, so the
 * order of <code>results</code> and the ranking order of the hits do not
 * matter. The assertion message is <code>query.toString("field")</code>.
 *
 * @param query the query to execute
 * @param results the document ids expected to match
 * @throws IOException if the search fails
 */
private void checkHits(Query query, int[] results) throws IOException {
  Hits hits = searcher.search(query);

  Set correct = new TreeSet();
  for (int i = 0; i < results.length; i++) {
    correct.add(new Integer(results[i]));
  }

  Set actual = new TreeSet();
  for (int i = 0; i < hits.length(); i++) {
    actual.add(new Integer(hits.id(i)));
  }

  assertEquals(query.toString("field"), correct, actual);
}
private void printHits(Query query) throws IOException {
Hits hits = searcher.search(query);
System.out.print("new int[] {");
for (int i = 0; i < hits.length(); i++) {
System.out.print(hits.id(i));
if (i != hits.length()-1)
System.out.print(", ");
}
System.out.println("}");
} }
} }

View File

@ -56,7 +56,7 @@ package org.apache.lucene.search;
import junit.framework.TestCase; import junit.framework.TestCase;
import java.util.Vector; import java.util.Collection;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
@ -81,7 +81,7 @@ public class TestSimilarity extends TestCase {
// Always returns 1.0f, whatever the sum of squared weights is.
public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
// Returns the raw frequency unchanged.
public float tf(float freq) { return freq; }
// Always returns 2.0f, whatever the distance is.
public float sloppyFreq(int distance) { return 2.0f; }
// Always returns 1.0f. Takes a Collection of terms to match the signature of Similarity.idf(Collection, Searcher).
public float idf(Collection terms, Searcher searcher) { return 1.0f; }
// Always returns 1.0f, whatever docFreq and numDocs are.
public float idf(int docFreq, int numDocs) { return 1.0f; }
// Always returns 1.0f, whatever overlap and maxOverlap are.
public float coord(int overlap, int maxOverlap) { return 1.0f; }
} }