SOLR-1179: add DocSet.getTopFilter()

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@777269 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2009-05-21 21:30:38 +00:00
parent 87e52df974
commit 3c16e08dbd
3 changed files with 304 additions and 2 deletions


@@ -19,6 +19,12 @@ package org.apache.solr.search;
import org.apache.solr.common.SolrException;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
 * <code>DocSet</code> represents an unordered set of Lucene Document Ids.
@@ -135,6 +141,13 @@ public interface DocSet /* extends Collection<Integer> */ {
   * Returns the number of documents in this set that are not in the other set.
   */
  public int andNotSize(DocSet other);
  /**
   * Returns a Filter for use in Lucene search methods, assuming this DocSet
   * was generated from the top-level MultiReader that the Lucene search
   * methods will be invoked with.
   */
  public Filter getTopFilter();
}
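// A minimal usage sketch (not part of this patch; searcher, filterQuery and
// mainQuery are hypothetical, and getDocSet() is assumed to be the Solr-side
// helper that builds a DocSet against the top-level reader):
//
//   DocSet docs = searcher.getDocSet(filterQuery);
//   Filter f = docs.getTopFilter();
//   // Search with the same top-level reader the DocSet was built from; the
//   // Filter rebases doc ids for whichever sub-reader Lucene hands it.
//   TopDocs hits = searcher.search(mainQuery, f, 10);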
/** A base class that may be useful for implementing DocSets */
@@ -230,6 +243,49 @@ abstract class DocSetBase implements DocSet {
  public int andNotSize(DocSet other) {
    return this.size() - this.intersectionSize(other);
  }
  public Filter getTopFilter() {
    final OpenBitSet bs = getBits();

    return new Filter() {
      @Override
      public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        // Sum the doc id bases of all ancestors to find where this
        // sub-reader's documents start in the top-level id space.
        int offset = 0;
        SolrIndexReader r = (SolrIndexReader)reader;
        while (r.getParent() != null) {
          offset += r.getBase();
          r = r.getParent();
        }

        if (r == reader) return bs;  // top-level reader: the bits already match

        final int base = offset;
        final int maxDoc = reader.maxDoc();
        final int max = base + maxDoc;   // one past the max doc in this segment.

        return new DocIdSet() {
          public DocIdSetIterator iterator() throws IOException {
            return new DocIdSetIterator() {
              int pos = base - 1;

              public int doc() {
                return pos - base;
              }

              public boolean next() throws IOException {
                pos = bs.nextSetBit(pos + 1);
                return pos >= 0 && pos < max;
              }

              public boolean skipTo(int target) throws IOException {
                pos = bs.nextSetBit(target + base);
                return pos >= 0 && pos < max;
              }
            };
          }
        };
      }
    };
  }
}
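The offset walk above is the core trick: each sub-reader numbers its documents from zero, so the filter sums the bases of the reader's ancestors and subtracts that total when reporting hits. A self-contained sketch of the arithmetic, with invented segment sizes:

    // Rebasing arithmetic (segment sizes invented for illustration):
    // segments with maxDoc 5 and 7 get bases 0 and 5, so a set bit at
    // global position 9 is reported by segment 1's iterator as doc 9-5=4.
    int[] segMaxDoc = {5, 7};
    int base = 0;
    for (int i = 0; i < segMaxDoc.length; i++) {
      int max = base + segMaxDoc[i];  // one past this segment's last global id
      System.out.println("segment " + i + ": global ids [" + base + "," + max + ")");
      base = max;                     // next segment starts where this one ends
    }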


@@ -18,6 +18,12 @@
package org.apache.solr.search;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
 * <code>SortedIntDocSet</code> represents a sorted set of Lucene Document Ids.
@@ -493,6 +499,126 @@ public class SortedIntDocSet extends DocSetBase {
    return bs;
  }
  /** Binary search over arr[low..high]: returns the index of value if present,
   *  otherwise the insertion point (the first index whose element is >= value,
   *  or high+1 if none). */
  public static int findIndex(int[] arr, int value, int low, int high) {
    // binary search
    while (low <= high) {
      int mid = (low+high) >>> 1;
      int found = arr[mid];
      if (found < value) {
        low = mid+1;
      }
      else if (found > value) {
        high = mid-1;
      }
      else {
        return mid;
      }
    }
    return low;
  }
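  // Behavior sketch (values invented); exact hits return the match index,
  // misses return the index of the first element >= value:
  //   int[] arr = {2, 5, 9, 12};
  //   findIndex(arr, 9, 0, arr.length - 1);   // -> 2 (exact match)
  //   findIndex(arr, 6, 0, arr.length - 1);   // -> 2 (first element >= 6 is 9)
  //   findIndex(arr, 50, 0, arr.length - 1);  // -> 4 (none >= 50: arr.length)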
  @Override
  public Filter getTopFilter() {
    return new Filter() {
      int lastEndIdx = 0;

      @Override
      public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        int offset = 0;
        SolrIndexReader r = (SolrIndexReader)reader;
        while (r.getParent() != null) {
          offset += r.getBase();
          r = r.getParent();
        }

        final int base = offset;
        final int maxDoc = reader.maxDoc();
        final int max = base + maxDoc;   // one past the max doc in this segment.
        int sidx = Math.max(0, lastEndIdx);

        if (sidx > 0 && docs[sidx-1] >= base) {
          // lastEndIdx isn't usable: we must have been called from multiple
          // threads, or the sub-readers are being visited out of order.
          // Fall back to starting at 0.
          sidx = 0;
        }

        if (sidx < docs.length && docs[sidx] < base) {
          // docs[sidx] is below this segment's range; seek to the real start.
          sidx = findIndex(docs, base, sidx, docs.length-1);
        }

        final int startIdx = sidx;

        // The largest possible end index is bounded by the start index plus
        // the number of docs the segment can hold. Subtract 1 since the
        // end index is inclusive.
        int eidx = Math.min(docs.length, startIdx + maxDoc) - 1;

        // find the real end
        eidx = findIndex(docs, max, startIdx, eidx) - 1;
        final int endIdx = eidx;
        lastEndIdx = endIdx;
        return new DocIdSet() {
          public DocIdSetIterator iterator() throws IOException {
            return new DocIdSetIterator() {
              int idx = startIdx;
              int doc;

              public int doc() {
                return doc - base;
              }

              public boolean next() throws IOException {
                if (idx > endIdx) return false;
                doc = docs[idx++];
                return true;
              }

              public boolean skipTo(int target) throws IOException {
                if (idx > endIdx) return false;
                target += base;

                // probe next
                doc = docs[idx++];
                if (doc >= target) return true;

                int high = endIdx;

                // TODO: probe more before resorting to binary search?

                // binary search
                while (idx <= high) {
                  int mid = (idx+high) >>> 1;
                  doc = docs[mid];
                  if (doc < target) {
                    idx = mid+1;
                  }
                  else if (doc > target) {
                    high = mid-1;
                  }
                  else {
                    idx = mid+1;
                    return true;
                  }
                }

                // idx is now at the insertion point: the first element >= target,
                // if one remains in this segment's window.
                if (idx <= endIdx) {
                  doc = docs[idx++];
                  return true;
                } else {
                  return false;
                }
              }
            };
          }
        };
      }
    };
  }
}
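To make the windowing concrete, here is a standalone trace (doc ids and segment sizes invented) of how each segment's [base, max) range selects a slice of the sorted global id array; it omits the lastEndIdx caching above:

    // Per-segment windowing over a sorted global id array (values invented).
    int[] docs = {1, 6, 8, 13};        // sorted top-level doc ids
    int[][] segs = {{0, 5}, {5, 9}};   // {base, maxDoc} for two segments
    for (int[] seg : segs) {
      int base = seg[0], max = base + seg[1];
      int startIdx = SortedIntDocSet.findIndex(docs, base, 0, docs.length - 1);
      int endIdx = SortedIntDocSet.findIndex(docs, max, startIdx, docs.length - 1) - 1;
      // segment 0 -> window [0,0] = {1}; segment 1 -> window [1,3] = {6, 8, 13},
      // reported locally as 1, 3 and 8 after subtracting base.
      System.out.println("window [" + startIdx + "," + endIdx + "]");
    }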


@@ -21,9 +21,16 @@ import junit.framework.TestCase;
import java.util.Random;
import java.util.Arrays;
import java.io.IOException;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetIterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/**
 * @version $Id$
@@ -300,5 +307,118 @@ public class TestDocSet extends TestCase {
  }
  ***/
  public IndexReader dummyIndexReader(final int maxDoc) {
    IndexReader r = new FilterIndexReader(null) {
      @Override
      public int maxDoc() {
        return maxDoc;
      }

      @Override
      public boolean hasDeletions() {
        return false;
      }

      @Override
      public IndexReader[] getSequentialSubReaders() {
        return null;  // null marks this as a leaf (single-segment) reader
      }
    };
    return r;
  }
  public IndexReader dummyMultiReader(int nSeg, int maxDoc) {
    if (nSeg==1 && rand.nextBoolean()) return dummyIndexReader(rand.nextInt(maxDoc));

    IndexReader[] subs = new IndexReader[rand.nextInt(nSeg)+1];
    for (int i=0; i<subs.length; i++) {
      subs[i] = dummyIndexReader(rand.nextInt(maxDoc));
    }

    MultiReader mr = new MultiReader(subs);
    return mr;
  }
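  // Sketch of what the helpers above produce (a hypothetical check, not part
  // of this patch): wrapping the MultiReader in a SolrIndexReader gives the
  // filter code the parent/base structure it walks.
  //
  //   IndexReader r = dummyMultiReader(2, 5);           // 1-2 stub leaves
  //   SolrIndexReader sir = new SolrIndexReader(r, null, 0);
  //   for (SolrIndexReader leaf : sir.getLeafReaders()) {
  //     System.out.println("base=" + leaf.getBase() + " maxDoc=" + leaf.maxDoc());
  //   }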
  public void doTestIteratorEqual(DocIdSet a, DocIdSet b) throws IOException {
    DocIdSetIterator ia = a.iterator();
    DocIdSetIterator ib = b.iterator();

    // test for next() equivalence
    for(;;) {
      boolean nexta = ia.next();
      boolean nextb = ib.next();
      assertEquals(nexta, nextb);
      if (!nexta) break;
      assertEquals(ia.doc(), ib.doc());
    }

    for (int i=0; i<10; i++) {
      // test random skipTo() and next()
      ia = a.iterator();
      ib = b.iterator();
      int doc = -1;
      for (;;) {
        boolean nexta, nextb;
        if (rand.nextBoolean()) {
          nexta = ia.next();
          nextb = ib.next();
        } else {
          // Small skip distances exercise edge cases such as the one-doc
          // probe in skipTo(); increase if future changes probe farther.
          int target = doc + rand.nextInt(10) + 1;
          nexta = ia.skipTo(target);
          nextb = ib.skipTo(target);
        }
        assertEquals(nexta, nextb);
        if (!nexta) break;
        doc = ia.doc();
        assertEquals(doc, ib.doc());
      }
    }
  }
  public void doFilterTest(SolrIndexReader reader) throws IOException {
    OpenBitSet bs = getRandomSet(reader.maxDoc(), rand.nextInt(reader.maxDoc()+1));
    DocSet a = new BitDocSet(bs);
    DocSet b = getIntDocSet(bs);

    Filter fa = a.getTopFilter();
    Filter fb = b.getTopFilter();

    // test top-level
    DocIdSet da = fa.getDocIdSet(reader);
    DocIdSet db = fb.getDocIdSet(reader);
    doTestIteratorEqual(da, db);

    // first test in-sequence sub readers
    for (SolrIndexReader sir : reader.getLeafReaders()) {
      da = fa.getDocIdSet(sir);
      db = fb.getDocIdSet(sir);
      doTestIteratorEqual(da, db);
    }

    int nReaders = reader.getLeafReaders().length;
    // now test out-of-sequence sub readers
    for (int i=0; i<nReaders; i++) {
      SolrIndexReader sir = reader.getLeafReaders()[rand.nextInt(nReaders)];
      da = fa.getDocIdSet(sir);
      db = fb.getDocIdSet(sir);
      doTestIteratorEqual(da, db);
    }
  }
  public void testFilter() throws IOException {
    // keeping these numbers small helps hit more edge cases
    int maxSeg=4;
    int maxDoc=5;  // increase if future changes add more edge cases
                   // (like probing a certain distance in the binary search)
    for (int i=0; i<5000; i++) {
      IndexReader r = dummyMultiReader(maxSeg, maxDoc);
      SolrIndexReader sir = new SolrIndexReader(r, null, 0);
      doFilterTest(sir);
    }
  }
}