mirror of https://github.com/apache/lucene.git
SOLR-1179: add DocSet.getTopFilter()
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@777269 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
87e52df974
commit
3c16e08dbd
|
@ -19,6 +19,12 @@ package org.apache.solr.search;
|
|||
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* <code>DocSet</code> represents an unordered set of Lucene Document Ids.
|
||||
|
@ -135,6 +141,13 @@ public interface DocSet /* extends Collection<Integer> */ {
|
|||
* Returns the number of documents in this set that are not in the other set.
|
||||
*/
|
||||
public int andNotSize(DocSet other);
|
||||
|
||||
/**
|
||||
* Returns a Filter for use in Lucene search methods, assuming this DocSet
|
||||
* was generated from the top-level MultiReader that the Lucene search
|
||||
* methods will be invoked with.
|
||||
*/
|
||||
public Filter getTopFilter();
|
||||
}
|
||||
|
||||
/** A base class that may be usefull for implementing DocSets */
|
||||
|
@ -230,6 +243,49 @@ abstract class DocSetBase implements DocSet {
|
|||
public int andNotSize(DocSet other) {
|
||||
return this.size() - this.intersectionSize(other);
|
||||
}
|
||||
|
||||
public Filter getTopFilter() {
|
||||
final OpenBitSet bs = getBits();
|
||||
|
||||
return new Filter() {
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
int offset = 0;
|
||||
SolrIndexReader r = (SolrIndexReader)reader;
|
||||
while (r.getParent() != null) {
|
||||
offset += r.getBase();
|
||||
r = r.getParent();
|
||||
}
|
||||
|
||||
if (r==reader) return bs;
|
||||
|
||||
final int base = offset;
|
||||
final int maxDoc = reader.maxDoc();
|
||||
final int max = base + maxDoc; // one past the max doc in this segment.
|
||||
|
||||
return new DocIdSet() {
|
||||
public DocIdSetIterator iterator() throws IOException {
|
||||
return new DocIdSetIterator() {
|
||||
int pos=base-1;
|
||||
public int doc() {
|
||||
return pos-base;
|
||||
}
|
||||
|
||||
public boolean next() throws IOException {
|
||||
pos = bs.nextSetBit(pos+1);
|
||||
return pos>=0 && pos<max;
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
pos = bs.nextSetBit(target+base);
|
||||
return pos>=0 && pos<max;
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -18,6 +18,12 @@
|
|||
package org.apache.solr.search;
|
||||
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* <code>SortedIntDocSet</code> represents a sorted set of Lucene Document Ids.
|
||||
|
@ -493,6 +499,126 @@ public class SortedIntDocSet extends DocSetBase {
|
|||
return bs;
|
||||
}
|
||||
|
||||
|
||||
public static int findIndex(int[] arr, int value, int low, int high) {
|
||||
// binary search
|
||||
while (low <= high) {
|
||||
int mid = (low+high) >>> 1;
|
||||
int found = arr[mid];
|
||||
|
||||
if (found < value) {
|
||||
low = mid+1;
|
||||
}
|
||||
else if (found > value) {
|
||||
high = mid-1;
|
||||
}
|
||||
else {
|
||||
return mid;
|
||||
}
|
||||
}
|
||||
return low;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Filter getTopFilter() {
|
||||
return new Filter() {
|
||||
int lastEndIdx = 0;
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
|
||||
int offset = 0;
|
||||
SolrIndexReader r = (SolrIndexReader)reader;
|
||||
while (r.getParent() != null) {
|
||||
offset += r.getBase();
|
||||
r = r.getParent();
|
||||
}
|
||||
final int base = offset;
|
||||
final int maxDoc = reader.maxDoc();
|
||||
final int max = base + maxDoc; // one past the max doc in this segment.
|
||||
int sidx = Math.max(0,lastEndIdx);
|
||||
|
||||
if (sidx > 0 && docs[sidx-1] >= base) {
|
||||
// oops, the lastEndIdx isn't correct... we must have been used
|
||||
// in a multi-threaded context, or the indexreaders are being
|
||||
// used out-of-order. start at 0.
|
||||
sidx = 0;
|
||||
}
|
||||
if (sidx < docs.length && docs[sidx] < base) {
|
||||
// if docs[sidx] is < base, we need to seek to find the real start.
|
||||
sidx = findIndex(docs, base, sidx, docs.length-1);
|
||||
}
|
||||
|
||||
final int startIdx = sidx;
|
||||
|
||||
// Largest possible end index is limited to the start index
|
||||
// plus the number of docs contained in the segment. Subtract 1 since
|
||||
// the end index is inclusive.
|
||||
int eidx = Math.min(docs.length, startIdx + maxDoc) - 1;
|
||||
|
||||
// find the real end
|
||||
eidx = findIndex(docs, max, startIdx, eidx) - 1;
|
||||
|
||||
final int endIdx = eidx;
|
||||
lastEndIdx = endIdx;
|
||||
|
||||
|
||||
return new DocIdSet() {
|
||||
public DocIdSetIterator iterator() throws IOException {
|
||||
return new DocIdSetIterator() {
|
||||
int idx = startIdx;
|
||||
int doc;
|
||||
public int doc() {
|
||||
return doc - base;
|
||||
}
|
||||
|
||||
public boolean next() throws IOException {
|
||||
if (idx > endIdx) return false;
|
||||
doc = docs[idx++];
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean skipTo(int target) throws IOException {
|
||||
if (idx > endIdx) return false;
|
||||
target += base;
|
||||
|
||||
// probe next
|
||||
doc = docs[idx++];
|
||||
if (doc >= target) return true;
|
||||
|
||||
int high = endIdx;
|
||||
|
||||
// TODO: probe more before resorting to binary search?
|
||||
|
||||
// binary search
|
||||
while (idx <= high) {
|
||||
int mid = (idx+high) >>> 1;
|
||||
doc = docs[mid];
|
||||
|
||||
if (doc < target) {
|
||||
idx = mid+1;
|
||||
}
|
||||
else if (doc > target) {
|
||||
high = mid-1;
|
||||
}
|
||||
else {
|
||||
idx=mid+1;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// low is on the insertion point...
|
||||
if (idx <= endIdx) {
|
||||
doc = docs[idx++];
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -21,9 +21,16 @@ import junit.framework.TestCase;
|
|||
|
||||
import java.util.Random;
|
||||
import java.util.Arrays;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
import org.apache.lucene.util.OpenBitSetIterator;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.FilterIndexReader;
|
||||
import org.apache.lucene.index.MultiReader;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
|
||||
/**
|
||||
* @version $Id$
|
||||
|
@ -300,5 +307,118 @@ public class TestDocSet extends TestCase {
|
|||
}
|
||||
***/
|
||||
|
||||
public IndexReader dummyIndexReader(final int maxDoc) {
|
||||
|
||||
IndexReader r = new FilterIndexReader(null) {
|
||||
@Override
|
||||
public int maxDoc() {
|
||||
return maxDoc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasDeletions() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public IndexReader[] getSequentialSubReaders() {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
return r;
|
||||
}
|
||||
|
||||
public IndexReader dummyMultiReader(int nSeg, int maxDoc) {
|
||||
if (nSeg==1 && rand.nextBoolean()) return dummyIndexReader(rand.nextInt(maxDoc));
|
||||
|
||||
IndexReader[] subs = new IndexReader[rand.nextInt(nSeg)+1];
|
||||
for (int i=0; i<subs.length; i++) {
|
||||
subs[i] = dummyIndexReader(rand.nextInt(maxDoc));
|
||||
}
|
||||
|
||||
MultiReader mr = new MultiReader(subs);
|
||||
return mr;
|
||||
}
|
||||
|
||||
public void doTestIteratorEqual(DocIdSet a, DocIdSet b) throws IOException {
|
||||
DocIdSetIterator ia = a.iterator();
|
||||
DocIdSetIterator ib = b.iterator();
|
||||
|
||||
// test for next() equivalence
|
||||
for(;;) {
|
||||
boolean nexta = ia.next();
|
||||
boolean nextb = ib.next();
|
||||
assertEquals(nexta, nextb);
|
||||
if (!nexta) break;
|
||||
assertEquals(ia.doc(), ib.doc());
|
||||
|
||||
}
|
||||
|
||||
for (int i=0; i<10; i++) {
|
||||
// test random skipTo() and next()
|
||||
ia = a.iterator();
|
||||
ib = b.iterator();
|
||||
int doc = -1;
|
||||
for (;;) {
|
||||
boolean nexta,nextb;
|
||||
if (rand.nextBoolean()) {
|
||||
nexta = ia.next();
|
||||
nextb = ib.next();
|
||||
} else {
|
||||
int target = doc + rand.nextInt(10) + 1; // keep in mind future edge cases like probing (increase if necessary)
|
||||
nexta = ia.skipTo(target);
|
||||
nextb = ib.skipTo(target);
|
||||
}
|
||||
|
||||
assertEquals(nexta, nextb);
|
||||
if (!nexta) break;
|
||||
doc = ia.doc();
|
||||
assertEquals(doc, ib.doc());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void doFilterTest(SolrIndexReader reader) throws IOException {
|
||||
OpenBitSet bs = getRandomSet(reader.maxDoc(), rand.nextInt(reader.maxDoc()+1));
|
||||
DocSet a = new BitDocSet(bs);
|
||||
DocSet b = getIntDocSet(bs);
|
||||
|
||||
Filter fa = a.getTopFilter();
|
||||
Filter fb = b.getTopFilter();
|
||||
|
||||
// test top-level
|
||||
DocIdSet da = fa.getDocIdSet(reader);
|
||||
DocIdSet db = fb.getDocIdSet(reader);
|
||||
doTestIteratorEqual(da, db);
|
||||
|
||||
// first test in-sequence sub readers
|
||||
for (SolrIndexReader sir : reader.getLeafReaders()) {
|
||||
da = fa.getDocIdSet(sir);
|
||||
db = fb.getDocIdSet(sir);
|
||||
doTestIteratorEqual(da, db);
|
||||
}
|
||||
|
||||
int nReaders = reader.getLeafReaders().length;
|
||||
// now test out-of-sequence sub readers
|
||||
for (int i=0; i<nReaders; i++) {
|
||||
SolrIndexReader sir = reader.getLeafReaders()[rand.nextInt(nReaders)];
|
||||
da = fa.getDocIdSet(sir);
|
||||
db = fb.getDocIdSet(sir);
|
||||
doTestIteratorEqual(da, db);
|
||||
}
|
||||
}
|
||||
|
||||
public void testFilter() throws IOException {
|
||||
// keeping these numbers smaller help hit more edge cases
|
||||
int maxSeg=4;
|
||||
int maxDoc=5; // increase if future changes add more edge cases (like probing a certain distance in the bin search)
|
||||
|
||||
for (int i=0; i<5000; i++) {
|
||||
IndexReader r = dummyMultiReader(maxSeg, maxDoc);
|
||||
SolrIndexReader sir = new SolrIndexReader(r, null, 0);
|
||||
doFilterTest(sir);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue