SOLR-1179: add DocSet.getTopFilter()

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@777269 13f79535-47bb-0310-9956-ffa450edef68
2009-05-21 21:30:38 +00:00 · 2009-05-21 21:30:38 +00:00 · 3c16e08dbd
parent 87e52df974
commit 3c16e08dbd
3 changed files with 304 additions and 2 deletions
--- a/src/java/org/apache/solr/search/DocSet.java
+++ b/src/java/org/apache/solr/search/DocSet.java
@ -19,6 +19,12 @@ package org.apache.solr.search;

 import org.apache.solr.common.SolrException;
 import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;

 /**
 * <code>DocSet</code> represents an unordered set of Lucene Document Ids.
@ -135,6 +141,13 @@ public interface DocSet /* extends Collection<Integer> */ {
   * Returns the number of documents in this set that are not in the other set.
   */
  public int andNotSize(DocSet other);
+
+  /**
+   * Returns a Filter for use in Lucene search methods, assuming this DocSet
+   * was generated from the top-level MultiReader that the Lucene search
+   * methods will be invoked with.
+   *
+   * @return a Lucene Filter exposing the documents contained in this set
+   */
+  public Filter getTopFilter();
 }

 /** A base class that may be usefull for implementing DocSets */
@ -230,6 +243,49 @@ abstract class DocSetBase implements DocSet {
  public int andNotSize(DocSet other) {
    return this.size() - this.intersectionSize(other);
  }
+
+  /**
+   * Builds a Filter over the bits returned by <code>getBits()</code>, whose
+   * document ids are relative to the top-level reader.
+   * <p>
+   * When the filter is asked for the DocIdSet of a reader that has a parent,
+   * the parent chain is walked and each reader's base is summed to obtain the
+   * reader's offset.  The returned iterator is then restricted to
+   * [offset, offset + maxDoc) and reports ids relative to that reader.
+   * When the reader has no parent, the bit set itself is returned.
+   * <p>
+   * The reader passed to the filter must be a SolrIndexReader.
+   */
+  public Filter getTopFilter() {
+    final OpenBitSet bs = getBits();
+
+    return new Filter() {
+      @Override
+      public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+        // Sum the bases up the parent chain to locate this reader inside
+        // the top-level id space.
+        int offset = 0;
+        SolrIndexReader r = (SolrIndexReader)reader;
+        while (r.getParent() != null) {
+          offset += r.getBase();
+          r = r.getParent();
+        }
+
+        // No parent: ids need no translation, the bit set can be used directly.
+        if (r==reader) return bs;
+
+        final int base = offset;
+        final int maxDoc = reader.maxDoc();
+        final int max = base + maxDoc;   // one past the max doc in this segment.
+
+        return new DocIdSet() {
+          public DocIdSetIterator iterator() throws IOException {
+            return new DocIdSetIterator() {
+              // Current position in top-level id space.  base-1 means "before
+              // the first doc"; max means "exhausted".
+              int pos=base-1;
+
+              public int doc() {
+                return pos-base;
+              }
+
+              public boolean next() throws IOException {
+                if (pos >= max) return false;
+                return advanceTo(pos+1);
+              }
+
+              public boolean skipTo(int target) throws IOException {
+                if (pos >= max) return false;
+                // Never seek to or before the current position: skipTo must
+                // land on an entry beyond the current one.  Computed as a
+                // long so a huge target cannot overflow into a negative index.
+                long from = Math.max((long)target + base, (long)pos + 1);
+                if (from >= max) {
+                  pos = max;
+                  return false;
+                }
+                return advanceTo((int)from);
+              }
+
+              // Moves to the first set bit at or after 'from' that still lies
+              // inside this segment.  Once the segment is exhausted the state
+              // is latched so later calls cannot wrap around to bit 0.
+              private boolean advanceTo(int from) {
+                int found = bs.nextSetBit(from);
+                if (found < 0 || found >= max) {
+                  pos = max;
+                  return false;
+                }
+                pos = found;
+                return true;
+              }
+            };
+          }
+        };
+      }
+    };
+  }
 }


--- a/src/java/org/apache/solr/search/SortedIntDocSet.java
+++ b/src/java/org/apache/solr/search/SortedIntDocSet.java
@ -18,6 +18,12 @@
 package org.apache.solr.search;

 import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;

 /**
 * <code>SortedIntDocSet</code> represents a sorted set of Lucene Document Ids.
@ -493,6 +499,126 @@ public class SortedIntDocSet extends DocSetBase {
    return bs;
  }

+
+  public static int findIndex(int[] arr, int value, int low, int high) {
+    // binary search
+    while (low <= high) {
+      int mid = (low+high) >>> 1;
+      int found = arr[mid];
+
+      if (found < value) {
+        low = mid+1;
+      }
+      else if (found > value) {
+        high = mid-1;
+      }
+      else {
+        return mid;
+      }
+    }
+    return low;
+  }
+
+  /**
+   * Returns a Filter that serves the sorted <code>docs</code> array one
+   * reader at a time.
+   * <p>
+   * For each reader the filter sums the bases up the SolrIndexReader parent
+   * chain to get the reader's offset, locates the slice of <code>docs</code>
+   * that falls in [offset, offset + maxDoc), and returns an iterator over that
+   * slice which reports ids relative to the reader.
+   * <p>
+   * The filter remembers where the previous slice ended so that readers
+   * visited in increasing order can resume without searching.  The remembered
+   * index is only a hint and is re-validated on every call.
+   */
+  @Override
+  public Filter getTopFilter() {
+    return new Filter() {
+      // Index one past the last entry of the slice computed by the previous
+      // call, i.e. where the next in-order segment is expected to start.
+      int nextStartIdx = 0;
+
+      @Override
+      public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+        int offset = 0;
+        SolrIndexReader r = (SolrIndexReader)reader;
+        while (r.getParent() != null) {
+          offset += r.getBase();
+          r = r.getParent();
+        }
+        final int base = offset;
+        final int maxDoc = reader.maxDoc();
+        final int max = base + maxDoc;   // one past the max doc in this segment.
+        int sidx = nextStartIdx;         // read the hint exactly once
+
+        if (sidx > 0 && docs[sidx-1] >= base) {
+          // oops, the hint isn't correct... we must have been used
+          // in a multi-threaded context, or the indexreaders are being
+          // used out-of-order.  start at 0.
+          sidx = 0;
+        }
+        if (sidx < docs.length && docs[sidx] < base) {
+          // if docs[sidx] is < base, we need to seek to find the real start.
+          sidx = findIndex(docs, base, sidx, docs.length-1);
+        }
+
+        final int startIdx = sidx;
+
+        // Largest possible end index is limited to the start index
+        // plus the number of docs contained in the segment.  Subtract 1 since
+        // the end index is inclusive.
+        int eidx = Math.min(docs.length, startIdx + maxDoc) - 1;
+
+        // find the real end
+        eidx = findIndex(docs, max, startIdx, eidx) - 1;
+
+        final int endIdx = eidx;
+        // Remember the exclusive end.  For the next in-order segment
+        // docs[endIdx+1] is already >= its base, so no seek is needed;
+        // remembering the inclusive end would force a search every time.
+        nextStartIdx = endIdx + 1;
+
+
+        return new DocIdSet() {
+          public DocIdSetIterator iterator() throws IOException {
+            return new DocIdSetIterator() {
+              int idx = startIdx;   // next entry of docs to hand out
+              int doc;              // current doc in top-level id space
+              public int doc() {
+                return doc - base;
+              }
+
+              public boolean next() throws IOException {
+                if (idx > endIdx) return false;
+                doc = docs[idx++];
+                return true;
+              }
+
+              public boolean skipTo(int target) throws IOException {
+                if (idx > endIdx) return false;
+                target += base;
+
+                // probe next
+                doc = docs[idx++];
+                if (doc >= target) return true;
+
+                int high = endIdx;
+
+                // TODO: probe more before resorting to binary search?
+
+                // binary search, using idx as the low bound
+                while (idx <= high) {
+                  int mid = (idx+high) >>> 1;
+                  doc = docs[mid];
+
+                  if (doc < target) {
+                    idx = mid+1;
+                  }
+                  else if (doc > target) {
+                    high = mid-1;
+                  }
+                  else {
+                    idx=mid+1;
+                    return true;
+                  }
+                }
+
+                // idx is on the insertion point...
+                if (idx <= endIdx) {
+                  doc = docs[idx++];
+                  return true;
+                } else {
+                  return false;
+                }
+              }
+            };
+          }
+        };
+      }
+    };
+  }
+
 }
-
-
--- a/src/test/org/apache/solr/search/TestDocSet.java
+++ b/src/test/org/apache/solr/search/TestDocSet.java
@ -21,9 +21,16 @@ import junit.framework.TestCase;

 import java.util.Random;
 import java.util.Arrays;
+import java.io.IOException;

 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.util.OpenBitSetIterator;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.FilterIndexReader;
+import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;

 /**
 * @version $Id$
@ -300,5 +307,118 @@ public class TestDocSet extends TestCase {
  }
  ***/

+  /**
+   * Creates a stub reader that wraps nothing: it reports the given maxDoc,
+   * no deletions, and no sequential sub-readers.
+   */
+  public IndexReader dummyIndexReader(final int maxDoc) {
+    return new FilterIndexReader(null) {
+      @Override
+      public IndexReader[] getSequentialSubReaders() {
+        return null;
+      }
+
+      @Override
+      public boolean hasDeletions() {
+        return false;
+      }
+
+      @Override
+      public int maxDoc() {
+        return maxDoc;
+      }
+    };
+  }
+
+  /**
+   * Creates a MultiReader over a random number (1..nSeg) of stub segments,
+   * each with a random maxDoc below the given bound.  When nSeg is 1 a bare
+   * stub reader is returned instead about half of the time.
+   */
+  public IndexReader dummyMultiReader(int nSeg, int maxDoc) {
+    final boolean bareSegment = nSeg==1 && rand.nextBoolean();
+    if (bareSegment) {
+      return dummyIndexReader(rand.nextInt(maxDoc));
+    }
+
+    final int segmentCount = rand.nextInt(nSeg)+1;
+    final IndexReader[] segments = new IndexReader[segmentCount];
+    int filled = 0;
+    while (filled < segmentCount) {
+      segments[filled++] = dummyIndexReader(rand.nextInt(maxDoc));
+    }
+
+    return new MultiReader(segments);
+  }
+
+  /**
+   * Asserts that two DocIdSets iterate identically: first with next() only,
+   * then over ten passes that randomly mix next() with short forward skipTo().
+   */
+  public void doTestIteratorEqual(DocIdSet a, DocIdSet b) throws IOException {
+    DocIdSetIterator itA = a.iterator();
+    DocIdSetIterator itB = b.iterator();
+
+    // pure next() traversal must agree step for step
+    boolean more;
+    do {
+      more = itA.next();
+      assertEquals(more, itB.next());
+      if (more) {
+        assertEquals(itA.doc(), itB.doc());
+      }
+    } while (more);
+
+    for (int pass=0; pass<10; pass++) {
+      // fresh iterators, then a random mix of skipTo() and next()
+      itA = a.iterator();
+      itB = b.iterator();
+      int current = -1;
+      while (true) {
+        final boolean moreA;
+        final boolean moreB;
+        if (!rand.nextBoolean()) {
+          // keep in mind future edge cases like probing (increase if necessary)
+          final int target = current + rand.nextInt(10) + 1;
+          moreA = itA.skipTo(target);
+          moreB = itB.skipTo(target);
+        } else {
+          moreA = itA.next();
+          moreB = itB.next();
+        }
+
+        assertEquals(moreA, moreB);
+        if (!moreA) {
+          break;
+        }
+        current = itA.doc();
+        assertEquals(current, itB.doc());
+      }
+    }
+  }
+
+  /**
+   * Builds a BitDocSet and an int-based DocSet from the same random bits and
+   * checks that their top filters agree on the top-level reader, on every
+   * leaf reader in order, and on randomly picked leaf readers.
+   */
+  public void doFilterTest(SolrIndexReader reader) throws IOException {
+    final OpenBitSet bits = getRandomSet(reader.maxDoc(), rand.nextInt(reader.maxDoc()+1));
+    final DocSet bitSet = new BitDocSet(bits);
+    final DocSet intSet = getIntDocSet(bits);
+
+    final Filter bitFilter = bitSet.getTopFilter();
+    final Filter intFilter = intSet.getTopFilter();
+
+    // top-level reader
+    compareFiltersOn(bitFilter, intFilter, reader);
+
+    // leaf readers, in sequence
+    for (SolrIndexReader leaf : reader.getLeafReaders()) {
+      compareFiltersOn(bitFilter, intFilter, leaf);
+    }
+
+    // leaf readers, out of sequence
+    final int leafCount = reader.getLeafReaders().length;
+    int picks = 0;
+    while (picks < leafCount) {
+      final SolrIndexReader leaf = reader.getLeafReaders()[rand.nextInt(leafCount)];
+      compareFiltersOn(bitFilter, intFilter, leaf);
+      picks++;
+    }
+  }
+
+  /** Asserts that both filters produce identical iteration for one reader. */
+  private void compareFiltersOn(Filter first, Filter second, SolrIndexReader target) throws IOException {
+    doTestIteratorEqual(first.getDocIdSet(target), second.getDocIdSet(target));
+  }
+
+  /**
+   * Runs the filter comparison against 5000 randomly shaped readers.
+   */
+  public void testFilter() throws IOException {
+    // keeping these numbers smaller help hit more edge cases
+    final int maxSeg = 4;
+    // increase if future changes add more edge cases (like probing a certain distance in the bin search)
+    final int maxDoc = 5;
+
+    int remaining = 5000;
+    while (remaining-- > 0) {
+      final IndexReader multi = dummyMultiReader(maxSeg, maxDoc);
+      doFilterTest(new SolrIndexReader(multi, null, 0));
+    }
+  }
+

 }