SOLR-1179: add DocSet.getTopFilter()

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@777269 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2009-05-21 21:30:38 +00:00
parent 87e52df974
commit 3c16e08dbd
3 changed files with 304 additions and 2 deletions


@@ -19,6 +19,12 @@ package org.apache.solr.search;
import org.apache.solr.common.SolrException;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
 * <code>DocSet</code> represents an unordered set of Lucene Document Ids.
@@ -135,6 +141,13 @@ public interface DocSet /* extends Collection<Integer> */ {
   * Returns the number of documents in this set that are not in the other set.
   */
  public int andNotSize(DocSet other);
  /**
   * Returns a Filter for use in Lucene search methods, assuming this DocSet
   * was generated from the top-level MultiReader that the Lucene search
   * methods will be invoked with.
   */
  public Filter getTopFilter();
}
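// A minimal usage sketch (not part of this patch; searcher, filterQuery and
// mainQuery are hypothetical, and getDocSet() is assumed to be the Solr-side
// helper that builds a DocSet against the top-level reader):
//
//   DocSet docs = searcher.getDocSet(filterQuery);
//   Filter f = docs.getTopFilter();
//   // Search with the same top-level reader the DocSet was built from; the
//   // Filter rebases doc ids for whichever sub-reader Lucene hands it.
//   TopDocs hits = searcher.search(mainQuery, f, 10);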
/** A base class that may be useful for implementing DocSets */
@@ -230,6 +243,49 @@ abstract class DocSetBase implements DocSet {
  public int andNotSize(DocSet other) {
    return this.size() - this.intersectionSize(other);
  }
  public Filter getTopFilter() {
    final OpenBitSet bs = getBits();

    return new Filter() {
      @Override
      public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        // Sum the doc id bases of all ancestors to find where this
        // sub-reader's documents start in the top-level id space.
        int offset = 0;
        SolrIndexReader r = (SolrIndexReader)reader;
        while (r.getParent() != null) {
          offset += r.getBase();
          r = r.getParent();
        }

        if (r == reader) return bs;  // top-level reader: the bits already match

        final int base = offset;
        final int maxDoc = reader.maxDoc();
        final int max = base + maxDoc;   // one past the max doc in this segment.

        return new DocIdSet() {
          public DocIdSetIterator iterator() throws IOException {
            return new DocIdSetIterator() {
              int pos = base - 1;

              public int doc() {
                return pos - base;
              }

              public boolean next() throws IOException {
                pos = bs.nextSetBit(pos + 1);
                return pos >= 0 && pos < max;
              }

              public boolean skipTo(int target) throws IOException {
                pos = bs.nextSetBit(target + base);
                return pos >= 0 && pos < max;
              }
            };
          }
        };
      }
    };
  }
}
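The offset walk above is the core trick: each sub-reader numbers its documents from zero, so the filter sums the bases of the reader's ancestors and subtracts that total when reporting hits. A self-contained sketch of the arithmetic, with invented segment sizes:

    // Rebasing arithmetic (segment sizes invented for illustration):
    // segments with maxDoc 5 and 7 get bases 0 and 5, so a set bit at
    // global position 9 is reported by segment 1's iterator as doc 9-5=4.
    int[] segMaxDoc = {5, 7};
    int base = 0;
    for (int i = 0; i < segMaxDoc.length; i++) {
      int max = base + segMaxDoc[i];  // one past this segment's last global id
      System.out.println("segment " + i + ": global ids [" + base + "," + max + ")");
      base = max;                     // next segment starts where this one ends
    }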


@@ -18,6 +18,12 @@
package org.apache.solr.search;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
 * <code>SortedIntDocSet</code> represents a sorted set of Lucene Document Ids.
@@ -493,6 +499,126 @@ public class SortedIntDocSet extends DocSetBase {
    return bs;
  }
  /** Binary search over arr[low..high]: returns the index of value if present,
   *  otherwise the insertion point (the first index whose element is >= value,
   *  or high+1 if none). */
  public static int findIndex(int[] arr, int value, int low, int high) {
    // binary search
    while (low <= high) {
      int mid = (low+high) >>> 1;
      int found = arr[mid];
      if (found < value) {
        low = mid+1;
      }
      else if (found > value) {
        high = mid-1;
      }
      else {
        return mid;
      }
    }
    return low;
  }
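  // Behavior sketch (values invented); exact hits return the match index,
  // misses return the index of the first element >= value:
  //   int[] arr = {2, 5, 9, 12};
  //   findIndex(arr, 9, 0, arr.length - 1);   // -> 2 (exact match)
  //   findIndex(arr, 6, 0, arr.length - 1);   // -> 2 (first element >= 6 is 9)
  //   findIndex(arr, 50, 0, arr.length - 1);  // -> 4 (none >= 50: arr.length)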
  @Override
  public Filter getTopFilter() {
    return new Filter() {
      int lastEndIdx = 0;

      @Override
      public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        int offset = 0;
        SolrIndexReader r = (SolrIndexReader)reader;
        while (r.getParent() != null) {
          offset += r.getBase();
          r = r.getParent();
        }

        final int base = offset;
        final int maxDoc = reader.maxDoc();
        final int max = base + maxDoc;   // one past the max doc in this segment.
        int sidx = Math.max(0, lastEndIdx);

        if (sidx > 0 && docs[sidx-1] >= base) {
          // lastEndIdx isn't usable: we must have been called from multiple
          // threads, or the sub-readers are being visited out of order.
          // Fall back to starting at 0.
          sidx = 0;
        }

        if (sidx < docs.length && docs[sidx] < base) {
          // docs[sidx] is below this segment's range; seek to the real start.
          sidx = findIndex(docs, base, sidx, docs.length-1);
        }

        final int startIdx = sidx;

        // The largest possible end index is bounded by the start index plus
        // the number of docs the segment can hold. Subtract 1 since the
        // end index is inclusive.
        int eidx = Math.min(docs.length, startIdx + maxDoc) - 1;

        // find the real end
        eidx = findIndex(docs, max, startIdx, eidx) - 1;
        final int endIdx = eidx;
        lastEndIdx = endIdx;
        return new DocIdSet() {
          public DocIdSetIterator iterator() throws IOException {
            return new DocIdSetIterator() {
              int idx = startIdx;
              int doc;

              public int doc() {
                return doc - base;
              }

              public boolean next() throws IOException {
                if (idx > endIdx) return false;
                doc = docs[idx++];
                return true;
              }

              public boolean skipTo(int target) throws IOException {
                if (idx > endIdx) return false;
                target += base;

                // probe next
                doc = docs[idx++];
                if (doc >= target) return true;

                int high = endIdx;

                // TODO: probe more before resorting to binary search?

                // binary search
                while (idx <= high) {
                  int mid = (idx+high) >>> 1;
                  doc = docs[mid];
                  if (doc < target) {
                    idx = mid+1;
                  }
                  else if (doc > target) {
                    high = mid-1;
                  }
                  else {
                    idx = mid+1;
                    return true;
                  }
                }

                // idx is now at the insertion point: the first element >= target,
                // if one remains in this segment's window.
                if (idx <= endIdx) {
                  doc = docs[idx++];
                  return true;
                } else {
                  return false;
                }
              }
            };
          }
        };
      }
    };
  }
}
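To make the windowing concrete, here is a standalone trace (doc ids and segment sizes invented) of how each segment's [base, max) range selects a slice of the sorted global id array; it omits the lastEndIdx caching above:

    // Per-segment windowing over a sorted global id array (values invented).
    int[] docs = {1, 6, 8, 13};        // sorted top-level doc ids
    int[][] segs = {{0, 5}, {5, 9}};   // {base, maxDoc} for two segments
    for (int[] seg : segs) {
      int base = seg[0], max = base + seg[1];
      int startIdx = SortedIntDocSet.findIndex(docs, base, 0, docs.length - 1);
      int endIdx = SortedIntDocSet.findIndex(docs, max, startIdx, docs.length - 1) - 1;
      // segment 0 -> window [0,0] = {1}; segment 1 -> window [1,3] = {6, 8, 13},
      // reported locally as 1, 3 and 8 after subtracting base.
      System.out.println("window [" + startIdx + "," + endIdx + "]");
    }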


@@ -21,9 +21,16 @@ import junit.framework.TestCase;
import java.util.Random;
import java.util.Arrays;
import java.io.IOException;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetIterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
/**
 * @version $Id$
@@ -300,5 +307,118 @@ public class TestDocSet extends TestCase {
  }
  ***/
  public IndexReader dummyIndexReader(final int maxDoc) {
    IndexReader r = new FilterIndexReader(null) {
      @Override
      public int maxDoc() {
        return maxDoc;
      }

      @Override
      public boolean hasDeletions() {
        return false;
      }

      @Override
      public IndexReader[] getSequentialSubReaders() {
        return null;  // null marks this as a leaf (single-segment) reader
      }
    };
    return r;
  }
  public IndexReader dummyMultiReader(int nSeg, int maxDoc) {
    if (nSeg==1 && rand.nextBoolean()) return dummyIndexReader(rand.nextInt(maxDoc));

    IndexReader[] subs = new IndexReader[rand.nextInt(nSeg)+1];
    for (int i=0; i<subs.length; i++) {
      subs[i] = dummyIndexReader(rand.nextInt(maxDoc));
    }

    MultiReader mr = new MultiReader(subs);
    return mr;
  }
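  // Sketch of what the helpers above produce (a hypothetical check, not part
  // of this patch): wrapping the MultiReader in a SolrIndexReader gives the
  // filter code the parent/base structure it walks.
  //
  //   IndexReader r = dummyMultiReader(2, 5);           // 1-2 stub leaves
  //   SolrIndexReader sir = new SolrIndexReader(r, null, 0);
  //   for (SolrIndexReader leaf : sir.getLeafReaders()) {
  //     System.out.println("base=" + leaf.getBase() + " maxDoc=" + leaf.maxDoc());
  //   }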
  public void doTestIteratorEqual(DocIdSet a, DocIdSet b) throws IOException {
    DocIdSetIterator ia = a.iterator();
    DocIdSetIterator ib = b.iterator();

    // test for next() equivalence
    for(;;) {
      boolean nexta = ia.next();
      boolean nextb = ib.next();
      assertEquals(nexta, nextb);
      if (!nexta) break;
      assertEquals(ia.doc(), ib.doc());
    }

    for (int i=0; i<10; i++) {
      // test random skipTo() and next()
      ia = a.iterator();
      ib = b.iterator();
      int doc = -1;
      for (;;) {
        boolean nexta, nextb;
        if (rand.nextBoolean()) {
          nexta = ia.next();
          nextb = ib.next();
        } else {
          // Small skip distances exercise edge cases such as the one-doc
          // probe in skipTo(); increase if future changes probe farther.
          int target = doc + rand.nextInt(10) + 1;
          nexta = ia.skipTo(target);
          nextb = ib.skipTo(target);
        }
        assertEquals(nexta, nextb);
        if (!nexta) break;
        doc = ia.doc();
        assertEquals(doc, ib.doc());
      }
    }
  }
  public void doFilterTest(SolrIndexReader reader) throws IOException {
    OpenBitSet bs = getRandomSet(reader.maxDoc(), rand.nextInt(reader.maxDoc()+1));
    DocSet a = new BitDocSet(bs);
    DocSet b = getIntDocSet(bs);

    Filter fa = a.getTopFilter();
    Filter fb = b.getTopFilter();

    // test top-level
    DocIdSet da = fa.getDocIdSet(reader);
    DocIdSet db = fb.getDocIdSet(reader);
    doTestIteratorEqual(da, db);

    // first test in-sequence sub readers
    for (SolrIndexReader sir : reader.getLeafReaders()) {
      da = fa.getDocIdSet(sir);
      db = fb.getDocIdSet(sir);
      doTestIteratorEqual(da, db);
    }

    int nReaders = reader.getLeafReaders().length;
    // now test out-of-sequence sub readers
    for (int i=0; i<nReaders; i++) {
      SolrIndexReader sir = reader.getLeafReaders()[rand.nextInt(nReaders)];
      da = fa.getDocIdSet(sir);
      db = fb.getDocIdSet(sir);
      doTestIteratorEqual(da, db);
    }
  }
  public void testFilter() throws IOException {
    // keeping these numbers small helps hit more edge cases
    int maxSeg=4;
    int maxDoc=5;  // increase if future changes add more edge cases
                   // (like probing a certain distance in the binary search)
    for (int i=0; i<5000; i++) {
      IndexReader r = dummyMultiReader(maxSeg, maxDoc);
      SolrIndexReader sir = new SolrIndexReader(r, null, 0);
      doFilterTest(sir);
    }
  }
}