mirror of https://github.com/apache/lucene.git

commit 06f159e3fb (parent d161e54136)

SOLR-1169: SortedIntDocSet

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@776750 13f79535-47bb-0310-9956-ffa450edef68
@@ -254,6 +254,11 @@ Optimizations

10. SOLR-1166: Speed up docset/filter generation by avoiding top-level
    score() call and iterating over leaf readers with TermDocs. (yonik)

11. SOLR-1169: SortedIntDocSet - a new small set implementation
    that saves memory over HashDocSet, is faster to construct,
    is ordered for easier implementation of skipTo, and is faster
    in the general case. (yonik)

Bug Fixes
----------------------

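As a rough illustration of the changelog entry above, the sketch below builds two of the new sorted-int sets and intersects them. It is not part of the commit; it assumes the org.apache.solr.search classes added and changed here are on the classpath, and the doc ids are made-up values.

import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SortedIntDocSet;

public class SortedIntDocSetExample {
  public static void main(String[] args) {
    // ids must already be sorted; the collectors changed in this commit deliver them in order.
    SortedIntDocSet s1 = new SortedIntDocSet(new int[] {1, 5, 9, 42, 100});
    SortedIntDocSet s2 = new SortedIntDocSet(new int[] {5, 9, 77, 100, 101, 200});

    System.out.println("memSize=" + s1.memSize());            // roughly 4 bytes per id plus a small constant
    System.out.println("common=" + s1.intersectionSize(s2));  // 3 -> {5, 9, 100}

    DocSet both = s1.intersection(s2);                         // also a SortedIntDocSet
    DocIterator it = both.iterator();
    while (it.hasNext()) {
      System.out.println("doc=" + it.nextDoc());               // iterates in increasing doc id order
    }
  }
}
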
@@ -306,12 +306,6 @@

         queryResultCache. -->
    <queryResultMaxDocsCached>200</queryResultMaxDocsCached>

    <!-- This entry enables an int hash representation for filters (DocSets)
         when the number of items in the set is less than maxSize.  For smaller
         sets, this representation is more memory efficient, more efficient to
         iterate over, and faster to take intersections. -->
    <HashDocSet maxSize="3000" loadFactor="0.75"/>

    <!-- a newSearcher event is fired whenever a new searcher is being prepared
         and there is a current searcher handling requests (aka registered). -->
    <!-- QuerySenderListener takes an array of NamedList and executes a

@@ -189,10 +189,9 @@ abstract class DocSetBase implements DocSet {
    };

  public DocSet intersection(DocSet other) {
    // intersection is overloaded in HashDocSet to be more
    // efficient, so if "other" is a HashDocSet, dispatch off
    // of it instead.
    if (other instanceof HashDocSet) {
    // intersection is overloaded in the smaller DocSets to be more
    // efficient, so dispatch off of it instead.
    if (!(other instanceof BitDocSet)) {
      return other.intersection(this);
    }

@@ -209,10 +208,9 @@ abstract class DocSetBase implements DocSet {
  }

  public int intersectionSize(DocSet other) {
    // intersectionSize is overloaded in HashDocSet to be more
    // efficient, so if "other" is a HashDocSet, dispatch off
    // of it instead.
    if (other instanceof HashDocSet) {
    // intersection is overloaded in the smaller DocSets to be more
    // efficient, so dispatch off of it instead.
    if (!(other instanceof BitDocSet)) {
      return other.intersectionSize(this);
    }
    // less efficient way: do the intersection then get its size

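The two hunks above change the generic DocSetBase fallback so that any operand other than a BitDocSet gets to run the operation; combined with the SortedIntDocSet added later in this commit, the small sorted set drives the intersection even when the other side is a bit-vector set. A minimal sketch of the effect, assuming BitDocSet still inherits this base implementation (the ids are arbitrary example values):

import org.apache.lucene.util.OpenBitSet;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SortedIntDocSet;

public class DispatchExample {
  public static void main(String[] args) {
    OpenBitSet bits = new OpenBitSet(1000);
    for (int i = 0; i < 1000; i += 3) bits.fastSet(i);        // a fairly dense set: 0, 3, 6, ...
    DocSet big = new BitDocSet(bits);
    DocSet small = new SortedIntDocSet(new int[] {3, 4, 9});  // a tiny sorted set

    // With the change above, the base-class intersection hands the work to the non-BitDocSet
    // operand, so the small set walks its three ids and probes the bit vector instead of the
    // dense set scanning every bit.
    System.out.println(big.intersection(small).size());       // 2 -> {3, 9}
  }
}
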
@@ -30,31 +30,26 @@ import java.io.IOException;
 */

final class DocSetHitCollector extends HitCollector {

  final float HASHSET_INVERSE_LOAD_FACTOR;
  final int HASHDOCSET_MAXSIZE;

  int pos=0;
  OpenBitSet bits;
  final int maxDoc;
  final int smallSetSize;

  // in case there aren't that many hits, we may not want a very sparse
  // bit array.  Optimistically collect the first few docs in an array
  // in case there are only a few.
  final int[] scratch;

  // todo - could pass in bitset and an operation also...
  DocSetHitCollector(float inverseLoadFactor, int maxSize, int maxDoc) {
  DocSetHitCollector(int smallSetSize, int maxDoc) {
    this.smallSetSize = smallSetSize;
    this.maxDoc = maxDoc;
    HASHSET_INVERSE_LOAD_FACTOR = inverseLoadFactor;
    HASHDOCSET_MAXSIZE = maxSize;
    scratch = new int[HASHDOCSET_MAXSIZE];
    this.scratch = new int[smallSetSize];
  }

  public void collect(int doc, float score) {
    // optimistically collect the first docs in an array
    // in case the total number will be small enough to represent
    // as a HashDocSet() instead...
    // as a small set like SortedIntDocSet instead...
    // Storing in this array will be quicker to convert
    // than scanning through a potentially huge bit vector.
    // FUTURE: when search methods all start returning docs in order, maybe

@@ -73,7 +68,8 @@ final class DocSetHitCollector extends HitCollector {

  public DocSet getDocSet() {
    if (pos<=scratch.length) {
      return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
      // assumes docs were collected in sorted order!
      return new SortedIntDocSet(scratch, pos);
    } else {
      // set the bits for ids that were collected in the array
      for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);

@@ -84,33 +80,27 @@ final class DocSetHitCollector extends HitCollector {


class DocSetCollector extends Collector {

  final float HASHSET_INVERSE_LOAD_FACTOR;
  final int HASHDOCSET_MAXSIZE;

  int pos=0;
  OpenBitSet bits;
  final int maxDoc;
  int base=0;
  final int smallSetSize;
  int base;

  // in case there aren't that many hits, we may not want a very sparse
  // bit array.  Optimistically collect the first few docs in an array
  // in case there are only a few.
  final int[] scratch;

  // todo - could pass in bitset and an operation also...
  DocSetCollector(float inverseLoadFactor, int maxSize, int maxDoc) {
  DocSetCollector(int smallSetSize, int maxDoc) {
    this.smallSetSize = smallSetSize;
    this.maxDoc = maxDoc;
    HASHSET_INVERSE_LOAD_FACTOR = inverseLoadFactor;
    HASHDOCSET_MAXSIZE = maxSize;
    scratch = new int[HASHDOCSET_MAXSIZE];
    this.scratch = new int[smallSetSize];
  }

  public void collect(int doc) {
    doc += base;
    // optimistically collect the first docs in an array
    // in case the total number will be small enough to represent
    // as a HashDocSet() instead...
    // as a small set like SortedIntDocSet instead...
    // Storing in this array will be quicker to convert
    // than scanning through a potentially huge bit vector.
    // FUTURE: when search methods all start returning docs in order, maybe

@@ -129,7 +119,8 @@ class DocSetCollector extends Collector {

  public DocSet getDocSet() {
    if (pos<=scratch.length) {
      return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
      // assumes docs were collected in sorted order!
      return new SortedIntDocSet(scratch, pos);
    } else {
      // set the bits for ids that were collected in the array
      for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);

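Both getDocSet() hunks above implement the same strategy as the collect() methods earlier in this file: gather the first smallSetSize doc ids in a plain int[] and only fall back to a bit vector once that overflows. A self-contained sketch of the idea, independent of the Lucene/Solr classes (names and sizes here are illustrative only):

import java.util.Arrays;

public class SmallThenBitsCollector {
  private final int[] scratch;   // optimistic small-set buffer
  private long[] bits;           // lazily allocated bit vector
  private final int maxDoc;
  private int pos;

  SmallThenBitsCollector(int smallSetSize, int maxDoc) {
    this.scratch = new int[smallSetSize];
    this.maxDoc = maxDoc;
  }

  void collect(int doc) {        // docs are assumed to arrive in increasing order, as in the Solr collectors
    if (pos < scratch.length) {
      scratch[pos++] = doc;
      return;
    }
    if (bits == null) {          // first overflow: allocate the bit vector and replay the buffer
      bits = new long[(maxDoc >> 6) + 1];
      for (int i = 0; i < scratch.length; i++) setBit(scratch[i]);
    }
    setBit(doc);
    pos++;
  }

  private void setBit(int doc) { bits[doc >> 6] |= 1L << (doc & 63); }

  /** Either the small sorted array, or null meaning "use the bit vector". */
  int[] smallSetOrNull() {
    return pos <= scratch.length ? Arrays.copyOf(scratch, pos) : null;
  }
}
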
@@ -82,9 +82,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {

  private final LuceneQueryOptimizer optimizer;

  private final float HASHSET_INVERSE_LOAD_FACTOR;
  private final int HASHDOCSET_MAXSIZE;

  // map of generic caches - not synchronized since it's read-only after the constructor.
  private final HashMap<String, SolrCache> cacheMap;
  private static final HashMap<String, SolrCache> noGenericCaches=new HashMap<String,SolrCache>(0);

@@ -186,10 +183,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
    }
    optimizer = solrConfig.filtOptEnabled ? new LuceneQueryOptimizer(solrConfig.filtOptCacheSize,solrConfig.filtOptThreshold) : null;

    // for DocSets
    HASHSET_INVERSE_LOAD_FACTOR = solrConfig.hashSetInverseLoadFactor;
    HASHDOCSET_MAXSIZE = solrConfig.hashDocSetMaxSize;

    fieldNames = r.getFieldNames(IndexReader.FieldOption.ALL);
  }

@@ -628,7 +621,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
  // query must be positive
  protected DocSet getDocSetNC(Query query, DocSet filter) throws IOException {
    if (filter==null) {
      DocSetCollector hc = new DocSetCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
      DocSetCollector hc = new DocSetCollector(maxDoc()>>6, maxDoc());
      if (query instanceof TermQuery) {
        Term t = ((TermQuery)query).getTerm();
        SolrIndexReader[] readers = reader.getLeafReaders();

@@ -656,7 +649,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {

    } else {
      // FUTURE: if the filter is sorted by docid, could use skipTo (SkipQueryFilter)
      final DocSetCollector hc = new DocSetCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
      final DocSetCollector hc = new DocSetCollector(maxDoc()>>6, maxDoc());
      final DocSet filt = filter;
      super.search(query, null, new Collector() {
        int base = 0;

@@ -1131,7 +1124,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
    float maxScore;
    int[] ids;
    float[] scores;
    final DocSetHitCollector setHC = new DocSetHitCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
    final DocSetHitCollector setHC = new DocSetHitCollector(maxDoc()>>6, maxDoc());
    final HitCollector collector = ( cmd.getTimeAllowed() > 0 ) ? new TimeLimitedCollector( setHC, cmd.getTimeAllowed() ) : setHC;

    Query query = QueryUtils.makeQueryable(cmd.getQuery());

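The three call-site changes above replace the configurable HashDocSet parameters with a hard-coded small-set size of maxDoc()>>6. The arithmetic behind that constant (also noted in the test changes below: break-even is around maxDoc/32, but maxDoc/64 performs better) follows from memSize() in the new class: a SortedIntDocSet costs about four bytes per collected id, while a bit vector over the whole index costs about maxDoc/8 bytes no matter how few docs match. A tiny sanity check, with an assumed index size:

public class SmallSetCutoff {
  public static void main(String[] args) {
    int maxDoc = 1000000;                            // assumed index size, for illustration only
    int cutoff = maxDoc >> 6;                        // 15625 docs, the smallSetSize passed above
    long sortedIntBytes = ((long) cutoff << 2) + 8;  // ~61 KB at the cutoff (the memSize() formula)
    long bitVectorBytes = maxDoc / 8;                // ~122 KB regardless of how many docs match
    System.out.println(cutoff + " " + sortedIntBytes + " " + bitVectorBytes);
  }
}
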
@@ -0,0 +1,498 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import org.apache.lucene.util.OpenBitSet;

/**
 * <code>SortedIntDocSet</code> represents a sorted set of Lucene Document Ids.
 */
public class SortedIntDocSet extends DocSetBase {
  protected final int[] docs;

  public SortedIntDocSet(int[] docs) {
    this.docs = docs;
  }

  public SortedIntDocSet(int[] docs, int len) {
    this(shrink(docs,len));
  }

  public int[] getDocs() { return docs; }

  public int size() { return docs.length; }

  public long memSize() {
    return (docs.length<<2)+8;
  }

  public static int[] zeroInts = new int[0];
  public static SortedIntDocSet zero = new SortedIntDocSet(zeroInts);

  public static int[] shrink(int[] arr, int newSize) {
    if (arr.length == newSize) return arr;
    int[] newArr = new int[newSize];
    System.arraycopy(arr, 0, newArr, 0, newSize);
    return newArr;
  }

  public static int intersectionSize(int[] smallerSortedList, int[] biggerSortedList) {
    final int a[] = smallerSortedList;
    final int b[] = biggerSortedList;

    // The next doc we are looking for will be much closer to the last position we tried
    // than it will be to the midpoint between last and high... so probe ahead using
    // a function of the ratio of the sizes of the sets.
    int step = (b.length/a.length)+1;

    // Since the majority of probes should be misses, we'll already be above the last probe
    // and shouldn't need to move larger than the step size on average to step over our target (and thus lower
    // the high upper bound a lot.)... but if we don't go over our target, it's a big miss... so double it.
    step = step + step;

    // FUTURE: come up with a density such that target * density == likely position?
    // then check step on one side or the other?
    // (density could be cached in the DocSet)... length/maxDoc

    // FUTURE: try partitioning like a sort algorithm.  Pick the midpoint of the big
    // array, find where that should be in the small array, and then recurse with
    // the top and bottom half of both arrays until they are small enough to use
    // a fallback intersection method.
    // NOTE: I tried this and it worked, but it was actually slower than this current
    // highly optimized approach.

    int icount = 0;
    int low = 0;
    int max = b.length-1;

    for (int i=0; i<a.length; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success!  we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }

      // binary search the rest of the way
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          icount++;
          low = mid+1;  // found it, so start at next element
          break;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
    }

    return icount;
  }

  public int intersectionSize(DocSet other) {
    if (!(other instanceof SortedIntDocSet)) {
      // assume other implementations are better at random access than we are,
      // true of BitDocSet and HashDocSet.
      int icount = 0;
      for (int i=0; i<docs.length; i++) {
        if (other.exists(docs[i])) icount++;
      }
      return icount;
    }

    // make "a" the smaller set.
    int[] otherDocs = ((SortedIntDocSet)other).docs;
    final int[] a = docs.length < otherDocs.length ? docs : otherDocs;
    final int[] b = docs.length < otherDocs.length ? otherDocs : docs;

    if (a.length==0) return 0;

    // if b is 8 times bigger than a, use the modified binary search.
    if ((b.length>>3) >= a.length) {
      return intersectionSize(a,b);
    }

    // if they are close in size, just do a linear walk of both.
    int icount=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      // switch on the sign bit somehow?  Hopefully the JVM is smart enough to just test once.

      // Since set a is less dense than set b, doca is likely to be greater than docb so
      // check that case first.  This resulted in a 13% speedup.
      if (doca > docb) {
        if (++j >= b.length) break;
        docb=b[j];
      } else if (doca < docb) {
        if (++i >= a.length) break;
        doca=a[i];
      } else {
        icount++;
        if (++i >= a.length) break;
        doca=a[i];
        if (++j >= b.length) break;
        docb=b[j];
      }
    }
    return icount;
  }


  /** puts the intersection of a and b into the target array and returns the size */
  public static int intersection(int a[], int lena, int b[], int lenb, int[] target) {
    if (lena > lenb) {
      int ti=lena; lena=lenb; lenb=ti;
      int[] ta=a; a=b; b=ta;
    }

    if (lena==0) return 0;


    // if b is 8 times bigger than a, use the modified binary search.
    if ((lenb>>3) >= lena) {
      return intersectionBinarySearch(a, lena, b, lenb, target);
    }

    int icount=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      if (doca > docb) {
        if (++j >= lenb) break;
        docb=b[j];
      } else if (doca < docb) {
        if (++i >= lena) break;
        doca=a[i];
      } else {
        target[icount++] = doca;
        if (++i >= lena) break;
        doca=a[i];
        if (++j >= lenb) break;
        docb=b[j];
      }
    }
    return icount;
  }

  /** Puts the intersection of a and b into the target array and returns the size.
   * lena should be smaller than lenb */
  protected static int intersectionBinarySearch(int[] a, int lena, int[] b, int lenb, int[] target) {
    int step = (lenb/lena)+1;
    step = step + step;


    int icount = 0;
    int low = 0;
    int max = lenb-1;

    for (int i=0; i<lena; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success!  we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }


      // binary search
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          target[icount++] = doca;
          low = mid+1;  // found it, so start at next element
          break;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
    }

    return icount;
  }

  @Override
  public DocSet intersection(DocSet other) {
    if (!(other instanceof SortedIntDocSet)) {
      int icount = 0;
      int arr[] = new int[docs.length];
      for (int i=0; i<docs.length; i++) {
        int doc = docs[i];
        if (other.exists(doc)) arr[icount++] = doc;
      }
      return new SortedIntDocSet(arr,icount);
    }

    int[] otherDocs = ((SortedIntDocSet)other).docs;
    int maxsz = Math.min(docs.length, otherDocs.length);
    int[] arr = new int[maxsz];
    int sz = intersection(docs, docs.length, otherDocs, otherDocs.length, arr);
    return new SortedIntDocSet(arr,sz);
  }


  protected static int andNotBinarySearch(int a[], int lena, int b[], int lenb, int[] target) {
    int step = (lenb/lena)+1;
    step = step + step;


    int count = 0;
    int low = 0;
    int max = lenb-1;

    outer:
    for (int i=0; i<lena; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success!  we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }


      // binary search
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          low = mid+1;  // found it, so start at next element
          continue outer;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
      target[count++] = doca;
    }

    return count;
  }

  /** puts the intersection of a and not b into the target array and returns the size */
  public static int andNot(int a[], int lena, int b[], int lenb, int[] target) {
    if (lena==0) return 0;
    if (lenb==0) {
      System.arraycopy(a,0,target,0,lena);
      return lena;
    }

    // if b is 8 times bigger than a, use the modified binary search.
    if ((lenb>>3) >= lena) {
      return andNotBinarySearch(a, lena, b, lenb, target);
    }

    int count=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      if (doca > docb) {
        if (++j >= lenb) break;
        docb=b[j];
      } else if (doca < docb) {
        target[count++] = doca;
        if (++i >= lena) break;
        doca=a[i];
      } else {
        if (++i >= lena) break;
        doca=a[i];
        if (++j >= lenb) break;
        docb=b[j];
      }
    }

    int leftover=lena - i;

    if (leftover > 0) {
      System.arraycopy(a,i,target,count,leftover);
      count += leftover;
    }

    return count;
  }

  @Override
  public DocSet andNot(DocSet other) {
    if (other.size()==0) return this;

    if (!(other instanceof SortedIntDocSet)) {
      int count = 0;
      int arr[] = new int[docs.length];
      for (int i=0; i<docs.length; i++) {
        int doc = docs[i];
        if (!other.exists(doc)) arr[count++] = doc;
      }
      return new SortedIntDocSet(arr,count);
    }

    int[] otherDocs = ((SortedIntDocSet)other).docs;
    int[] arr = new int[docs.length];
    int sz = andNot(docs, docs.length, otherDocs, otherDocs.length, arr);
    return new SortedIntDocSet(arr,sz);
  }


  public boolean exists(int doc) {
    // this could be faster by estimating where in the list the doc is likely to appear,
    // but we should get away from using exists() anyway.
    int low = 0;
    int high = docs.length-1;
    // binary search
    while (low <= high) {
      int mid = (low+high) >>> 1;
      int docb = docs[mid];

      if (docb < doc) {
        low = mid+1;
      }
      else if (docb > doc) {
        high = mid-1;
      }
      else {
        return true;
      }
    }
    return false;
  }


  public DocIterator iterator() {
    return new DocIterator() {
      int pos=0;
      public boolean hasNext() {
        return pos < docs.length;
      }

      public Integer next() {
        return nextDoc();
      }

      /**
       * The remove operation is not supported by this Iterator.
       */
      public void remove() {
        throw new UnsupportedOperationException("The remove operation is not supported by this Iterator.");
      }

      public int nextDoc() {
        return docs[pos++];
      }

      public float score() {
        return 0.0f;
      }
    };
  }

  @Override
  public OpenBitSet getBits() {
    int maxDoc = size() > 0 ? docs[size()-1] : 0;
    OpenBitSet bs = new OpenBitSet(maxDoc+1);
    for (int doc : docs) {
      bs.fastSet(doc);
    }
    return bs;
  }

}

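A quick check of the static helpers defined above, used outside of any DocSet; this is a sketch rather than part of the commit, and the arrays are arbitrary sorted example values:

import java.util.Arrays;
import org.apache.solr.search.SortedIntDocSet;

public class SortedIntOpsExample {
  public static void main(String[] args) {
    int[] a = {2, 4, 8, 16, 32};
    int[] b = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40};

    int[] inter = new int[Math.min(a.length, b.length)];
    int n = SortedIntDocSet.intersection(a, a.length, b, b.length, inter);
    System.out.println(Arrays.toString(Arrays.copyOf(inter, n)));  // [2, 4, 8]

    int[] diff = new int[a.length];
    int m = SortedIntDocSet.andNot(a, a.length, b, b.length, diff);
    System.out.println(Arrays.toString(Arrays.copyOf(diff, m)));   // [16, 32]
  }
}
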
@@ -20,6 +20,7 @@ package org.apache.solr.search;
import junit.framework.TestCase;

import java.util.Random;
import java.util.Arrays;

import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetIterator;

@@ -49,41 +50,75 @@ public class TestDocSet extends TestCase {
    return new HashDocSet(docs,0,docs.length);
  }

  public DocSet getIntDocSet(OpenBitSet bs) {
    int[] docs = new int[(int)bs.cardinality()];
    OpenBitSetIterator iter = new OpenBitSetIterator(bs);
    for (int i=0; i<docs.length; i++) {
      docs[i] = iter.nextDoc();
    }
    return new SortedIntDocSet(docs);
  }


  public DocSet getBitDocSet(OpenBitSet bs) {
    return new BitDocSet(bs);
  }

  public DocSet getDocSet(OpenBitSet bs) {
    return rand.nextInt(2)==0 ? getHashDocSet(bs) : getBitDocSet(bs);
    switch(rand.nextInt(3)) {
      case 0: return getIntDocSet(bs);
      case 1: return getHashDocSet(bs);
      case 2: return getBitDocSet(bs);
    }
    return null;
  }

  public void checkEqual(OpenBitSet bs, DocSet set) {
    for (int i=0; i<bs.capacity(); i++) {
      assertEquals(bs.get(i), set.exists(i));
    }
    assertEquals(bs.cardinality(), set.size());
  }

  public void iter(DocSet d1, DocSet d2) {
    // HashDocSet doesn't iterate in order.
    if (d1 instanceof HashDocSet || d2 instanceof HashDocSet) return;

    DocIterator i1 = d1.iterator();
    DocIterator i2 = d2.iterator();

    assert(i1.hasNext() == i2.hasNext());

    for(;;) {
      boolean b1 = i1.hasNext();
      boolean b2 = i2.hasNext();
      assertEquals(b1,b2);
      if (!b1) break;
      assertEquals(i1.nextDoc(), i2.nextDoc());
    }
  }

  protected void doSingle(int maxSize) {
    int sz = rand.nextInt(maxSize+1);
    int sz2 = rand.nextInt(maxSize);
    OpenBitSet a1 = getRandomSet(sz, rand.nextInt(sz+1));
    OpenBitSet a2 = getRandomSet(sz, rand.nextInt(sz2+1));
    OpenBitSet bs1 = getRandomSet(sz, rand.nextInt(sz+1));
    OpenBitSet bs2 = getRandomSet(sz, rand.nextInt(sz2+1));

    DocSet b1 = getDocSet(a1);
    DocSet b2 = getDocSet(a2);
    DocSet a1 = new BitDocSet(bs1);
    DocSet a2 = new BitDocSet(bs2);
    DocSet b1 = getDocSet(bs1);
    DocSet b2 = getDocSet(bs2);

    // System.out.println("b1="+b1+", b2="+b2);
    checkEqual(bs1,b1);
    checkEqual(bs2,b2);

    assertEquals((int)a1.cardinality(), b1.size());
    assertEquals((int)a2.cardinality(), b2.size());
    iter(a1,b1);
    iter(a2,b2);

    checkEqual(a1,b1);
    checkEqual(a2,b2);

    OpenBitSet a_and = (OpenBitSet)a1.clone(); a_and.and(a2);
    OpenBitSet a_or = (OpenBitSet)a1.clone(); a_or.or(a2);
    // OpenBitSet a_xor = (OpenBitSet)a1.clone(); a_xor.xor(a2);
    OpenBitSet a_andn = (OpenBitSet)a1.clone(); a_andn.andNot(a2);
    OpenBitSet a_and = (OpenBitSet) bs1.clone(); a_and.and(bs2);
    OpenBitSet a_or = (OpenBitSet) bs1.clone(); a_or.or(bs2);
    // OpenBitSet a_xor = (OpenBitSet)bs1.clone(); a_xor.xor(bs2);
    OpenBitSet a_andn = (OpenBitSet) bs1.clone(); a_andn.andNot(bs2);

    checkEqual(a_and, b1.intersection(b2));
    checkEqual(a_or, b1.union(b2));

@@ -102,12 +137,15 @@
  }

  public void testRandomDocSets() {
    doMany(300, 5000);
    // Make the size big enough to go over certain limits (such as one set
    // being 8 times the size of another in the int set, or going over 2 times
    // 64 bits for the bit doc set).  Smaller sets can hit more boundary conditions though.

    doMany(130, 10000);
    //doMany(130, 1000000);
  }


  public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
    int n = rand.nextInt(maxSetSize);
  public DocSet getRandomDocSet(int n, int maxDoc) {
    OpenBitSet obs = new OpenBitSet(maxDoc);
    int[] a = new int[n];
    for (int i=0; i<n; i++) {

@@ -118,14 +156,29 @@
        break;
      }
    }
    return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);

    if (n <= smallSetCuttoff) {
      if (smallSetType == 0) {
        Arrays.sort(a);
        return new SortedIntDocSet(a);
      } else if (smallSetType == 1) {
        Arrays.sort(a);
        return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
      }
    }

    return new BitDocSet(obs, n);
  }

  public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
  public DocSet[] getRandomSets(int nSets, int minSetSize, int maxSetSize, int maxDoc) {
    DocSet[] sets = new DocSet[nSets];

    for (int i=0; i<nSets; i++) {
      sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
      int sz;
      sz = rand.nextInt(maxSetSize-minSetSize+1)+minSetSize;
      // different distribution
      // sz = (maxSetSize+1)/(rand.nextInt(maxSetSize)+1) + minSetSize;
      sets[i] = getRandomDocSet(sz,maxDoc);
    }

    return sets;

@@ -160,30 +213,43 @@
  }
  ***/

  public static int smallSetType = 0;  // 0==sortedint, 1==hash, 2==openbitset
  public static int smallSetCuttoff=3000;

  /***
  public void testIntersectionSizePerformance() {
    loadfactor=.75f;
    rand=new Random(12345);  // make deterministic
    int maxSetsize=4000;
    int nSets=128;
    int iter=10;
    loadfactor=.75f;      // for HashDocSet
    rand=new Random(1);   // make deterministic

    int minBigSetSize=1,maxBigSetSize=30000;
    int minSmallSetSize=1,maxSmallSetSize=30000;
    int nSets=1024;
    int iter=1;
    int maxDoc=1000000;
    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);


    smallSetCuttoff = maxDoc>>6; // break even for SortedIntSet is /32... but /64 is better for performance
    // smallSetCuttoff = maxDoc;


    DocSet[] bigsets = getRandomSets(nSets, minBigSetSize, maxBigSetSize, maxDoc);
    DocSet[] smallsets = getRandomSets(nSets, minSmallSetSize, maxSmallSetSize, maxDoc);
    int ret=0;
    long start=System.currentTimeMillis();
    for (int i=0; i<iter; i++) {
      for (DocSet s1 : sets) {
        for (DocSet s2 : sets) {
      for (DocSet s1 : bigsets) {
        for (DocSet s2 : smallsets) {
          ret += s1.intersectionSize(s2);
        }
      }
    }
    long end=System.currentTimeMillis();
    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
    if (ret==-1)System.out.println("wow!");
    System.out.println("intersectionSizePerformance="+(end-start)+" ms");
    System.out.println("ret="+ret);
  }
  ***/


  /****
  public void testExistsPerformance() {
    loadfactor=.75f;
    rand=new Random(12345);  // make deterministic