new OpenBitSet, BitDocSet changes to use it, HashDocSet size params moved to the DocSetHitCollector: SOLR-15

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@413324 13f79535-47bb-0310-9956-ffa450edef68
2006-06-10 16:05:12 +00:00 · 2006-06-10 16:05:12 +00:00 · e63135a59f
parent 3c7c44fc11
commit e63135a59f
18 changed files with 2217 additions and 170 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -29,6 +29,9 @@ Optimizations
    single lucene query.
 2. BitDocSet.intersectionSize(HashDocSet) no longer generates an intermediate
    set
+ 3. OpenBitSet completed, replaces BitSet as the implementation for BitDocSet.
+    Iteration is faster, and BitDocSet.intersectionSize(BitDocSet)
+    is between 3 and 4 times faster. (yonik, SOLR-15)

 Bug Fixes
 1. Fixed delete-by-id for field types who's indexed form is different
--- a/build.xml
+++ b/build.xml
@ -174,7 +174,7 @@
          depends="compileTests" >
    <!-- DEPRECATED: no description so it doesn't show up in project help -->
    <java classname="SolrTest" fork="true" dir="src/apps/SolrTest" failonerror="true">
-     <arg line="-test newtest.txt"/>
+     <arg line="-test newtest.txt -qargs qt=test"/>
     <classpath>
       <path refid="test.run.classpath" />
     </classpath>
--- a/src/apps/SolrTest/solr/conf/solrconfig.xml
+++ b/src/apps/SolrTest/solr/conf/solrconfig.xml
@ -118,7 +118,8 @@

    <queryResultWindowSize>10</queryResultWindowSize>

-    <HashDocSet maxSize="3000" loadFactor="0.75"/>
+    <!-- set maxSize low to exercise both types of sets -->
+    <HashDocSet maxSize="3" loadFactor="0.75"/>


    <!-- boolToFilterOptimizer converts boolean clauses with zero boost
--- a/src/java/org/apache/solr/schema/IndexSchema.java
+++ b/src/java/org/apache/solr/schema/IndexSchema.java
@ -360,9 +360,10 @@ public final class IndexSchema {
    if (node==null) {
      log.warning("no default search field specified in schema.");
    } else {
-      String defName=node.getNodeValue().trim();
-      defaultSearchFieldName = getIndexedField(defName)!=null ? defName : null;
-      log.info("default search field is "+defName);
+      defaultSearchFieldName=node.getNodeValue().trim();
+      // throw exception if specified, but not found or not indexed
+      if (defaultSearchFieldName!=null) getIndexedField(defaultSearchFieldName);
+      log.info("default search field is "+defaultSearchFieldName);
    }

    node = (Node) xpath.evaluate("/schema/uniqueKey/text()", document, XPathConstants.NODE);
--- a/src/java/org/apache/solr/search/BitDocSet.java
+++ b/src/java/org/apache/solr/search/BitDocSet.java
@ -16,7 +16,8 @@

 package org.apache.solr.search;

-import java.util.BitSet;
+import org.apache.solr.util.OpenBitSet;
+import org.apache.solr.util.BitSetIterator;

 /**
 * <code>BitDocSet</code> represents an unordered set of Lucene Document Ids
@ -27,23 +28,29 @@ import java.util.BitSet;
 * @since solr 0.9
 */
 public class BitDocSet extends DocSetBase {
-  final BitSet bits;
+  final OpenBitSet bits;
  int size;    // number of docs in the set (cached for perf)

  public BitDocSet() {
-    bits = new BitSet();
+    bits = new OpenBitSet();
  }

-  public BitDocSet(BitSet bits) {
+  /** Construct a BitDocSet.
+   * The capacity of the OpenBitSet should be at least maxDoc() */
+  public BitDocSet(OpenBitSet bits) {
    this.bits = bits;
    size=-1;
  }

-  public BitDocSet(BitSet bits, int size) {
+  /** Construct a BitDocSet, and provides the number of set bits.
+   * The capacity of the OpenBitSet should be at least maxDoc()
+   */
+  public BitDocSet(OpenBitSet bits, int size) {
    this.bits = bits;
    this.size = size;
  }

+  /*** DocIterator using nextSetBit()
  public DocIterator iterator() {
    return new DocIterator() {
      int pos=bits.nextSetBit(0);
@ -70,12 +77,42 @@ public class BitDocSet extends DocSetBase {
      }
    };
  }
+  ***/
+
+  /**
+   * Returns an iterator over the document ids in this set, walking the set
+   * bits of the underlying OpenBitSet with a BitSetIterator.
+   * The iterator reads one doc ahead so that hasNext() is a simple comparison.
+   * score() always returns 0.0f.
+   */
+  public DocIterator iterator() {
+    return new DocIterator() {
+      private final BitSetIterator iter = new BitSetIterator(bits);
+      // look-ahead: the doc that the next call to nextDoc() will return (-1 when exhausted)
+      private int pos = iter.next();
+      // the doc most recently returned by nextDoc(), or -1 if there is nothing to remove
+      private int last = -1;
+
+      public boolean hasNext() {
+        return pos>=0;
+      }
+
+      public Integer next() {
+        return nextDoc();
+      }
+
+      /** Removes the doc most recently returned by next()/nextDoc().
+       * @throws IllegalStateException if no doc has been returned yet, or it was already removed
+       */
+      public void remove() {
+        // pos is the look-ahead doc, so clearing it would remove the wrong
+        // element; the Iterator contract is to remove the last one returned.
+        if (last<0) {
+          throw new IllegalStateException("no current doc to remove");
+        }
+        bits.clear(last);
+        last=-1;
+        // the cached cardinality is no longer valid; force size() to recount
+        BitDocSet.this.size=-1;
+      }
+
+      public int nextDoc() {
+        last=pos;
+        pos=iter.next();
+        return last;
+      }
+
+      public float score() {
+        return 0.0f;
+      }
+    };
+  }
+
+

  /**
   *
-   * @return the <b>internal</b> BitSet that should <b>not</b> be modified.
+   * @return the <b>internal</b> OpenBitSet that should <b>not</b> be modified.
   */
-  public BitSet getBits() {
+  public OpenBitSet getBits() {
    return bits;
  }

@ -91,7 +128,7 @@ public class BitDocSet extends DocSetBase {

  public int size() {
    if (size!=-1) return size;
-    return size=bits.cardinality();
+    return size=(int)bits.cardinality();
  }

  /**
@ -106,7 +143,25 @@ public class BitDocSet extends DocSetBase {
    return bits.get(doc);
  }

+  /** Returns the number of documents present in both this set and <code>other</code>.
+   * Two BitDocSets are counted directly on their bit sets without building
+   * an intermediate set; any other DocSet type is asked to do the work.
+   */
+  public int intersectionSize(DocSet other) {
+    if (!(other instanceof BitDocSet)) {
+      // delegate to the other implementation - it must not delegate back to us
+      return other.intersectionSize(this);
+    }
+    final BitDocSet that = (BitDocSet)other;
+    final long count = OpenBitSet.intersectionCount(this.bits, that.bits);
+    return (int)count;
+  }
+
+  /** Returns the number of documents present in this set, in <code>other</code>, or in both.
+   * Two BitDocSets are counted directly on their bit sets without building
+   * an intermediate set; any other DocSet type is asked to do the work.
+   */
+  public int unionSize(DocSet other) {
+    if (!(other instanceof BitDocSet)) {
+      // delegate to the other implementation - it must not delegate back to us
+      return other.unionSize(this);
+    }
+    final BitDocSet that = (BitDocSet)other;
+    final long count = OpenBitSet.unionCount(this.bits, that.bits);
+    return (int)count;
+  }
+
  public long memSize() {
-    return (bits.size() >> 3) + 16;
+    return (bits.getBits().length << 3) + 16;
  }
 }
--- a/src/java/org/apache/solr/search/DocSet.java
+++ b/src/java/org/apache/solr/search/DocSet.java
@ -17,6 +17,7 @@
 package org.apache.solr.search;

 import org.apache.solr.core.SolrException;
+import org.apache.solr.util.OpenBitSet;

 import java.util.BitSet;

@ -83,10 +84,10 @@ public interface DocSet /* extends Collection<Integer> */ {
   * a SolrIndexSearcher method, it's not safe to modify the BitSet.
   *
   * @return
-   * A BitSet with the bit number of every docid set in the set.
+   * An OpenBitSet with the bit number of every docid set in the set.
   */
  @Deprecated
-  public BitSet getBits();
+  public OpenBitSet getBits();

  /**
   * Returns the approximate amount of memory taken by this DocSet.
@ -168,8 +169,8 @@ abstract class DocSetBase implements DocSet {
   *
   * @see BitDocSet#getBits
   */
-  public BitSet getBits() {
-    BitSet bits = new BitSet();
+  public OpenBitSet getBits() {
+    OpenBitSet bits = new OpenBitSet();
    for (DocIterator iter = iterator(); iter.hasNext();) {
      bits.set(iter.nextDoc());
    }
@ -185,13 +186,13 @@ abstract class DocSetBase implements DocSet {
    }

    // Default... handle with bitsets.
-    BitSet newbits = (BitSet)(this.getBits().clone());
+    OpenBitSet newbits = (OpenBitSet)(this.getBits().clone());
    newbits.and(other.getBits());
    return new BitDocSet(newbits);
  }

  public DocSet union(DocSet other) {
-    BitSet newbits = (BitSet)(this.getBits().clone());
+    OpenBitSet newbits = (OpenBitSet)(this.getBits().clone());
    newbits.or(other.getBits());
    return new BitDocSet(newbits);
  }
@ -207,12 +208,11 @@ abstract class DocSetBase implements DocSet {
    return intersection(other).size();
  }

-  // TODO: more efficient implementations
+  // TODO: do an efficient implementation
  public int unionSize(DocSet other) {
    return union(other).size();
  }

-
 }


--- a/src/java/org/apache/solr/search/DocSetHitCollector.java
+++ b/src/java/org/apache/solr/search/DocSetHitCollector.java
@ -1,8 +1,8 @@
 package org.apache.solr.search;

 import org.apache.lucene.search.HitCollector;
-
-import java.util.BitSet;
+import org.apache.solr.util.OpenBitSet;
+import org.apache.solr.core.SolrConfig;

 /**
 * @author yonik
@ -10,15 +10,18 @@ import java.util.BitSet;
 */

 final class DocSetHitCollector extends HitCollector {
+
+  static float HASHSET_INVERSE_LOAD_FACTOR = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f);
+  static int HASHDOCSET_MAXSIZE= SolrConfig.config.getInt("//HashDocSet/@maxSize",-1);
+
  int pos=0;
-  BitSet bits;
+  OpenBitSet bits;
  final int maxDoc;

  // in case there aren't that many hits, we may not want a very sparse
  // bit array.  Optimistically collect the first few docs in an array
  // in case there are only a few.
-  static final int ARRAY_COLLECT_SZ=HashDocSet.MAX_SIZE;
-  final int[] scratch = ARRAY_COLLECT_SZ>0 ? new int[ARRAY_COLLECT_SZ] : null;
+  // HASHDOCSET_MAXSIZE is negative when HashDocSet/@maxSize is not configured,
+  // and allocating an array with a negative length throws
+  // NegativeArraySizeException.  Clamp to zero: with an empty scratch array
+  // every collected doc goes straight into the bit set.
+  final int[] scratch = new int[Math.max(HASHDOCSET_MAXSIZE, 0)];

  // todo - could pass in bitset and an operation also...
  DocSetHitCollector(int maxDoc) {
@ -33,24 +36,24 @@ final class DocSetHitCollector extends HitCollector {
    // than scanning through a potentially huge bit vector.
    // FUTURE: when search methods all start returning docs in order, maybe
    // we could have a ListDocSet() and use the collected array directly.
-    if (pos < ARRAY_COLLECT_SZ) {
+    if (pos < scratch.length) {
      scratch[pos]=doc;
    } else {
      // this conditional could be removed if BitSet was preallocated, but that
      // would take up more memory, and add more GC time...
-      if (bits==null) bits = new BitSet(maxDoc);
-      bits.set(doc);
+      if (bits==null) bits = new OpenBitSet(maxDoc);
+      bits.fastSet(doc);
    }

    pos++;
  }

  public DocSet getDocSet() {
-    if (pos<=ARRAY_COLLECT_SZ) {
-      return new HashDocSet(scratch,0,pos);
+    if (pos<=scratch.length) {
+      return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
    } else {
      // set the bits for ids that were collected in the array
-      for (int i=0; i<ARRAY_COLLECT_SZ; i++) bits.set(scratch[i]);
+      for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);
      return new BitDocSet(bits,pos);
    }
  }
--- a/src/java/org/apache/solr/search/HashDocSet.java
+++ b/src/java/org/apache/solr/search/HashDocSet.java
@ -16,7 +16,7 @@

 package org.apache.solr.search;

-import org.apache.solr.core.SolrConfig;
+import org.apache.solr.util.BitUtil;


 /**
@ -30,10 +30,14 @@ import org.apache.solr.core.SolrConfig;
 * @since solr 0.9
 */
 public final class HashDocSet extends DocSetBase {
-  // keep track of the inverse of the Loadfactor  since
-  // multiplication is so much faster than division.
-  final static float inverseLoadfactor = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f);
-  public final static int MAX_SIZE = SolrConfig.config.getInt("//HashDocSet/@maxSize",-1);
+  // final static float inverseLoadfactor = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f);
+  /** Default load factor to use for HashDocSets.  We keep track of the inverse
+   *  since multiplication is so much faster than division.  The default
+   *  is 1.0f / 0.75f
+   */
+  static float DEFAULT_INVERSE_LOAD_FACTOR = 1.0f /0.75f;
+
+  // public final static int MAX_SIZE = SolrConfig.config.getInt("//HashDocSet/@maxSize",-1);


  // lucene docs are numbered from 0, so a neg number must be used for missing.
@ -47,10 +51,15 @@ public final class HashDocSet extends DocSetBase {
  private final int mask;

  public HashDocSet(int[] docs, int offset, int len) {
-    int tsize = Math.max(nextHighestPowerOfTwo(len), 1);
-    if (tsize < len * inverseLoadfactor) {
+    this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
+  }
+
+  public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
+    int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
+    if (tsize < len * inverseLoadFactor) {
      tsize <<= 1;
    }
+
    tablesize = tsize;
    mask=tablesize-1;

@ -64,18 +73,6 @@ public final class HashDocSet extends DocSetBase {
    size = len;
  }

-  static int nextHighestPowerOfTwo(int v) {
-    v--;
-    v |= v >> 1;
-    v |= v >> 2;
-    v |= v >> 4;
-    v |= v >> 8;
-    v |= v >> 16;
-    v++;
-    return v;
-  }
-
-
  void put(int doc) {
    table[getSlot(doc)]=doc;
  }
--- a/src/java/org/apache/solr/search/SolrIndexSearcher.java
+++ b/src/java/org/apache/solr/search/SolrIndexSearcher.java
@ -29,6 +29,7 @@ import org.apache.solr.core.SolrInfoMBean;
 import org.apache.solr.core.SolrInfoRegistry;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.util.NamedList;
+import org.apache.solr.util.OpenBitSet;

 import java.io.IOException;
 import java.net.URL;
@ -459,9 +460,22 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {


  protected DocSet getDocSetNC(Query query, DocSet filter) throws IOException {
-    SetHitCollector hc = new SetHitCollector(filter, maxDoc());
+    if (filter==null) {
+      DocSetHitCollector hc = new DocSetHitCollector(maxDoc());
      searcher.search(query,null,hc);
      return hc.getDocSet();
+    } else {
+      // FUTURE: if the filter is sorted by docid, could use skipTo (SkipQueryFilter)
+      final DocSetHitCollector hc = new DocSetHitCollector(maxDoc());
+      final DocSet filt = filter;
+      searcher.search(query, null, new HitCollector() {
+        public void collect(int doc, float score) {
+          if (filt.exists(doc)) hc.collect(doc,score);
+        }
+      }
+      );
+      return hc.getDocSet();
+    }
  }


@ -521,7 +535,12 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
  * This method is not cache-aware and no caches are checked.
  */
  public DocSet convertFilter(Filter lfilter) throws IOException {
-    return new BitDocSet(lfilter.bits(this.reader));
+    BitSet bs = lfilter.bits(this.reader);
+    OpenBitSet obs = new OpenBitSet(bs.size());
+    for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)) {
+      obs.fastSet(i);
+    }
+    return new BitDocSet(obs);
  }

  /**
@ -1192,63 +1211,6 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
 }


-// Todo: counting only hit collector (for speed comparison w/ caching filters)
-// todo: fast term query
-// todo: do a both hit collector that can get a DocList and DocSet at the same time
-
-final class SetHitCollector extends HitCollector {
-  int pos=0;
-  final DocSet filter;
-    // should we bother with filters at this point?
-    // how much faster would it be to take the check for
-    // filter!=null out of the loop??? depends on HotSpot... it may
-    // optimize it anyway.
-
-  final BitSet bits;
-
-  // in case there aren't that many hits, we may not want a very sparse
-  // bit array.  Optimistically collect the first few docs in an array
-  // in case there are only a few.
-  static final int ARRAY_COLLECT_SZ=HashDocSet.MAX_SIZE;
-  final int[] scratch = ARRAY_COLLECT_SZ>0 ? new int[ARRAY_COLLECT_SZ] : null;
-
-  public SetHitCollector(DocSet filter, int maxDoc) {
-    bits = new BitSet(maxDoc);
-    this.filter = filter;
-  }
-
-  public void collect(int doc, float score) {
-    if (filter!=null && !filter.exists(doc)) return;
-
-    // OPTIMIZATION: should I only set bits *after* I have run out of
-    // room in scratch?  (then at the end I could add all of the docs
-    // in scratch to the bitset.
-    bits.set(doc);
-
-    // optimistically collect the first docs in an array
-    // in case the total number will be small enough to represent
-    // as a HashDocSet() instead...
-    // It is assumed that storing in this array will be quicker to convert
-    // than scanning through a potentially huge bit vector.
-    // FUTURE: when search methods all start returning docs in order, maybe
-    // we could have a SortedListDocSet() and use the collected array directly.
-    if (pos < ARRAY_COLLECT_SZ) {
-      scratch[pos]=doc;
-    }
-
-    pos++;
-  }
-
-  public DocSet getDocSet() {
-    if (pos<=ARRAY_COLLECT_SZ) {
-      return new HashDocSet(scratch,0,pos);
-    }
-    return new BitDocSet(bits,pos);
-  }
-
-}
-
-

 // Lucene's HitQueue isn't public, so here is our own.
 final class ScorePriorityQueue extends PriorityQueue {
--- a/src/java/org/apache/solr/search/test/TestDocSet.java
+++ b/src/java/org/apache/solr/search/test/TestDocSet.java
@ -19,6 +19,7 @@ package org.apache.solr.search.test;
 import org.apache.solr.search.BitDocSet;
 import org.apache.solr.search.HashDocSet;
 import org.apache.solr.search.DocSet;
+import org.apache.solr.util.OpenBitSet;

 import java.util.Random;
 import java.util.BitSet;
@ -38,20 +39,20 @@ public class TestDocSet {
  static Random rand = new Random();


-  static BitSet bs;
+  static OpenBitSet bs;
  static BitDocSet bds;
  static HashDocSet hds;
  static int[] ids; // not unique

  static void generate(int maxSize, int bitsToSet) {
-    bs = new BitSet(maxSize);
+    bs = new OpenBitSet(maxSize);
    ids = new int[bitsToSet];
    int count=0;
    if (maxSize>0) {
      for (int i=0; i<bitsToSet; i++) {
        int id=rand.nextInt(maxSize);
        if (!bs.get(id)) {
-          bs.set(id);
+          bs.fastSet(id);
          ids[count++]=id;
        }
      }
@ -79,7 +80,7 @@ public class TestDocSet {

    int ret=0;

-    BitSet[] sets = new BitSet[numSets];
+    OpenBitSet[] sets = new OpenBitSet[numSets];
    DocSet[] bset = new DocSet[numSets];
    DocSet[] hset = new DocSet[numSets];
    BitSet scratch=new BitSet();
@ -96,14 +97,14 @@ public class TestDocSet {
    if ("test".equals(test)) {
      for (int it=0; it<iter; it++) {
        generate(randSize ? rand.nextInt(bitSetSize) : bitSetSize, numBitsSet);
-        BitSet bs1=bs;
+        OpenBitSet bs1=bs;
        BitDocSet bds1=bds;
        HashDocSet hds1=hds;
        generate(randSize ? rand.nextInt(bitSetSize) : bitSetSize, numBitsSet);

-        BitSet res = ((BitSet)bs1.clone());
+        OpenBitSet res = ((OpenBitSet)bs1.clone());
        res.and(bs);
-        int icount = res.cardinality();
+        int icount = (int)res.cardinality();

        test(bds1.intersection(bds).size() == icount);
        test(bds1.intersectionSize(bds) == icount);
--- a/src/java/org/apache/solr/tst/TestRequestHandler.java
+++ b/src/java/org/apache/solr/tst/TestRequestHandler.java
@ -29,6 +29,7 @@ import java.net.URL;

 import org.apache.solr.util.StrUtils;
 import org.apache.solr.util.NamedList;
+import org.apache.solr.util.OpenBitSet;
 import org.apache.solr.search.*;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrException;
@ -223,9 +224,9 @@ public class TestRequestHandler implements SolrRequestHandler {
      test( both3.docList.equals(both.docList) );
      test( both3.docSet.equals(both.docSet) );

-      BitSet bits = both.docSet.getBits();
-      BitSet neg = ((BitSet)bits.clone());
-      neg.flip(0, bits.length());
+      OpenBitSet bits = both.docSet.getBits();
+      OpenBitSet neg = ((OpenBitSet)bits.clone());
+      neg.flip(0, bits.capacity());

      // use the negative as a filter (should result in 0 matches)
      // todo - fix if filter is not null
--- a/src/java/org/apache/solr/util/BitSetIterator.java
+++ b/src/java/org/apache/solr/util/BitSetIterator.java
@ -0,0 +1,139 @@
+package org.apache.solr.util;
+
+/** An iterator to iterate over set bits in an OpenBitSet.
+ * This is faster than nextSetBit() for iterating over the complete set of bits,
+ * especially when the density of the bits set is high.
+ *
+ * @author yonik
+ * @version $Id$
+ */
+public class BitSetIterator {
+
+  // The General Idea: instead of having an array per byte that has
+  // the offsets of the next set bit, that array could be
+  // packed inside a 32 bit integer (8 4 bit numbers).  That
+  // should be faster than accessing an array for each index, and
+  // the total array size is kept smaller (256*sizeof(int))=1K
+  protected final static int[] bitlist={
+    0x0,0x1,0x2,0x21,0x3,0x31,0x32,0x321,0x4,0x41,0x42,0x421,0x43,0x431,0x432,0x4321,0x5,0x51,0x52,0x521,0x53,0x531,0x532,0x5321,0x54,0x541,0x542,0x5421,0x543,0x5431,0x5432,0x54321,0x6,0x61,0x62,0x621,0x63,0x631,0x632,0x6321,0x64,0x641,0x642,0x6421,0x643,0x6431,0x6432,0x64321,0x65,0x651,0x652,0x6521,0x653,0x6531,0x6532,0x65321,0x654,0x6541,0x6542,0x65421,0x6543,0x65431,0x65432,0x654321,0x7,0x71,0x72,0x721,0x73,0x731,0x732,0x7321,0x74,0x741,0x742,0x7421,0x743,0x7431,0x7432,0x74321,0x75,0x751,0x752,0x7521,0x753,0x7531,0x7532,0x75321,0x754,0x7541,0x7542,0x75421,0x7543,0x75431,0x75432,0x754321,0x76,0x761,0x762,0x7621,0x763,0x7631,0x7632,0x76321,0x764,0x7641,0x7642,0x76421,0x7643,0x76431,0x76432,0x764321,0x765,0x7651,0x7652,0x76521,0x7653,0x76531,0x76532,0x765321,0x7654,0x76541,0x76542,0x765421,0x76543,0x765431,0x765432,0x7654321,0x8,0x81,0x82,0x821,0x83,0x831,0x832,0x8321,0x84,0x841,0x842,0x8421,0x843,0x8431,0x8432,0x84321,0x85,0x851,0x852,0x8521,0x853,0x8531,0x8532,0x85321,0x854,0x8541,0x8542,0x85421,0x8543,0x85431,0x85432,0x854321,0x86,0x861,0x862,0x8621,0x863,0x8631,0x8632,0x86321,0x864,0x8641,0x8642,0x86421,0x8643,0x86431,0x86432,0x864321,0x865,0x8651,0x8652,0x86521,0x8653,0x86531,0x86532,0x865321,0x8654,0x86541,0x86542,0x865421,0x86543,0x865431,0x865432,0x8654321,0x87,0x871,0x872,0x8721,0x873,0x8731,0x8732,0x87321,0x874,0x8741,0x8742,0x87421,0x8743,0x87431,0x87432,0x874321,0x875,0x8751,0x8752,0x87521,0x8753,0x87531,0x87532,0x875321,0x8754,0x87541,0x87542,0x875421,0x87543,0x875431,0x875432,0x8754321,0x876,0x8761,0x8762,0x87621,0x8763,0x87631,0x87632,0x876321,0x8764,0x87641,0x87642,0x876421,0x87643,0x876431,0x876432,0x8764321,0x8765,0x87651,0x87652,0x876521,0x87653,0x876531,0x876532,0x8765321,0x87654,0x876541,0x876542,0x8765421,0x876543,0x8765431,0x8765432,0x87654321
+  };
+  /***** the python code that generated bitlist
+  def bits2int(val):
+  arr=0
+  for shift in range(8,0,-1):
+    if val & 0x80:
+      arr = (arr << 4) | shift
+    val = val << 1
+  return arr
+
+  def int_table():
+    tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ]
+    return ','.join(tbl)
+  ******/
+
+  // hmmm, what about an iterator that finds zeros though,
+  // or a reverse iterator... should they be separate classes
+  // for efficiency, or have a common root interface?  (or
+  // maybe both?  could ask for a SetBitsIterator, etc...
+
+
+  private final long[] arr;
+  private final int words;
+  private int i=-1;
+  private long word;
+  private int wordShift;
+  private int indexArray;
+
+  /** Creates an iterator over the given OpenBitSet, using the array returned by
+   * its getBits() and the word count returned by its getNumWords().
+   */
+  public BitSetIterator(OpenBitSet obs) {
+    this(obs.getBits(), obs.getNumWords());
+  }
+
+  /** Creates an iterator over the first <code>numWords</code> words of <code>bits</code>.
+   * The array is referenced directly, not copied.
+   */
+  public BitSetIterator(long[] bits, int numWords) {
+    this.arr = bits;
+    this.words = numWords;
+  }
+
+  // 64 bit shifts
+  // Skips the all-zero low 32, 16 and 8 bit groups of the current word (a
+  // binary search for its lowest non-zero byte), adding the number of bits
+  // skipped to wordShift, then loads the packed bit positions of that byte
+  // into indexArray.  Callers only invoke this when word is non-zero.
+  private void shift() {
+    if ((int)word ==0) {wordShift +=32; word = word >>>32; }
+    if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; }
+    if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; }
+    indexArray = bitlist[(int)word & 0xff];
+  }
+
+  /***** alternate shift implementations
+  // 32 bit shifts, but a long shift needed at the end
+  private void shift2() {
+    int y = (int)word;
+    if (y==0) {wordShift +=32; y = (int)(word >>>32); }
+    if ((y & 0x0000FFFF) == 0) { wordShift +=16; y>>>=16; }
+    if ((y & 0x000000FF) == 0) { wordShift +=8; y>>>=8; }
+    indexArray = bitlist[y & 0xff];
+    word >>>= (wordShift +1);
+  }
+
+  private void shift3() {
+    int lower = (int)word;
+    int lowByte = lower & 0xff;
+    if (lowByte != 0) {
+      indexArray=bitlist[lowByte];
+      return;
+    }
+    shift();
+  }
+  ******/
+
+  /** Returns the index of the next set bit, or -1 once all words have been
+   * consumed.
+   */
+  public int next() {
+    if (indexArray==0) {
+      // the positions of the current byte are used up: discard that byte
+      // (if there is a current word) before looking for the next non-zero one
+      if (word!=0) {
+        word >>>= 8;
+        wordShift += 8;
+      }
+
+      // advance to the next word that has any bit set
+      while (word==0) {
+        if (++i >= words) return -1;
+        word = arr[i];
+        // -1 because the positions packed in bitlist are 1 based
+        wordShift =-1;  // loop invariant code motion should move this
+      }
+
+      // after the first time, should I go with a linear search, or
+      // stick with the binary search in shift?
+      shift();
+    }
+
+    // take the lowest 4 bit position out of indexArray
+    int bitIndex = (indexArray & 0x0f) + wordShift;
+    indexArray >>>= 4;
+    // should i<<6 be cached as a separate variable?
+    // it would only save one cycle in the best circumstances.
+    return (i<<6) + bitIndex;
+  }
+
+  /** Repositions the iterator at <code>fromIndex</code> and returns the index of
+   * the first set bit at or after it, or -1 if there is none.  Subsequent
+   * calls to next() continue from the returned position.
+   * NOTE(review): a negative fromIndex is not checked here - confirm callers
+   * never pass one.
+   */
+  int next(int fromIndex) {
+    indexArray=0;
+    i = fromIndex >> 6;
+    if (i>=words) {
+      word =0; // setup so next() will also return -1
+      return -1;
+    }
+    // drop the bits below fromIndex within its word
+    wordShift = fromIndex & 0x3f;
+    word = arr[i] >>> wordShift;
+    if (word !=0) {
+      wordShift--; // compensate for 1 based arrIndex
+    } else {
+      // nothing left in this word: move on to the next non-zero word
+      while (word ==0) {
+        if (++i >= words) return -1;
+        word = arr[i];
+      }
+      wordShift =-1;
+    }
+
+    shift();
+
+    // take the lowest 4 bit position out of indexArray
+    int bitIndex = (indexArray & 0x0f) + wordShift;
+    indexArray >>>= 4;
+    // should i<<6 be cached as a separate variable?
+    // it would only save one cycle in the best circumstances.
+    return (i<<6) + bitIndex;
+  }
+
+}
--- a/src/java/org/apache/solr/util/BitUtil.java
+++ b/src/java/org/apache/solr/util/BitUtil.java
@ -0,0 +1,783 @@
+package org.apache.solr.util;
+
+/**  A variety of high efficiency bit twiddling routines.
+ *
+ * @author yonik
+ * @version $Id$
+ */
+public class BitUtil {
+
+  /** Returns the number of bits set in the long */
+  public static int pop(long x) {
+  /* Hacker's Delight 32 bit pop function:
+   * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc
+   *
+  int pop(unsigned x) {
+     x = x - ((x >> 1) & 0x55555555);
+     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+     x = (x + (x >> 4)) & 0x0F0F0F0F;
+     x = x + (x >> 8);
+     x = x + (x >> 16);
+     return x & 0x0000003F;
+    }
+  ***/
+
+    // 64 bit java version of the C function from above
+    x = x - ((x >>> 1) & 0x5555555555555555L);
+    x = (x & 0x3333333333333333L) + ((x >>>2 ) & 0x3333333333333333L);
+    x = (x + (x >>> 4)) & 0x0F0F0F0F0F0F0F0FL;
+    x = x + (x >>> 8);
+    x = x + (x >>> 16);
+    x = x + (x >>> 32);
+    return ((int)x) & 0x7F;
+  }
+
+  /** Returns the number of set bits in an array of longs.
+   *
+   * @param A the words to count; only read, never written
+   * @param wordOffset index of the first word to count
+   * @param numWords number of words to count, starting at wordOffset
+   * @return the number of one-bits in A[wordOffset] .. A[wordOffset+numWords-1]
+   */
+  public static long pop_array(long A[], int wordOffset, int numWords) {
+    /*
+    * Robert Harley and David Seal's bit counting algorithm, as documented
+    * in the revisions of Hacker's Delight
+    * http://www.hackersdelight.org/revisions.pdf
+    * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc
+    *
+    * This function was adapted to Java, and extended to use 64 bit words.
+    * if only we had access to wider registers like SSE from java...
+    *
+    * This function can be transformed to compute the popcount of other functions
+    * on bitsets via something like this:
+    * sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
+    *
+    */
+    int n = wordOffset+numWords;
+    // ones/twos/fours carry partial sums of weight 1, 2 and 4 between steps;
+    // tot8 counts the weight 8 carries (see the final sum below)
+    long tot=0, tot8=0;
+    long ones=0, twos=0, fours=0;
+
+    int i;
+    // main loop: 8 words per iteration
+    for (i = wordOffset; i <= n - 8; i+=8) {
+      /***  C macro from Hacker's Delight
+       #define CSA(h,l, a,b,c) \
+       {unsigned u = a ^ b; unsigned v = c; \
+       h = (a & b) | (u & v); l = u ^ v;}
+       ***/
+
+      long twosA,twosB,foursA,foursB,eights;
+
+      // CSA(twosA, ones, ones, A[i], A[i+1])
+      {
+        long b=A[i], c=A[i+1];
+        long u=ones ^ b;
+        twosA=(ones & b)|( u & c);
+        ones=u^c;
+      }
+      // CSA(twosB, ones, ones, A[i+2], A[i+3])
+      {
+        long b=A[i+2], c=A[i+3];
+        long u=ones^b;
+        twosB =(ones&b)|(u&c);
+        ones=u^c;
+      }
+      //CSA(foursA, twos, twos, twosA, twosB)
+      {
+        long u=twos^twosA;
+        foursA=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+      //CSA(twosA, ones, ones, A[i+4], A[i+5])
+      {
+        long b=A[i+4], c=A[i+5];
+        long u=ones^b;
+        twosA=(ones&b)|(u&c);
+        ones=u^c;
+      }
+      // CSA(twosB, ones, ones, A[i+6], A[i+7])
+      {
+        long b=A[i+6], c=A[i+7];
+        long u=ones^b;
+        twosB=(ones&b)|(u&c);
+        ones=u^c;
+      }
+      //CSA(foursB, twos, twos, twosA, twosB)
+      {
+        long u=twos^twosA;
+        foursB=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+
+      //CSA(eights, fours, fours, foursA, foursB)
+      {
+        long u=fours^foursA;
+        eights=(fours&foursA)|(u&foursB);
+        fours=u^foursB;
+      }
+      tot8 += pop(eights);
+    }
+
+    // handle trailing words in a binary-search manner...
+    // derived from the loop above by setting specific elements to 0.
+    // the original method in Hackers Delight used a simple for loop:
+    //   for (i = i; i < n; i++)      // Add in the last elements
+    //  tot = tot + pop(A[i]);
+
+    // at least 4 words left
+    if (i<=n-4) {
+      long twosA, twosB, foursA, eights;
+      {
+        long b=A[i], c=A[i+1];
+        long u=ones ^ b;
+        twosA=(ones & b)|( u & c);
+        ones=u^c;
+      }
+      {
+        long b=A[i+2], c=A[i+3];
+        long u=ones^b;
+        twosB =(ones&b)|(u&c);
+        ones=u^c;
+      }
+      {
+        long u=twos^twosA;
+        foursA=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+      eights=fours&foursA;
+      fours=fours^foursA;
+
+      tot8 += pop(eights);
+      i+=4;
+    }
+
+    // at least 2 words left
+    if (i<=n-2) {
+      long b=A[i], c=A[i+1];
+      long u=ones ^ b;
+      long twosA=(ones & b)|( u & c);
+      ones=u^c;
+
+      long foursA=twos&twosA;
+      twos=twos^twosA;
+
+      long eights=fours&foursA;
+      fours=fours^foursA;
+
+      tot8 += pop(eights);
+      i+=2;
+    }
+
+    // one last word, counted directly
+    if (i<n) {
+      tot += pop(A[i]);
+    }
+
+    // combine the partial sums according to their weights
+    tot += (pop(fours)<<2)
+            + (pop(twos)<<1)
+            + pop(ones)
+            + (tot8<<3);
+
+    return tot;
+  }
+
+  /** Returns the popcount or cardinality of the two sets after an intersection.
+   * Neither array is modified.
+   *
+   * @param A the words of the first set
+   * @param B the words of the second set; read at the same indexes as A
+   * @param wordOffset index of the first word to count
+   * @param numWords number of words to count, starting at wordOffset
+   * @return the number of one-bits in (A[k] &amp; B[k]) over the given word range
+   */
+  public static long pop_intersect(long A[], long B[], int wordOffset, int numWords) {
+    // generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
+    int n = wordOffset+numWords;
+    // ones/twos/fours carry partial sums of weight 1, 2 and 4 between steps;
+    // tot8 counts the weight 8 carries (see the final sum below)
+    long tot=0, tot8=0;
+    long ones=0, twos=0, fours=0;
+
+    int i;
+    // main loop: 8 words per iteration
+    for (i = wordOffset; i <= n - 8; i+=8) {
+      long twosA,twosB,foursA,foursB,eights;
+
+      // CSA(twosA, ones, ones, (A[i] & B[i]), (A[i+1] & B[i+1]))
+      {
+        long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
+        long u=ones ^ b;
+        twosA=(ones & b)|( u & c);
+        ones=u^c;
+      }
+      // CSA(twosB, ones, ones, (A[i+2] & B[i+2]), (A[i+3] & B[i+3]))
+      {
+        long b=(A[i+2] & B[i+2]), c=(A[i+3] & B[i+3]);
+        long u=ones^b;
+        twosB =(ones&b)|(u&c);
+        ones=u^c;
+      }
+      //CSA(foursA, twos, twos, twosA, twosB)
+      {
+        long u=twos^twosA;
+        foursA=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+      //CSA(twosA, ones, ones, (A[i+4] & B[i+4]), (A[i+5] & B[i+5]))
+      {
+        long b=(A[i+4] & B[i+4]), c=(A[i+5] & B[i+5]);
+        long u=ones^b;
+        twosA=(ones&b)|(u&c);
+        ones=u^c;
+      }
+      // CSA(twosB, ones, ones, (A[i+6] & B[i+6]), (A[i+7] & B[i+7]))
+      {
+        long b=(A[i+6] & B[i+6]), c=(A[i+7] & B[i+7]);
+        long u=ones^b;
+        twosB=(ones&b)|(u&c);
+        ones=u^c;
+      }
+      //CSA(foursB, twos, twos, twosA, twosB)
+      {
+        long u=twos^twosA;
+        foursB=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+
+      //CSA(eights, fours, fours, foursA, foursB)
+      {
+        long u=fours^foursA;
+        eights=(fours&foursA)|(u&foursB);
+        fours=u^foursB;
+      }
+      tot8 += pop(eights);
+    }
+
+
+    // at least 4 words left
+    if (i<=n-4) {
+      long twosA, twosB, foursA, eights;
+      {
+        long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
+        long u=ones ^ b;
+        twosA=(ones & b)|( u & c);
+        ones=u^c;
+      }
+      {
+        long b=(A[i+2] & B[i+2]), c=(A[i+3] & B[i+3]);
+        long u=ones^b;
+        twosB =(ones&b)|(u&c);
+        ones=u^c;
+      }
+      {
+        long u=twos^twosA;
+        foursA=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+      eights=fours&foursA;
+      fours=fours^foursA;
+
+      tot8 += pop(eights);
+      i+=4;
+    }
+
+    // at least 2 words left
+    if (i<=n-2) {
+      long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
+      long u=ones ^ b;
+      long twosA=(ones & b)|( u & c);
+      ones=u^c;
+
+      long foursA=twos&twosA;
+      twos=twos^twosA;
+
+      long eights=fours&foursA;
+      fours=fours^foursA;
+
+      tot8 += pop(eights);
+      i+=2;
+    }
+
+    // one last word, counted directly
+    if (i<n) {
+      tot += pop((A[i] & B[i]));
+    }
+
+    // combine the partial sums according to their weights
+    tot += (pop(fours)<<2)
+            + (pop(twos)<<1)
+            + pop(ones)
+            + (tot8<<3);
+
+    return tot;
+  }
+
+  /** Returns the popcount or cardinality of the union of two sets.
+    * Neither array is modified.
+    * <p>
+    * Words <code>wordOffset</code> (inclusive) to <code>wordOffset+numWords</code>
+    * (exclusive) of both arrays are OR-ed pairwise and the set bits of the results
+    * are counted.  Both arrays must therefore be at least
+    * <code>wordOffset+numWords</code> long.
+    *
+    * @param A first array of words
+    * @param B second array of words
+    * @param wordOffset index of the first word to examine
+    * @param numWords number of words to examine
+    * @return the number of bits set in (A[i] | B[i]) over the examined range
+    */
+   public static long pop_union(long A[], long B[], int wordOffset, int numWords) {
+     // generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \| B[\1]\)/g'
+     int n = wordOffset+numWords;
+     long tot=0, tot8=0;
+     // bit-sliced accumulators: a set bit in ones/twos/fours stands for 1/2/4
+     // counted input bits at that bit position (see the final weighting below).
+     long ones=0, twos=0, fours=0;
+
+     int i;
+     // main loop: consume 8 words per iteration
+     for (i = wordOffset; i <= n - 8; i+=8) {
+       /***  C macro from Hacker's Delight
+        #define CSA(h,l, a,b,c) \
+        {unsigned u = a ^ b; unsigned v = c; \
+        h = (a & b) | (u & v); l = u ^ v;}
+        ***/
+       // CSA is a per-bit carry-save adder: l is the sum bit and h the carry bit
+       // of the three inputs a, b and c.
+
+       long twosA,twosB,foursA,foursB,eights;
+
+       // CSA(twosA, ones, ones, (A[i] | B[i]), (A[i+1] | B[i+1]))
+       {
+         long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
+         long u=ones ^ b;
+         twosA=(ones & b)|( u & c);
+         ones=u^c;
+       }
+       // CSA(twosB, ones, ones, (A[i+2] | B[i+2]), (A[i+3] | B[i+3]))
+       {
+         long b=(A[i+2] | B[i+2]), c=(A[i+3] | B[i+3]);
+         long u=ones^b;
+         twosB =(ones&b)|(u&c);
+         ones=u^c;
+       }
+       //CSA(foursA, twos, twos, twosA, twosB)
+       {
+         long u=twos^twosA;
+         foursA=(twos&twosA)|(u&twosB);
+         twos=u^twosB;
+       }
+       //CSA(twosA, ones, ones, (A[i+4] | B[i+4]), (A[i+5] | B[i+5]))
+       {
+         long b=(A[i+4] | B[i+4]), c=(A[i+5] | B[i+5]);
+         long u=ones^b;
+         twosA=(ones&b)|(u&c);
+         ones=u^c;
+       }
+       // CSA(twosB, ones, ones, (A[i+6] | B[i+6]), (A[i+7] | B[i+7]))
+       {
+         long b=(A[i+6] | B[i+6]), c=(A[i+7] | B[i+7]);
+         long u=ones^b;
+         twosB=(ones&b)|(u&c);
+         ones=u^c;
+       }
+       //CSA(foursB, twos, twos, twosA, twosB)
+       {
+         long u=twos^twosA;
+         foursB=(twos&twosA)|(u&twosB);
+         twos=u^twosB;
+       }
+
+       //CSA(eights, fours, fours, foursA, foursB)
+       {
+         long u=fours^foursA;
+         eights=(fours&foursA)|(u&foursB);
+         fours=u^foursB;
+       }
+       // each bit in eights stands for 8 counted bits; tot8 is scaled by 8 at the end
+       tot8 += pop(eights);
+     }
+
+
+     // tail: a remaining group of 4 words
+     if (i<=n-4) {
+       long twosA, twosB, foursA, eights;
+       {
+         long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
+         long u=ones ^ b;
+         twosA=(ones & b)|( u & c);
+         ones=u^c;
+       }
+       {
+         long b=(A[i+2] | B[i+2]), c=(A[i+3] | B[i+3]);
+         long u=ones^b;
+         twosB =(ones&b)|(u&c);
+         ones=u^c;
+       }
+       {
+         long u=twos^twosA;
+         foursA=(twos&twosA)|(u&twosB);
+         twos=u^twosB;
+       }
+       // CSA with a zero third input reduces to a plain AND (carry) and XOR (sum)
+       eights=fours&foursA;
+       fours=fours^foursA;
+
+       tot8 += pop(eights);
+       i+=4;
+     }
+
+     // tail: a remaining pair of words
+     if (i<=n-2) {
+       long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
+       long u=ones ^ b;
+       long twosA=(ones & b)|( u & c);
+       ones=u^c;
+
+       long foursA=twos&twosA;
+       twos=twos^twosA;
+
+       long eights=fours&foursA;
+       fours=fours^foursA;
+
+       tot8 += pop(eights);
+       i+=2;
+     }
+
+     // tail: a single remaining word is counted directly
+     if (i<n) {
+       tot += pop((A[i] | B[i]));
+     }
+
+     // weight every accumulator by the number of input bits each of its set bits represents
+     tot += (pop(fours)<<2)
+             + (pop(twos)<<1)
+             + pop(ones)
+             + (tot8<<3);
+
+     return tot;
+   }
+
+  /** Returns the popcount or cardinality of A & ~B
+   * Neither array is modified.
+   * <p>
+   * Words <code>wordOffset</code> (inclusive) to <code>wordOffset+numWords</code>
+   * (exclusive) are examined, so both arrays must be at least
+   * <code>wordOffset+numWords</code> long.
+   *
+   * @param A array of words to count bits from
+   * @param B array of words whose set bits are excluded from the count
+   * @param wordOffset index of the first word to examine
+   * @param numWords number of words to examine
+   * @return the number of bits set in (A[i] & ~B[i]) over the examined range
+   */
+  public static long pop_andnot(long A[], long B[], int wordOffset, int numWords) {
+    // generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& ~B[\1]\)/g'
+    int n = wordOffset+numWords;
+    long tot=0, tot8=0;
+    // bit-sliced accumulators: a set bit in ones/twos/fours stands for 1/2/4
+    // counted input bits at that bit position (see the final weighting below).
+    long ones=0, twos=0, fours=0;
+
+    int i;
+    // main loop: consume 8 words per iteration
+    for (i = wordOffset; i <= n - 8; i+=8) {
+      /***  C macro from Hacker's Delight
+       #define CSA(h,l, a,b,c) \
+       {unsigned u = a ^ b; unsigned v = c; \
+       h = (a & b) | (u & v); l = u ^ v;}
+       ***/
+      // CSA is a per-bit carry-save adder: l is the sum bit and h the carry bit
+      // of the three inputs a, b and c.
+
+      long twosA,twosB,foursA,foursB,eights;
+
+      // CSA(twosA, ones, ones, (A[i] & ~B[i]), (A[i+1] & ~B[i+1]))
+      {
+        long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
+        long u=ones ^ b;
+        twosA=(ones & b)|( u & c);
+        ones=u^c;
+      }
+      // CSA(twosB, ones, ones, (A[i+2] & ~B[i+2]), (A[i+3] & ~B[i+3]))
+      {
+        long b=(A[i+2] & ~B[i+2]), c=(A[i+3] & ~B[i+3]);
+        long u=ones^b;
+        twosB =(ones&b)|(u&c);
+        ones=u^c;
+      }
+      //CSA(foursA, twos, twos, twosA, twosB)
+      {
+        long u=twos^twosA;
+        foursA=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+      //CSA(twosA, ones, ones, (A[i+4] & ~B[i+4]), (A[i+5] & ~B[i+5]))
+      {
+        long b=(A[i+4] & ~B[i+4]), c=(A[i+5] & ~B[i+5]);
+        long u=ones^b;
+        twosA=(ones&b)|(u&c);
+        ones=u^c;
+      }
+      // CSA(twosB, ones, ones, (A[i+6] & ~B[i+6]), (A[i+7] & ~B[i+7]))
+      {
+        long b=(A[i+6] & ~B[i+6]), c=(A[i+7] & ~B[i+7]);
+        long u=ones^b;
+        twosB=(ones&b)|(u&c);
+        ones=u^c;
+      }
+      //CSA(foursB, twos, twos, twosA, twosB)
+      {
+        long u=twos^twosA;
+        foursB=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+
+      //CSA(eights, fours, fours, foursA, foursB)
+      {
+        long u=fours^foursA;
+        eights=(fours&foursA)|(u&foursB);
+        fours=u^foursB;
+      }
+      // each bit in eights stands for 8 counted bits; tot8 is scaled by 8 at the end
+      tot8 += pop(eights);
+    }
+
+
+    // tail: a remaining group of 4 words
+    if (i<=n-4) {
+      long twosA, twosB, foursA, eights;
+      {
+        long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
+        long u=ones ^ b;
+        twosA=(ones & b)|( u & c);
+        ones=u^c;
+      }
+      {
+        long b=(A[i+2] & ~B[i+2]), c=(A[i+3] & ~B[i+3]);
+        long u=ones^b;
+        twosB =(ones&b)|(u&c);
+        ones=u^c;
+      }
+      {
+        long u=twos^twosA;
+        foursA=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+      // CSA with a zero third input reduces to a plain AND (carry) and XOR (sum)
+      eights=fours&foursA;
+      fours=fours^foursA;
+
+      tot8 += pop(eights);
+      i+=4;
+    }
+
+    // tail: a remaining pair of words
+    if (i<=n-2) {
+      long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
+      long u=ones ^ b;
+      long twosA=(ones & b)|( u & c);
+      ones=u^c;
+
+      long foursA=twos&twosA;
+      twos=twos^twosA;
+
+      long eights=fours&foursA;
+      fours=fours^foursA;
+
+      tot8 += pop(eights);
+      i+=2;
+    }
+
+    // tail: a single remaining word is counted directly
+    if (i<n) {
+      tot += pop((A[i] & ~B[i]));
+    }
+
+    // weight every accumulator by the number of input bits each of its set bits represents
+    tot += (pop(fours)<<2)
+            + (pop(twos)<<1)
+            + pop(ones)
+            + (tot8<<3);
+
+    return tot;
+  }
+
+  /** Returns the popcount or cardinality of A ^ B (the exclusive-or of two sets).
+   * Neither array is modified.
+   * <p>
+   * Words <code>wordOffset</code> (inclusive) to <code>wordOffset+numWords</code>
+   * (exclusive) are examined, so both arrays must be at least
+   * <code>wordOffset+numWords</code> long.
+   *
+   * @param A first array of words
+   * @param B second array of words
+   * @param wordOffset index of the first word to examine
+   * @param numWords number of words to examine
+   * @return the number of bits set in (A[i] ^ B[i]) over the examined range
+   */
+  public static long pop_xor(long A[], long B[], int wordOffset, int numWords) {
+    int n = wordOffset+numWords;
+    long tot=0, tot8=0;
+    // bit-sliced accumulators: a set bit in ones/twos/fours stands for 1/2/4
+    // counted input bits at that bit position (see the final weighting below).
+    long ones=0, twos=0, fours=0;
+
+    int i;
+    // main loop: consume 8 words per iteration
+    for (i = wordOffset; i <= n - 8; i+=8) {
+      /***  C macro from Hacker's Delight
+       #define CSA(h,l, a,b,c) \
+       {unsigned u = a ^ b; unsigned v = c; \
+       h = (a & b) | (u & v); l = u ^ v;}
+       ***/
+      // CSA is a per-bit carry-save adder: l is the sum bit and h the carry bit
+      // of the three inputs a, b and c.
+
+      long twosA,twosB,foursA,foursB,eights;
+
+      // CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i+1] ^ B[i+1]))
+      {
+        long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
+        long u=ones ^ b;
+        twosA=(ones & b)|( u & c);
+        ones=u^c;
+      }
+      // CSA(twosB, ones, ones, (A[i+2] ^ B[i+2]), (A[i+3] ^ B[i+3]))
+      {
+        long b=(A[i+2] ^ B[i+2]), c=(A[i+3] ^ B[i+3]);
+        long u=ones^b;
+        twosB =(ones&b)|(u&c);
+        ones=u^c;
+      }
+      //CSA(foursA, twos, twos, twosA, twosB)
+      {
+        long u=twos^twosA;
+        foursA=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+      //CSA(twosA, ones, ones, (A[i+4] ^ B[i+4]), (A[i+5] ^ B[i+5]))
+      {
+        long b=(A[i+4] ^ B[i+4]), c=(A[i+5] ^ B[i+5]);
+        long u=ones^b;
+        twosA=(ones&b)|(u&c);
+        ones=u^c;
+      }
+      // CSA(twosB, ones, ones, (A[i+6] ^ B[i+6]), (A[i+7] ^ B[i+7]))
+      {
+        long b=(A[i+6] ^ B[i+6]), c=(A[i+7] ^ B[i+7]);
+        long u=ones^b;
+        twosB=(ones&b)|(u&c);
+        ones=u^c;
+      }
+      //CSA(foursB, twos, twos, twosA, twosB)
+      {
+        long u=twos^twosA;
+        foursB=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+
+      //CSA(eights, fours, fours, foursA, foursB)
+      {
+        long u=fours^foursA;
+        eights=(fours&foursA)|(u&foursB);
+        fours=u^foursB;
+      }
+      // each bit in eights stands for 8 counted bits; tot8 is scaled by 8 at the end
+      tot8 += pop(eights);
+    }
+
+
+    // tail: a remaining group of 4 words
+    if (i<=n-4) {
+      long twosA, twosB, foursA, eights;
+      {
+        long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
+        long u=ones ^ b;
+        twosA=(ones & b)|( u & c);
+        ones=u^c;
+      }
+      {
+        long b=(A[i+2] ^ B[i+2]), c=(A[i+3] ^ B[i+3]);
+        long u=ones^b;
+        twosB =(ones&b)|(u&c);
+        ones=u^c;
+      }
+      {
+        long u=twos^twosA;
+        foursA=(twos&twosA)|(u&twosB);
+        twos=u^twosB;
+      }
+      // CSA with a zero third input reduces to a plain AND (carry) and XOR (sum)
+      eights=fours&foursA;
+      fours=fours^foursA;
+
+      tot8 += pop(eights);
+      i+=4;
+    }
+
+    // tail: a remaining pair of words
+    if (i<=n-2) {
+      long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
+      long u=ones ^ b;
+      long twosA=(ones & b)|( u & c);
+      ones=u^c;
+
+      long foursA=twos&twosA;
+      twos=twos^twosA;
+
+      long eights=fours&foursA;
+      fours=fours^foursA;
+
+      tot8 += pop(eights);
+      i+=2;
+    }
+
+    // tail: a single remaining word is counted directly
+    if (i<n) {
+      tot += pop((A[i] ^ B[i]));
+    }
+
+    // weight every accumulator by the number of input bits each of its set bits represents
+    tot += (pop(fours)<<2)
+            + (pop(twos)<<1)
+            + pop(ones)
+            + (tot8<<3);
+
+    return tot;
+  }
+
+  /* python code to generate ntzTable
+  def ntz(val):
+    if val==0: return 8
+    i=0
+    while (val&0x01)==0:
+      i = i+1
+      val >>= 1
+    return i
+  print ','.join([ str(ntz(i)) for i in range(256) ])
+  ***/
+  /** Table of the number of trailing zeros for each possible byte value (0-255); the entry for 0 is 8. */
+  public static final byte[] ntzTable = {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
+
+
+  /** Returns the number of trailing zero bits in the 64 bit long value.
+   * A value of 0 yields 64.
+   *
+   * @param val the value to examine
+   * @return the 0 based index of the lowest set bit of val, or 64 if no bit is set
+   */
+  public static int ntz(long val) {
+    // Design notes:
+    //  - a full binary search for the lowest non-zero byte measured slower than a
+    //    linear scan when used from nextSetBit(), most likely because nextSetBit()
+    //    shifts bits to the right, making a non-zero byte near the low end more likely.
+    //  - only the top level split (low 32 bits vs. high 32 bits) is done on the long;
+    //    everything else works on ints to stay friendly to 32 bit architectures.
+    //  - the lowest byte is tested before anything else because a non-zero lowest
+    //    byte is the most common case in dense bit arrays.
+    final int low32 = (int)val;
+    final int firstByte = low32 & 0xff;
+    if (firstByte != 0) return ntzTable[firstByte];
+
+    // choose the 32 bit half containing the lowest set bit and remember its bit offset
+    final int half;
+    final int base;
+    if (low32 != 0) {
+      half = low32;
+      base = 0;
+    } else {
+      half = (int)(val>>32);
+      base = 32;
+    }
+
+    // linear scan over the three lowest bytes of the selected half
+    for (int shift = 0; shift < 24; shift += 8) {
+      final int b = (half >>> shift) & 0xff;
+      if (b != 0) return ntzTable[b] + base + shift;
+    }
+
+    // The top byte needs no mask (the unsigned shift leaves only 8 bits) and no
+    // zero test: ntzTable[0] is 8, which produces the right answer for an all-zero half.
+    return ntzTable[half >>> 24] + base + 24;
+  }
+
+  /** Returns the 0 based index of the first (lowest) set bit.
+   * Only meaningful for x!=0.
+   * <br/> This is an alternate implementation of ntz()
+   *
+   * @param x the value to examine, expected to be non-zero
+   * @return the number of trailing zero bits in x
+   */
+  public static int ntz2(long x) {
+   int skipped = 0;          // number of low bits already known to be zero
+   int word = (int)x;
+   if (word == 0) {
+     // the only 64 bit shift necessary
+     skipped += 32;
+     word = (int)(x>>>32);
+   }
+   if ((word & 0x0000FFFF) == 0) {
+     skipped += 16;
+     word >>>= 16;
+   }
+   if ((word & 0x000000FF) == 0) {
+     skipped += 8;
+     word >>>= 8;
+   }
+   // the remaining low byte is resolved with the lookup table
+   final int lowByte = word & 0xff;
+   return ntzTable[lowByte] + skipped;
+  }
+
+  /** Returns the 0 based index of the first (lowest) set bit, computed without a
+   * lookup table.  The result is only meaningful for x!=0 (an input of 0 yields 63).
+   * <br/> This is an alternate implementation of ntz()
+   *
+   * @param x the value to examine
+   * @return the number of trailing zero bits in x
+   */
+  public static int ntz3(long x) {
+   // Binary search adapted from Hackers Delight, extended to 64 bits and
+   // converted to Java.
+   // Many 32 bit ntz algorithms are at http://www.hackersdelight.org/HDcode/ntz.cc
+
+   // starts at 1 because the final step subtracts the lowest remaining bit
+   int count = 1;
+
+   // only the first halving touches the long; the rest operates on an int
+   int word = (int)x;
+   if (word == 0) {
+     count += 32;
+     word = (int)(x>>>32);
+   }
+   if ((word & 0x0000FFFF) == 0) {
+     count += 16;
+     word >>>= 16;
+   }
+   if ((word & 0x000000FF) == 0) {
+     count += 8;
+     word >>>= 8;
+   }
+   if ((word & 0x0000000F) == 0) {
+     count += 4;
+     word >>>= 4;
+   }
+   if ((word & 0x00000003) == 0) {
+     count += 2;
+     word >>>= 2;
+   }
+   final int lowestBit = word & 1;
+   return count - lowestBit;
+  }
+
+
+  /** Tests whether at most one bit is set in v.
+   *
+   * @param v the value to test
+   * @return true if v is a power of two or zero
+   */
+  public static boolean isPowerOfTwo(int v) {
+    // v & (v-1) clears the lowest set bit; nothing remains when at most one bit was set
+    final int withoutLowestBit = v & (v - 1);
+    return withoutLowestBit == 0;
+  }
+
+  /** Tests whether at most one bit is set in v.
+   *
+   * @param v the value to test
+   * @return true if v is a power of two or zero
+   */
+  public static boolean isPowerOfTwo(long v) {
+    // v & (v-1) clears the lowest set bit; nothing remains when at most one bit was set
+    final long withoutLowestBit = v & (v - 1);
+    return withoutLowestBit == 0;
+  }
+
+  /** Rounds v up to a power of two.
+   *
+   * @param v the value to round up
+   * @return the next highest power of two, or v itself if it is already a power of two or zero
+   */
+  public static int nextHighestPowerOfTwo(int v) {
+    // step back by one so that an exact power of two maps to itself
+    v -= 1;
+    // smear the highest set bit into every lower position (shifts of 1, 2, 4, 8, 16)
+    for (int shift = 1; shift <= 16; shift <<= 1) {
+      v |= v >> shift;
+    }
+    // an all-ones suffix plus one is the next power of two
+    return v + 1;
+  }
+
+  /** Rounds v up to a power of two.
+   *
+   * @param v the value to round up
+   * @return the next highest power of two, or v itself if it is already a power of two or zero
+   */
+  public static long nextHighestPowerOfTwo(long v) {
+    // step back by one so that an exact power of two maps to itself
+    v -= 1;
+    // smear the highest set bit into every lower position (shifts of 1, 2, 4, 8, 16, 32)
+    for (int shift = 1; shift <= 32; shift <<= 1) {
+      v |= v >> shift;
+    }
+    // an all-ones suffix plus one is the next power of two
+    return v + 1;
+  }
+
+}
--- a/src/java/org/apache/solr/util/OpenBitSet.java
+++ b/src/java/org/apache/solr/util/OpenBitSet.java
@ -0,0 +1,692 @@
+package org.apache.solr.util;
+
+import java.util.Arrays;
+import java.io.Serializable;
+
+/** An "open" BitSet implementation that allows direct access to the array of words
+ * storing the bits.
+ * <p/>
+ * Unlike <code>java.util.BitSet</code>, the fact that bits are packed into an array of longs
+ * is part of the interface.  This allows efficient implementation of other algorithms
+ * by someone other than the author.  It also allows one to efficiently implement
+ * alternate serialization or interchange formats.
+ * <p/>
+ * <code>OpenBitSet</code> is faster than <code>java.util.BitSet</code> in most operations
+ * and *much* faster at calculating cardinality of sets and results of set operations.
+ * It can also handle sets of larger cardinality (up to 64 * (2**31-1), since a Java array holds at most 2**31-1 words)
+ * <p/>
+ * The goals of <code>OpenBitSet</code> are the fastest implementation possible, and
+ * maximum code reuse.  Extra safety and encapsulation
+ * may always be built on top, but if that's built in, the cost can never be removed (and
+ * hence people re-implement their own version in order to get better performance).
+ * If you want a "safe", totally encapsulated (and slower and limited) BitSet
+ * class, use <code>java.util.BitSet</code>.
+ * <p/>
+ * <h3>Performance Results</h3>
+ *
+ Test system: Pentium 4, Sun Java 1.5_06 -server -Xbatch -Xmx64M
+<br/>BitSet size = 1,000,000
+<br/>Results are java.util.BitSet time divided by OpenBitSet time.
+<table border="1">
+ <tr>
+  <th></th> <th>cardinality</th> <th>intersect_count</th> <th>union</th> <th>nextSetBit</th> <th>get</th> <th>iterator</th>
+ </tr>
+ <tr>
+  <th>50% full</th> <td>3.36</td> <td>3.96</td> <td>1.44</td> <td>1.46</td> <td>1.99</td> <td>1.58</td>
+ </tr>
+ <tr>
+   <th>1% full</th> <td>3.31</td> <td>3.90</td> <td>&nbsp;</td> <td>1.04</td> <td>&nbsp;</td> <td>0.99</td>
+ </tr>
+</table>
+<br/>
+Test system: AMD Opteron, 64 bit linux, Sun Java 1.5_06 -server -Xbatch -Xmx64M
+<br/>BitSet size = 1,000,000
+<br/>Results are java.util.BitSet time divided by OpenBitSet time.
+<table border="1">
+ <tr>
+  <th></th> <th>cardinality</th> <th>intersect_count</th> <th>union</th> <th>nextSetBit</th> <th>get</th> <th>iterator</th>
+ </tr>
+ <tr>
+  <th>50% full</th> <td>2.50</td> <td>3.50</td> <td>1.00</td> <td>1.03</td> <td>1.12</td> <td>1.25</td>
+ </tr>
+ <tr>
+   <th>1% full</th> <td>2.51</td> <td>3.49</td> <td>&nbsp;</td> <td>1.00</td> <td>&nbsp;</td> <td>1.02</td>
+ </tr>
+</table>
+
+ * @author yonik
+ * @version $Id$
+ */
+
+public class OpenBitSet implements Cloneable, Serializable {
+  protected long[] bits;
+  protected int wlen;   // number of words (elements) used in the array
+
+  /** Constructs an OpenBitSet large enough to hold numBits.
+   * Every word of the freshly allocated array is marked as in use.
+   *
+   * @param numBits the number of bits the set must be able to hold
+   */
+  public OpenBitSet(long numBits) {
+    final long[] words = new long[bits2words(numBits)];
+    this.bits = words;
+    this.wlen = words.length;
+  }
+
+  /** Constructs an OpenBitSet with an initial capacity of 64 bits. */
+  public OpenBitSet() {
+    this(64L);
+  }
+
+  /** Constructs an OpenBitSet that uses an existing long[] as its storage.
+   * The array is used directly; it is not copied.
+   * <br/>
+   * The first 64 bits are in long[0], with bit index 0 at the least significant
+   * bit and bit index 63 at the most significant bit.  For a given bit index, the
+   * word containing it is long[index/64], and the bit sits at position index%64
+   * within that word.
+   * <p>
+   * numWords is the number of elements in the array that contain
+   * set bits (non-zero longs).
+   * numWords should be &lt;= bits.length, and
+   * any existing words in the array at position &gt;= numWords should be zero.
+   *
+   * @param bits the array of words to use as the bit storage
+   * @param numWords the number of words of the array that are in use
+   */
+  public OpenBitSet(long[] bits, int numWords) {
+    this.wlen = numWords;
+    this.bits = bits;
+  }
+
+  /** Returns the current capacity in bits (1 greater than the index of the last bit).
+   *
+   * @return the number of bits the underlying array can hold, i.e. 64 * bits.length
+   */
+  public long capacity() {
+    // widen before shifting: an int shift overflows once the array has 2**25 or more words
+    return ((long)bits.length) << 6;
+  }
+
+  /**
+   * Returns the current capacity of this set, exactly as {@link #capacity()} does.
+   * Included for compatibility.  This is *not* equal to {@link #cardinality()}.
+   *
+   * @return the capacity in bits
+   */
+  public long size() {
+    final long capacityInBits = capacity();
+    return capacityInBits;
+  }
+
+  /** Returns true if there are no set bits.
+   * Implemented by counting every set bit via {@link #cardinality()}.
+   */
+  public boolean isEmpty() {
+    final long setBits = cardinality();
+    return setBits == 0;
+  }
+
+  /** Expert: returns the long[] storing the bits.
+   * The internal array itself is returned, not a copy.
+   */
+  public long[] getBits() {
+    return this.bits;
+  }
+
+  /** Expert: sets a new long[] to use as the bit storage.
+   * The number of words in use is not changed by this call.
+   *
+   * @param bits the array to use from now on
+   */
+  public void setBits(long[] bits) {
+    this.bits = bits;
+  }
+
+  /** Expert: gets the number of longs in the array that are in use.
+   *
+   * @return the number of words in use
+   */
+  public int getNumWords() {
+    return this.wlen;
+  }
+
+  /** Expert: sets the number of longs in the array that are in use.
+   *
+   * @param nWords the new number of words in use
+   */
+  public void setNumWords(int nWords) {
+    this.wlen = nWords;
+  }
+
+
+
+  /** Returns true or false for the specified bit index.
+   * An index beyond the end of the underlying array yields false.
+   *
+   * @param index the bit to test; must not be negative
+   * @return true if the bit is set
+   */
+  public boolean get(int index) {
+    // The signed shift keeps a negative index negative, so the array access below
+    // throws an array-index-out-of-bounds exception without an explicit check.
+    final int wordNum = index >> 6;              // div 64
+    if (wordNum >= bits.length) {
+      return false;
+    }
+    final long mask = 1L << (index & 0x3f);      // mod 64
+    return (bits[wordNum] & mask) != 0;
+  }
+
+
+  /** Returns true or false for the specified bit index, without a bounds check.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to test
+   * @return true if the bit is set
+   */
+  public boolean fastGet(int index) {
+    // The signed shift keeps a negative index negative, so the array access below
+    // throws an array-index-out-of-bounds exception without an explicit check.
+    final int wordNum = index >> 6;              // div 64
+    final long mask = 1L << (index & 0x3f);      // mod 64
+    return (bits[wordNum] & mask) != 0;
+  }
+
+
+
+  /** Returns true or false for the specified bit index.  Allows specifying
+   * an index outside the current size: a word index at or beyond the end of the
+   * underlying array yields false.
+   *
+   * @param index the bit to test
+   * @return true if the bit is set
+   */
+  public boolean get(long index) {
+    final int wordNum = (int)(index >> 6);            // div 64
+    if (wordNum >= bits.length) {
+      return false;
+    }
+    final long mask = 1L << ((int)index & 0x3f);      // mod 64
+    return (bits[wordNum] & mask) != 0;
+  }
+
+  /** Returns true or false for the specified bit index, without a bounds check.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to test
+   * @return true if the bit is set
+   */
+  public boolean fastGet(long index) {
+    final int wordNum = (int)(index >> 6);            // div 64
+    final long mask = 1L << ((int)index & 0x3f);      // mod 64
+    return (bits[wordNum] & mask) != 0;
+  }
+
+  /*
+  // alternate implementation of get()
+  public boolean get1(int index) {
+    int i = index >> 6;                // div 64
+    int bit = index & 0x3f;            // mod 64
+    return ((bits[i]>>>bit) & 0x01) != 0;
+    // this does a long shift and a bittest (on x86) vs
+    // a long shift, and a long AND, (the test for zero is prob a no-op)
+    // testing on a P4 indicates this is slower than (bits[i] & bitmask) != 0;
+  }
+  */
+
+
+  /** Returns 1 if the bit is set, 0 if not.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to read
+   * @return the value of the bit as an int (0 or 1)
+   */
+  public int getBit(int index) {
+    final long word = bits[index >> 6];          // div 64 selects the word
+    final int offset = index & 0x3f;             // mod 64 selects the bit
+    // move the requested bit into the lowest position and isolate it
+    return ((int)(word >>> offset)) & 0x01;
+  }
+
+
+  /*
+  public boolean get2(int index) {
+    int word = index >> 6;            // div 64
+    int bit = index & 0x0000003f;     // mod 64
+    return (bits[word] << bit) < 0;   // hmmm, this would work if bit order were reversed
+    // we could right shift and check for parity bit, if it was available to us.
+  }
+  */
+
+  /** Sets a bit, expanding the set size if necessary.
+   *
+   * @param index the bit to set
+   */
+  public void set(long index) {
+    // may grow the storage and the number of words in use
+    final int wordNum = expandingWordNum(index);
+    final long mask = 1L << ((int)index & 0x3f);
+    bits[wordNum] = bits[wordNum] | mask;
+  }
+
+
+  /** Sets the bit at the specified index, without a bounds check and without
+   * growing the set.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to set
+   */
+  public void fastSet(int index) {
+    final int word = index >> 6;                 // div 64
+    final long mask = 1L << (index & 0x3f);      // mod 64
+    bits[word] = bits[word] | mask;
+  }
+
+  /** Sets the bit at the specified index, without a bounds check and without
+   * growing the set.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to set
+   */
+  public void fastSet(long index) {
+    final int word = (int)(index >> 6);               // div 64
+    final long mask = 1L << ((int)index & 0x3f);      // mod 64
+    bits[word] = bits[word] | mask;
+  }
+
+  /** Returns the number of the word holding the given bit index.  If that word
+   * lies at or beyond the words currently in use, ensureCapacity(index+1) is called
+   * and the number of words in use is raised to include it.
+   *
+   * @param index the bit index to locate
+   * @return the index into the word array for that bit
+   */
+  protected int expandingWordNum(long index) {
+    final int word = (int)(index >> 6);
+    if (word < wlen) {
+      return word;
+    }
+    ensureCapacity(index+1);
+    wlen = word + 1;
+    return word;
+  }
+
+
+  /** Clears a bit, without a bounds check.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to clear
+   */
+  public void fastClear(int index) {
+    final int word = index >> 6;
+    final long mask = 1L << (index & 0x03f);
+    bits[word] = bits[word] & ~mask;
+    // Note: clearing costs one more instruction than setting (the mask must be
+    // inverted).  Possible workarounds that were considered:
+    //  - with only 63 bits per word, a right shift of 10111111...111 in binary
+    //    could position the 0 in the correct place (using sign extension).
+    //  - Long.rotateRight() or rotateLeft() could be used *if* they were converted
+    //    by the JVM into a native instruction.
+    // bits[word] &= Long.rotateLeft(0xfffffffe,bit);
+  }
+
+  /** Clears a bit, without a bounds check.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to clear
+   */
+  public void fastClear(long index) {
+    final int word = (int)(index >> 6);               // div 64
+    final long mask = 1L << ((int)index & 0x3f);      // mod 64
+    bits[word] = bits[word] & ~mask;
+  }
+
+  /** Clears a bit, allowing access beyond the current set size.
+   * A bit located at or beyond the words in use is left alone and the call does nothing.
+   *
+   * @param index the bit to clear
+   */
+  public void clear(long index) {
+    final int word = (int)(index >> 6);                 // div 64
+    if (word < wlen) {
+      final long mask = 1L << ((int)index & 0x3f);      // mod 64
+      bits[word] = bits[word] & ~mask;
+    }
+  }
+
+  /** Sets a bit and returns the previous value.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to set
+   * @return true if the bit was already set before this call
+   */
+  public boolean getAndSet(int index) {
+    final int word = index >> 6;                 // div 64
+    final long mask = 1L << (index & 0x3f);      // mod 64
+    final long before = bits[word];
+    bits[word] = before | mask;
+    return (before & mask) != 0;
+  }
+
+  /** Sets a bit and returns the previous value.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to set
+   * @return true if the bit was already set before this call
+   */
+  public boolean getAndSet(long index) {
+    final int word = (int)(index >> 6);               // div 64
+    final long mask = 1L << ((int)index & 0x3f);      // mod 64
+    final long before = bits[word];
+    bits[word] = before | mask;
+    return (before & mask) != 0;
+  }
+
+  /** Flips a bit, without a bounds check.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to flip
+   */
+  public void fastFlip(int index) {
+    final int word = index >> 6;                 // div 64
+    final long mask = 1L << (index & 0x3f);      // mod 64
+    bits[word] = bits[word] ^ mask;
+  }
+
+  /** Flips a bit, without a bounds check.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to flip
+   */
+  public void fastFlip(long index) {
+    final int word = (int)(index >> 6);               // div 64
+    final long mask = 1L << ((int)index & 0x3f);      // mod 64
+    bits[word] = bits[word] ^ mask;
+  }
+
+  /** Flips a bit, expanding the set size if necessary.
+   *
+   * @param index the bit to flip
+   */
+  public void flip(long index) {
+    // may grow the storage and the number of words in use
+    final int word = expandingWordNum(index);
+    final long mask = 1L << ((int)index & 0x3f);      // mod 64
+    bits[word] = bits[word] ^ mask;
+  }
+
+  /** Flips a bit and returns the resulting bit value.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to flip
+   * @return true if the bit is set after the flip
+   */
+  public boolean flipAndGet(int index) {
+    final int word = index >> 6;                 // div 64
+    final long mask = 1L << (index & 0x3f);      // mod 64
+    final long flipped = bits[word] ^ mask;
+    bits[word] = flipped;
+    return (flipped & mask) != 0;
+  }
+
+  /** Flips a bit and returns the resulting bit value.
+   * The index should be less than the OpenBitSet size.
+   *
+   * @param index the bit to flip
+   * @return true if the bit is set after the flip
+   */
+  public boolean flipAndGet(long index) {
+    final int word = (int)(index >> 6);               // div 64
+    final long mask = 1L << ((int)index & 0x3f);      // mod 64
+    final long flipped = bits[word] ^ mask;
+    bits[word] = flipped;
+    return (flipped & mask) != 0;
+  }
+
+  /** Flips a range of bits, expanding the set size if necessary.
+   * <p>
+   * Relies on the invariant that all words at or beyond the number of words in
+   * use are zero, so that newly exposed words flip to all ones.
+   *
+   * @param startIndex lower index
+   * @param endIndex one-past the last bit to flip
+   */
+  public void flip(long startIndex, long endIndex) {
+    if (endIndex <= startIndex) return;
+
+    int startWord = (int)(startIndex>>6);
+
+    // endIndex is exclusive, so the last word that changes is the one holding
+    // bit endIndex-1.  expandingWordNum() grows the storage and wlen as needed.
+    int endWord = expandingWordNum(endIndex-1);
+
+    // Java only uses the low 6 bits of the shift count for a long, so
+    //   -1L << startIndex   keeps bits (startIndex%64)..63   example: 11111...111000
+    //   -1L >>> -endIndex   keeps bits 0..((endIndex-1)%64)  example: 00111...111111
+    // (-endIndex wraps to 64-(endIndex%64); a multiple of 64 gives a shift of 0,
+    // i.e. the complete last word.)
+    // The previous code tested (endIndex&0x3c) instead of (endIndex&0x3f) and so
+    // produced an empty end mask for end offsets 1, 2 and 3: flip(0,1) did nothing.
+    long startmask = -1L << startIndex;
+    long endmask = -1L >>> -endIndex;
+
+    if (startWord == endWord) {
+      bits[startWord] ^= (startmask & endmask);
+      return;
+    }
+
+    bits[startWord] ^= startmask;
+
+    // Whole words strictly between the first and the last one.  Words that were
+    // beyond the old length are zero and therefore become all ones.  The previous
+    // code filled them starting at the old length, which overwrote the start word
+    // (and untouched words before it) when the range began past the old length.
+    for (int i=startWord+1; i<endWord; i++) {
+      bits[i] = ~bits[i];
+    }
+
+    bits[endWord] ^= endmask;
+  }
+
+
+  /*
+  public static int pop(long v0, long v1, long v2, long v3) {
+    // derived from pop_array by setting last four elems to 0.
+    // exchanges one pop() call for 10 elementary operations
+    // saving about 7 instructions... is there a better way?
+      long twosA=v0 & v1;
+      long ones=v0^v1;
+
+      long u2=ones^v2;
+      long twosB =(ones&v2)|(u2&v3);
+      ones=u2^v3;
+
+      long fours=(twosA&twosB);
+      long twos=twosA^twosB;
+
+      return (pop(fours)<<2)
+             + (pop(twos)<<1)
+             + pop(ones);
+
+  }
+  */
+
+
+  /** Counts the set bits in the words that are in use.
+   *
+   * @return the number of set bits
+   */
+  public long cardinality() {
+    final long setBits = BitUtil.pop_array(this.bits, 0, this.wlen);
+    return setBits;
+  }
+
+  /** Returns the popcount or cardinality of the intersection of the two sets.
+   * Neither set is modified.
+   *
+   * @param a the first set
+   * @param b the second set
+   * @return the number of bits set in both a and b
+   */
+  public static long intersectionCount(OpenBitSet a, OpenBitSet b) {
+    // words beyond the shorter set cannot contribute to an intersection
+    final int commonWords = Math.min(a.wlen, b.wlen);
+    return BitUtil.pop_intersect(a.bits, b.bits, 0, commonWords);
+  }
+
+  /** Returns the popcount or cardinality of the union of the two sets.
+   * Neither set is modified.
+   *
+   * @param a the first set
+   * @param b the second set
+   * @return the number of bits set in a or b (or both)
+   */
+  public static long unionCount(OpenBitSet a, OpenBitSet b) {
+    final int commonWords = Math.min(a.wlen, b.wlen);
+    long count = BitUtil.pop_union(a.bits, b.bits, 0, commonWords);
+
+    // the words that only the longer set has are counted as they are
+    final OpenBitSet longer = (a.wlen < b.wlen) ? b : a;
+    final int extraWords = longer.wlen - commonWords;
+    if (extraWords > 0) {
+      count += BitUtil.pop_array(longer.bits, commonWords, extraWords);
+    }
+    return count;
+  }
+
+  /** Returns the popcount or cardinality of "a and not b"
+   * or "intersection(a, not(b))".
+   * Neither set is modified.
+   *
+   * @param a the set to count bits from
+   * @param b the set whose bits are excluded
+   * @return the number of bits set in a but not in b
+   */
+  public static long andNotCount(OpenBitSet a, OpenBitSet b) {
+    final int commonWords = Math.min(a.wlen, b.wlen);
+    long count = BitUtil.pop_andnot(a.bits, b.bits, 0, commonWords);
+
+    // words of a that lie beyond b have nothing removed from them
+    final int extraWords = a.wlen - b.wlen;
+    if (extraWords > 0) {
+      count += BitUtil.pop_array(a.bits, b.wlen, extraWords);
+    }
+    return count;
+  }
+
+  /** Returns the popcount or cardinality of the exclusive-or of the two sets.
+   * Neither set is modified.
+   *
+   * @param a the first set
+   * @param b the second set
+   * @return the number of bits set in exactly one of a and b
+   */
+  public static long xorCount(OpenBitSet a, OpenBitSet b) {
+    final int commonWords = Math.min(a.wlen, b.wlen);
+    long count = BitUtil.pop_xor(a.bits, b.bits, 0, commonWords);
+
+    // the words that only the longer set has are counted as they are
+    final OpenBitSet longer = (a.wlen < b.wlen) ? b : a;
+    final int extraWords = longer.wlen - commonWords;
+    if (extraWords > 0) {
+      count += BitUtil.pop_array(longer.bits, commonWords, extraWords);
+    }
+    return count;
+  }
+
+
+  /** Returns the index of the first set bit starting at the index specified.
+   *  -1 is returned if there are no more set bits.
+   *
+   * @param index the first bit position to examine (inclusive)
+   * @return the index of the next set bit at or after index, or -1
+   */
+  public int nextSetBit(int index) {
+    int wordNum = index >> 6;
+    if (wordNum >= wlen) {
+      return -1;
+    }
+
+    // look at the rest of the starting word first
+    final int offset = index & 0x3f;             // index within the word
+    final long rest = bits[wordNum] >> offset;   // skip all the bits to the right of index
+    if (rest != 0) {
+      return (wordNum << 6) + offset + BitUtil.ntz(rest);
+    }
+
+    // then scan the following words for the first non-zero one
+    for (wordNum++; wordNum < wlen; wordNum++) {
+      final long word = bits[wordNum];
+      if (word != 0) {
+        return (wordNum << 6) + BitUtil.ntz(word);
+      }
+    }
+
+    return -1;
+  }
+
+  /** Returns the index of the first set bit starting at the index specified.
+   *  -1 is returned if there are no more set bits.
+   *
+   * @param index the first bit position to examine (inclusive)
+   * @return the index of the next set bit at or after index, or -1
+   */
+  public long nextSetBit(long index) {
+    int wordNum = (int)(index >>> 6);
+    if (wordNum >= wlen) {
+      return -1;
+    }
+
+    // look at the rest of the starting word first
+    final int offset = (int)index & 0x3f;         // index within the word
+    final long rest = bits[wordNum] >>> offset;   // skip all the bits to the right of index
+    if (rest != 0) {
+      return (((long)wordNum) << 6) + (offset + BitUtil.ntz(rest));
+    }
+
+    // then scan the following words for the first non-zero one
+    for (wordNum++; wordNum < wlen; wordNum++) {
+      final long word = bits[wordNum];
+      if (word != 0) {
+        return (((long)wordNum) << 6) + BitUtil.ntz(word);
+      }
+    }
+
+    return -1;
+  }
+
+
+
+
+  /** Returns a copy of this set that has its own copy of the word array.
+   *
+   * @return the cloned OpenBitSet
+   */
+  public Object clone() {
+    final OpenBitSet copy;
+    try {
+      copy = (OpenBitSet)super.clone();
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
+    // super.clone() shares the array; give the copy its own
+    // (hopefully an array clone is as fast(er) than arraycopy)
+    copy.bits = copy.bits.clone();
+    return copy;
+  }
+
+  /** this = this AND other.
+   * The number of words in use shrinks to that of the shorter set, and the words
+   * that drop out of use are zeroed.
+   *
+   * @param other the set to intersect with; it is not modified
+   */
+  public void intersect(OpenBitSet other) {
+    final int common = Math.min(this.wlen, other.wlen);
+    final long[] mine = this.bits;
+    final long[] theirs = other.bits;
+    // counting down: testing against zero can be more efficient
+    for (int pos = common - 1; pos >= 0; pos--) {
+      mine[pos] &= theirs[pos];
+    }
+    if (this.wlen > common) {
+      // fill zeros from the new shorter length to the old length
+      Arrays.fill(bits, common, this.wlen, 0);
+    }
+    this.wlen = common;
+  }
+
+  /**
+   * In-place union: this = this OR other.
+   * The backing array is grown if needed and the number of words in use
+   * becomes the larger of the two sets.
+   *
+   * @param other the set to merge in; it is not modified
+   */
+  public void union(OpenBitSet other) {
+    final int shared = Math.min(wlen, other.wlen);
+    final int total = Math.max(wlen, other.wlen);
+    ensureCapacityWords(total);
+
+    final long[] mine = this.bits;  // fetched after the possible reallocation
+    final long[] theirs = other.bits;
+    for (int w = shared - 1; w >= 0; w--) {
+      mine[w] |= theirs[w];
+    }
+
+    // other is longer: its extra words are taken over unchanged.
+    if (total > this.wlen) {
+      System.arraycopy(theirs, this.wlen, mine, this.wlen, total - this.wlen);
+    }
+    this.wlen = total;
+  }
+
+
+  /**
+   * Clears every bit of this set that is set in other:
+   * this = this AND_NOT other. Only the words both sets have in use are
+   * touched; the number of words in use is left as it was.
+   *
+   * @param other the set whose elements are removed; it is not modified
+   */
+  public void remove(OpenBitSet other) {
+    final long[] mine = this.bits;
+    final long[] theirs = other.bits;
+    for (int w = Math.min(wlen, other.wlen) - 1; w >= 0; w--) {
+      mine[w] &= ~theirs[w];
+    }
+  }
+
+  /**
+   * In-place symmetric difference: this = this XOR other.
+   * The backing array is grown if needed and the number of words in use
+   * becomes the larger of the two sets.
+   *
+   * @param other the set to combine with; it is not modified
+   */
+  public void xor(OpenBitSet other) {
+    final int shared = Math.min(wlen, other.wlen);
+    final int total = Math.max(wlen, other.wlen);
+    ensureCapacityWords(total);
+
+    final long[] mine = this.bits;  // fetched after the possible reallocation
+    final long[] theirs = other.bits;
+    for (int w = shared - 1; w >= 0; w--) {
+      mine[w] ^= theirs[w];
+    }
+
+    // other is longer: its extra words are copied over as they are.
+    if (total > this.wlen) {
+      System.arraycopy(theirs, this.wlen, mine, this.wlen, total - this.wlen);
+    }
+    this.wlen = total;
+  }
+
+
+  // some BitSet compatibility methods
+
+  /**
+   * BitSet-style name for {@link #intersect(OpenBitSet)}: this = this AND other.
+   *
+   * @param other the set to intersect with
+   */
+  public void and(OpenBitSet other) {
+    intersect(other);
+  }
+
+  /**
+   * BitSet-style name for {@link #union(OpenBitSet)}: this = this OR other.
+   *
+   * @param other the set to merge in
+   */
+  public void or(OpenBitSet other) {
+    union(other);
+  }
+
+  /**
+   * BitSet-style name for {@link #remove(OpenBitSet)}: this = this AND_NOT other.
+   *
+   * @param other the set whose elements are removed from this set
+   */
+  public void andNot(OpenBitSet other) {
+    remove(other);
+  }
+
+  /**
+   * Tests whether the two sets share at least one element. Neither set is
+   * modified.
+   *
+   * @param other the set to compare against
+   * @return true if some bit is set in both this set and other
+   */
+  public boolean intersects(OpenBitSet other) {
+    final long[] mine = this.bits;
+    final long[] theirs = other.bits;
+    // Only the words both sets have in use can overlap.
+    for (int w = Math.min(this.wlen, other.wlen) - 1; w >= 0; w--) {
+      if ((mine[w] & theirs[w]) != 0) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+
+
+  /**
+   * Grows the backing long[] so that it holds at least numWords words
+   * (64 bit longs). Does nothing when the array is already that large.
+   * The number of words in use is unchanged by this call; only those words
+   * are carried over into a new array.
+   *
+   * @param numWords required minimum length of the backing array, in words
+   */
+  public void ensureCapacityWords(int numWords) {
+    if (numWords <= bits.length) {
+      return;
+    }
+    final long[] grown = new long[numWords];
+    System.arraycopy(bits, 0, grown, 0, wlen);
+    bits = grown;
+  }
+
+  /**
+   * Makes sure the backing long[] can hold numBits bits, growing it if
+   * necessary. The number of words in use is unchanged by this call.
+   *
+   * @param numBits number of bits the array must be able to hold
+   */
+  public void ensureCapacity(long numBits) {
+    final int wordsNeeded = bits2words(numBits);
+    ensureCapacityWords(wordsNeeded);
+  }
+
+  /**
+   * Shrinks the number of words in use by dropping all-zero words from the
+   * high end. The backing array itself is neither reallocated nor modified.
+   */
+  public void trimTrailingZeros() {
+    int used = wlen;
+    while (used > 0 && bits[used - 1] == 0) {
+      used--;
+    }
+    wlen = used;
+  }
+
+  /**
+   * Computes how many 64 bit words are needed to hold numBits bits.
+   *
+   * @param numBits number of bits to store
+   * @return the word count, rounded up; 0 for numBits == 0 (the unsigned
+   *         shift of -1 plus one yields 2^58, which narrows to 0 as an int)
+   */
+  public static int bits2words(long numBits) {
+    final long lastBit = numBits - 1;
+    return (int)((lastBit >>> 6) + 1);
+  }
+
+
+  /**
+   * Compares this set with another object for equality.
+   * Two OpenBitSets are equal when exactly the same bits are set; the number
+   * of words each has in use does not matter as long as the surplus words of
+   * the longer set are all zero.
+   *
+   * @param o the object to compare with; may be null
+   * @return true if o is an OpenBitSet with the same bits set, false otherwise
+   *         (including when o is null or of another type)
+   */
+  public boolean equals(Object o) {
+    if (this == o) return true;
+    // The type test must be on the argument: testing "this" is always true and
+    // would let the cast below throw for foreign types (or fail on null).
+    if (!(o instanceof OpenBitSet)) return false;
+    OpenBitSet a;
+    OpenBitSet b = (OpenBitSet)o;
+    // make a the larger set.
+    if (b.wlen > this.wlen) {
+      a = b; b=this;
+    } else {
+      a=this;
+    }
+
+    // check for any set bits out of the range of b
+    for (int i=a.wlen-1; i>=b.wlen; i--) {
+      if (a.bits[i]!=0) return false;
+    }
+
+    // the shared range must match word for word
+    for (int i=b.wlen-1; i>=0; i--) {
+      if (a.bits[i] != b.bits[i]) return false;
+    }
+
+    return true;
+  }
+
+
+  /**
+   * Computes a hash code that is consistent with {@code equals}.
+   * Only the words in use (index below wlen) contribute, and the accumulator
+   * starts at zero so that high-order zero words leave it untouched. Two sets
+   * that differ only in how many trailing zero words they carry therefore get
+   * the same hash, as equals treats them as equal.
+   *
+   * @return the hash code of this set
+   */
+  public int hashCode() {
+    long h = 0;
+    for (int i = wlen; --i>=0;) {
+      h ^= bits[i];
+      h = (h << 1) | (h >>> 63); // rotate left by one across all 64 bits
+    }
+    // fold leftmost bits into right and add a constant so an empty set
+    // does not hash to zero
+    return (int)((h>>32) ^ h) + 0x98761234;
+  }
+
+}
+
+
--- a/src/test/org/apache/solr/search/TestDocSet.java
+++ b/src/test/org/apache/solr/search/TestDocSet.java
@ -0,0 +1,88 @@
+package org.apache.solr.search;
+
+import junit.framework.TestCase;
+
+import java.util.Random;
+
+import org.apache.solr.util.OpenBitSet;
+import org.apache.solr.util.BitSetIterator;
+
+/**
+ * Randomized test comparing DocSet implementations (HashDocSet and BitDocSet)
+ * against plain OpenBitSet operations.
+ *
+ * @author yonik
+ * @version $Id$
+ */
+public class TestDocSet extends TestCase {
+  Random rand = new Random();
+
+  /**
+   * Builds an OpenBitSet created for sz bits and performs bitsToSet random
+   * fastSet calls on it (the same bit may be picked more than once).
+   */
+  public OpenBitSet getRandomSet(int sz, int bitsToSet) {
+    OpenBitSet result = new OpenBitSet(sz);
+    if (sz != 0) {
+      for (int remaining = bitsToSet; remaining > 0; remaining--) {
+        result.fastSet(rand.nextInt(sz));
+      }
+    }
+    return result;
+  }
+
+  /** Copies the set bits of bs into an int[] and wraps it in a HashDocSet. */
+  public DocSet getHashDocSet(OpenBitSet bs) {
+    final int[] ids = new int[(int)bs.cardinality()];
+    final BitSetIterator source = new BitSetIterator(bs);
+    int filled = 0;
+    while (filled < ids.length) {
+      ids[filled] = source.next();
+      filled++;
+    }
+    return new HashDocSet(ids,0,ids.length);
+  }
+
+  /** Wraps bs in a BitDocSet. */
+  public DocSet getBitDocSet(OpenBitSet bs) {
+    return new BitDocSet(bs);
+  }
+
+  /** Picks one of the two DocSet implementations at random for bs. */
+  public DocSet getDocSet(OpenBitSet bs) {
+    if (rand.nextInt(2)==0) {
+      return getHashDocSet(bs);
+    }
+    return getBitDocSet(bs);
+  }
+
+  /** Asserts that set reports the same membership as bs for every index below bs.capacity(). */
+  public void checkEqual(OpenBitSet bs, DocSet set) {
+    int doc = 0;
+    while (doc < bs.capacity()) {
+      assertEquals(bs.get(doc), set.exists(doc));
+      doc++;
+    }
+  }
+
+  /**
+   * Runs one round: two random bit sets, each wrapped in a randomly chosen
+   * DocSet, are checked for size, membership, intersection and union.
+   */
+  protected void doSingle(int maxSize) {
+    final int sz = rand.nextInt(maxSize+1);
+    final int sz2 = rand.nextInt(maxSize);
+
+    final int firstCount = rand.nextInt(sz+1);
+    OpenBitSet a1 = getRandomSet(sz, firstCount);
+    final int secondCount = rand.nextInt(sz2+1);
+    OpenBitSet a2 = getRandomSet(sz, secondCount);
+
+    DocSet b1 = getDocSet(a1);
+    DocSet b2 = getDocSet(a2);
+
+    // sizes and membership of the wrapped sets
+    assertEquals((int)a1.cardinality(), b1.size());
+    assertEquals((int)a2.cardinality(), b2.size());
+    checkEqual(a1,b1);
+    checkEqual(a2,b2);
+
+    // reference results computed on the raw bit sets
+    // (xor and andNot are not compared here)
+    OpenBitSet a_and = (OpenBitSet)a1.clone();
+    a_and.and(a2);
+    OpenBitSet a_or = (OpenBitSet)a1.clone();
+    a_or.or(a2);
+
+    checkEqual(a_and, b1.intersection(b2));
+    checkEqual(a_or, b1.union(b2));
+
+    assertEquals(a_and.cardinality(), b1.intersectionSize(b2));
+    assertEquals(a_or.cardinality(), b1.unionSize(b2));
+  }
+
+
+  /** Repeats doSingle(maxSz) iter times. */
+  public void doMany(int maxSz, int iter) {
+    int round = 0;
+    while (round < iter) {
+      doSingle(maxSz);
+      round++;
+    }
+  }
+
+  public void testRandomDocSets() {
+    doMany(300, 5000);
+  }
+
+}
--- a/src/test/org/apache/solr/util/BitSetPerf.java
+++ b/src/test/org/apache/solr/util/BitSetPerf.java
@ -0,0 +1,177 @@
+package org.apache.solr.util;
+
+import java.util.Random;
+import java.util.BitSet;
+
+/** Performance tester for OpenBitSet.
+ * Use -Xbatch for more predictable results, and run tests such that the duration
+ * is at least 10 seconds for better accuracy.  Close browsers on your system (javascript
+ * or flash may be running and cause more erratic results).
+ *
+ * @author yonik
+ * @version $Id$
+ */
+public class BitSetPerf {
+  static Random rand = new Random(0);
+
+  static void randomSets(int maxSize, int bitsToSet, BitSet target1, OpenBitSet target2) {
+    for (int i=0; i<bitsToSet; i++) {
+      int idx;
+      do {
+        idx = rand.nextInt(maxSize);
+      } while (target2.getAndSet(idx));
+      target1.set(idx);
+    }
+    /***
+     int i=target1.cardinality();
+     if (i!=bitsToSet || i!=target2.cardinality()) throw new RuntimeException();
+     ***/
+  }
+
+  public static void main(String[] args) {
+    if (args.length<5) {
+      System.out.println("BitSetTest <bitSetSize> <numSets> <numBitsSet> <testName> <iter> <impl>");
+      System.out.println("  impl => open for OpenBitSet");
+    }
+    int bitSetSize = Integer.parseInt(args[0]);
+    int numSets = Integer.parseInt(args[1]);
+    int numBitsSet = Integer.parseInt(args[2]);
+    String test = args[3];
+    int iter = Integer.parseInt(args[4]);
+    String impl = args.length>5 ? args[5].intern() : "bit";
+
+    BitSet[] sets = new BitSet[numSets];
+    OpenBitSet[] osets = new OpenBitSet[numSets];
+
+    for (int i=0; i<numSets; i++) {
+      sets[i] = new BitSet(bitSetSize);
+      osets[i] = new OpenBitSet(bitSetSize);
+      randomSets(bitSetSize, numBitsSet, sets[i], osets[i]);
+    }
+
+    BitSet bs = new BitSet(bitSetSize);
+    OpenBitSet obs = new OpenBitSet(bitSetSize);
+    randomSets(bitSetSize, numBitsSet, bs, obs);
+
+
+
+    int ret=0;
+
+    long start = System.currentTimeMillis();
+
+    if ("union".equals(test)) {
+      for (int it=0; it<iter; it++) {
+        for (int i=0; i<numSets; i++) {
+          if (impl=="open") {
+            OpenBitSet other=osets[i];
+            obs.union(other);
+          } else {
+            BitSet other=sets[i];
+            bs.or(other);
+          }
+        }
+      }
+    }
+
+    if ("cardinality".equals(test)) {
+      for (int it=0; it<iter; it++) {
+        for (int i=0; i<numSets; i++) {
+          if (impl=="open") {
+            ret += osets[i].cardinality();
+          } else {
+            ret += sets[i].cardinality();
+          }
+        }
+      }
+    }
+
+    if ("get".equals(test)) {
+      for (int it=0; it<iter; it++) {
+        for (int i=0; i<numSets; i++) {
+          if (impl=="open") {
+            OpenBitSet oset = osets[i];
+            for (int k=0; k<bitSetSize; k++) if (oset.fastGet(k)) ret++;
+          } else {
+            BitSet bset = sets[i];
+            for (int k=0; k<bitSetSize; k++) if (bset.get(k)) ret++;
+          }
+        }
+      }
+    }
+
+    if ("icount".equals(test)) {
+      for (int it=0; it<iter; it++) {
+        for (int i=0; i<numSets-1; i++) {
+          if (impl=="open") {
+            OpenBitSet a=osets[i];
+            OpenBitSet b=osets[i+1];
+            ret += OpenBitSet.intersectionCount(a,b);
+          } else {
+            BitSet a=sets[i];
+            BitSet b=sets[i+1];
+            BitSet newset = (BitSet)a.clone();
+            newset.and(b);
+            ret += newset.cardinality();
+          }
+        }
+      }
+    }
+
+    if ("clone".equals(test)) {
+      for (int it=0; it<iter; it++) {
+        for (int i=0; i<numSets; i++) {
+          if (impl=="open") {
+            osets[i] = (OpenBitSet)osets[i].clone();
+          } else {
+            sets[i] = (BitSet)sets[i].clone();
+          }
+        }
+      }
+    }
+
+    if ("nextSetBit".equals(test)) {
+      for (int it=0; it<iter; it++) {
+        for (int i=0; i<numSets; i++) {
+          if (impl=="open") {
+            final OpenBitSet set = osets[i];
+            for(int next=set.nextSetBit(0); next>=0; next=set.nextSetBit(next+1)) {
+              ret += next;
+            }
+          } else {
+            final BitSet set = sets[i];
+            for(int next=set.nextSetBit(0); next>=0; next=set.nextSetBit(next+1)) {
+              ret += next;
+            }
+          }
+        }
+      }
+    }
+
+
+    if ("iterator".equals(test)) {
+      for (int it=0; it<iter; it++) {
+        for (int i=0; i<numSets; i++) {
+          if (impl=="open") {
+            final OpenBitSet set = osets[i];
+            final BitSetIterator iterator = new BitSetIterator(set);
+            for(int next=iterator.next(); next>=0; next=iterator.next()) {
+              ret += next;
+            }
+          } else {
+            final BitSet set = sets[i];
+            for(int next=set.nextSetBit(0); next>=0; next=set.nextSetBit(next+1)) {
+              ret += next;
+            }
+          }
+        }
+      }
+    }
+
+    long end = System.currentTimeMillis();
+    System.out.println("ret="+ret);
+    System.out.println("TIME="+(end-start));
+
+  }
+
+
+}
--- a/src/test/org/apache/solr/util/TestOpenBitSet.java
+++ b/src/test/org/apache/solr/util/TestOpenBitSet.java
@ -0,0 +1,143 @@
+package org.apache.solr.util;
+
+import junit.framework.TestCase;
+
+import java.util.Random;
+import java.util.BitSet;
+
+/**
+ * @author yonik
+ * @version $Id$
+ */
+public class TestOpenBitSet extends TestCase {
+  static Random rand = new Random();
+
+  void doGet(BitSet a, OpenBitSet b) {
+    int max = a.size();
+    for (int i=0; i<max; i++) {
+      if (a.get(i) != b.get(i)) {
+        fail("mismatch: BitSet=["+i+"]="+a.get(i));
+      }
+    }
+  }
+
+  void doNextSetBit(BitSet a, OpenBitSet b) {
+    int aa=-1,bb=-1;
+    do {
+      aa = a.nextSetBit(aa+1);
+      bb = b.nextSetBit(bb+1);
+      assertEquals(aa,bb);
+    } while (aa>=0);
+  }
+
+  void doIterate(BitSet a, OpenBitSet b) {
+    int aa=-1,bb=-1;
+    BitSetIterator iterator = new BitSetIterator(b);
+    do {
+      aa = a.nextSetBit(aa+1);
+      bb = iterator.next();
+      assertEquals(aa,bb);
+    } while (aa>=0);
+  }
+
+
+  void doRandomSets(int maxSize, int iter) {
+    BitSet a0=null;
+    OpenBitSet b0=null;
+
+    for (int i=0; i<iter; i++) {
+      int sz = rand.nextInt(maxSize);
+      BitSet a = new BitSet(sz);
+      OpenBitSet b = new OpenBitSet(sz);
+
+      // test the various ways of setting bits
+      if (sz>0) {
+        int nOper = rand.nextInt(sz);
+        for (int j=0; j<nOper; j++) {
+          int idx;
+          idx = rand.nextInt(sz);
+          a.set(idx);
+          b.fastSet(idx);
+          idx = rand.nextInt(sz);
+          a.clear(idx);
+          b.fastClear(idx);
+          idx = rand.nextInt(sz);
+          a.flip(idx);
+          b.fastFlip(idx);
+
+          boolean val = b.flipAndGet(idx);
+          boolean val2 = b.flipAndGet(idx);
+          assertTrue(val != val2);
+
+          val = b.getAndSet(idx);
+          assertTrue(val2 == val);
+          assertTrue(b.get(idx));
+          if (!val) b.fastClear(idx);
+          assertTrue(b.get(idx) == val);
+        }
+      }
+
+      // test that the various ways of accessing the bits are equivalent
+      doGet(a,b);
+      doNextSetBit(a,b);
+      doIterate(a,b);
+
+      // test negation
+      int fromIndex = rand.nextInt(sz+80);
+      int toIndex = fromIndex + rand.nextInt((sz>>1)+1);
+      BitSet a_not = (BitSet)a.clone(); a_not.flip(fromIndex,toIndex);
+      OpenBitSet b_not = (OpenBitSet)b.clone(); b_not.flip(fromIndex,toIndex);
+      doIterate(a,b);
+
+      if (a0 != null) {
+        assertEquals( a.equals(a0), b.equals(b0));
+
+        assertEquals(a.cardinality(), b.cardinality());
+
+        BitSet a_and = (BitSet)a.clone(); a_and.and(a0);
+        BitSet a_or = (BitSet)a.clone(); a_or.or(a0);
+        BitSet a_xor = (BitSet)a.clone(); a_xor.xor(a0);
+        BitSet a_andn = (BitSet)a.clone(); a_andn.andNot(a0);
+
+        OpenBitSet b_and = (OpenBitSet)b.clone(); assertEquals(b,b_and); b_and.and(b0);
+        OpenBitSet b_or = (OpenBitSet)b.clone(); b_or.or(b0);
+        OpenBitSet b_xor = (OpenBitSet)b.clone(); b_xor.xor(b0);
+        OpenBitSet b_andn = (OpenBitSet)b.clone(); b_andn.andNot(b0);
+
+        doIterate(a_and,b_and);
+        doIterate(a_or,b_or);
+        doIterate(a_xor,b_xor);
+        doIterate(a_andn,b_andn);
+
+        assertEquals(a_and.cardinality(), b_and.cardinality());
+        assertEquals(a_or.cardinality(), b_or.cardinality());
+        assertEquals(a_xor.cardinality(), b_xor.cardinality());
+        assertEquals(a_andn.cardinality(), b_andn.cardinality());
+
+        // test non-mutating popcounts
+        assertEquals(b_and.cardinality(), OpenBitSet.intersectionCount(b,b0));
+        assertEquals(b_or.cardinality(), OpenBitSet.unionCount(b,b0));
+        assertEquals(b_xor.cardinality(), OpenBitSet.xorCount(b,b0));
+        assertEquals(b_andn.cardinality(), OpenBitSet.andNotCount(b,b0));
+      }
+
+      a0=a;
+      b0=b;
+    }
+  }
+
+  // large enough to flush obvious bugs, small enough to run in <.5 sec as part of a
+  // larger testsuite.
+  public void testSmall() {
+    doRandomSets(1200,1000);
+  }
+
+  public void testBig() {
+    // uncomment to run a bigger test (~2 minutes).
+    // doRandomSets(2000,200000);
+  }
+
+}
+
+
+
--- a/src/test/test-files/solr/conf/solrconfig.xml
+++ b/src/test/test-files/solr/conf/solrconfig.xml
@ -118,7 +118,8 @@

    <queryResultWindowSize>10</queryResultWindowSize>

-    <HashDocSet maxSize="3000" loadFactor="0.75"/>
+    <!-- set maxSize artificially low to exercise both types of sets -->
+    <HashDocSet maxSize="3" loadFactor="0.75"/>


    <!-- boolToFilterOptimizer converts boolean clauses with zero boost