HashDocSet new hash, union, andNot: SOLR-114

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@498246 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2007-01-21 05:46:31 +00:00
parent 2135986f8d
commit 33ad0e6975
4 changed files with 233 additions and 101 deletions

CHANGES.txt

@@ -52,7 +52,10 @@ Changes in runtime behavior
     user query, not boost or filter queries (klaas).
 
 Optimizations
- 1.
+ 1. SOLR-114: HashDocSet specific implementations of union() and andNot()
+    for a 20x performance improvement for those set operations, and a new
+    hash algorithm speeds up exists() by 10% and intersectionSize() by 8%.
+    (yonik)
 
 Bug Fixes
  1. SOLR-87: Parsing of synonym files did not correctly handle escaped

BitDocSet.java

@@ -177,6 +177,31 @@ public class BitDocSet extends DocSetBase {
     }
   }
 
+  @Override
+  public DocSet andNot(DocSet other) {
+    OpenBitSet newbits = (OpenBitSet)(bits.clone());
+    if (other instanceof BitDocSet) {
+      newbits.andNot(((BitDocSet)other).bits);
+    } else {
+      DocIterator iter = other.iterator();
+      while (iter.hasNext()) newbits.clear(iter.nextDoc());
+    }
+    return new BitDocSet(newbits);
+  }
+
+  @Override
+  public DocSet union(DocSet other) {
+    OpenBitSet newbits = (OpenBitSet)(bits.clone());
+    if (other instanceof BitDocSet) {
+      newbits.union(((BitDocSet)other).bits);
+    } else {
+      DocIterator iter = other.iterator();
+      while (iter.hasNext()) newbits.set(iter.nextDoc());
+    }
+    return new BitDocSet(newbits);
+  }
+
   public long memSize() {
     return (bits.getBits().length << 3) + 16;
   }
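As a quick usage sketch of the dispatch above (hypothetical values; only constructors and methods that appear in this commit are assumed): union() and andNot() take the word-at-a-time OpenBitSet path only when the other set is also a BitDocSet, and fall back to per-document iteration for any other DocSet.

    // Minimal sketch, assuming the DocSet classes shown in this commit.
    OpenBitSet bits = new OpenBitSet(64);
    bits.set(1); bits.set(5); bits.set(9);
    BitDocSet a = new BitDocSet(bits);                       // {1, 5, 9}
    HashDocSet h = new HashDocSet(new int[]{5, 42}, 0, 2);   // {5, 42}

    DocSet u = a.union(h);    // h is not a BitDocSet, so the iterator branch sets 5 and 42
    DocSet d = a.andNot(h);   // iterator branch clears 5 from a clone of a's bits -> {1, 9}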

HashDocSet.java

@@ -31,7 +31,6 @@ import org.apache.solr.util.BitUtil;
  * @since solr 0.9
  */
 public final class HashDocSet extends DocSetBase {
-  // final static float inverseLoadfactor = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f);
 
   /** Default load factor to use for HashDocSets. We keep track of the inverse
    * since multiplication is so much faster than division. The default
    * is 1.0f / 0.75f
@@ -45,27 +44,27 @@ public final class HashDocSet extends DocSetBase {
   // an alternative to having to init the array to EMPTY at the start is
   //
   private final static int EMPTY=-1;
-  private final int tablesize;
   private final int[] table;
   private final int size;
   private final int mask;
 
   /** Create a HashDocSet from a list of *unique* ids */
   public HashDocSet(int[] docs, int offset, int len) {
     this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
   }
 
   /** Create a HashDocSet from a list of *unique* ids */
   public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
     int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
     if (tsize < len * inverseLoadFactor) {
       tsize <<= 1;
     }
-    tablesize = tsize;
-    mask=tablesize-1;
-    table = new int[tablesize];
-    for (int i=0; i<tablesize; i++) table[i]=EMPTY;
+    mask=tsize-1;
+
+    table = new int[tsize];
+    for (int i=tsize-1; i>=0; i--) table[i]=EMPTY;
 
     for (int i=offset; i<len; i++) {
       put(docs[i]);
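As a worked example of the sizing logic above: for len=100 unique ids with the default inverse load factor of 1/0.75 ≈ 1.333, nextHighestPowerOfTwo(100) gives 128; since 128 < 100 * 1.333 ≈ 133.3, the table doubles to 256. The mask is then 255 and the effective load factor is 100/256 ≈ 0.39, comfortably under 0.75.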
@@ -75,104 +74,36 @@ public final class HashDocSet extends DocSetBase {
   }
 
   void put(int doc) {
-    table[getSlot(doc)]=doc;
-  }
-
-  private int getSlot(int val) {
-    int s,v;
-    s=val & mask;
-    v=table[s];
-    // check for EMPTY first since that value is more likely
-    if (v==EMPTY || v==val) return s;
-    s=rehash(val);
-    return s;
-  }
-
-  // As the size of this int hashtable is expected to be small
-  // (thousands at most), I did not try to keep the rehash function
-  // reversible (important to avoid collisions in large hash tables).
-  private int rehash(int val) {
-    int h,s,v;
-    final int comp=~val;
-
-    // don't left shift too far... the only bits
-    // that count in the answer are the ones on the right.
-    // We want to put more of the bits on the left
-    // into the answer.
-    // Keep small tables in mind. We may be only using
-    // the first 5 or 6 bits.
-
-    // on the first rehash, use complement instead of val to shift
-    // so we don't end up with 0 again if val==0.
-    h = val ^ (comp>>8);
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    h ^= (v << 17) | (comp >>> 16);  // this is reversible
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    h ^= (h << 8) | (comp >>> 25);  // this is reversible
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    /**********************
-    // Knuth, Thomas Wang, http://www.concentric.net/~Ttwang/tech/inthash.htm
-    // This magic number has no common factors with 2^32, and magic/(2^32) approximates
-    // the golden ratio.
-    private static final int magic = (int)2654435761L;
-
-    h = magic*val;
-    s = h & mask;
-    v=table[s];
-    if (v==EMPTY || v==val) return s;
-
-    // the mult with magic should have thoroughly mixed the bits.
-    // add entropy to the right half from the left half.
-    h ^= h>>>16;
-    s = h & mask;
-    v=table[s];
-    if (v==EMPTY || v==val) return s;
-    *************************/
-
-    // linear scan now... ug.
-    final int start=s;
-    while (++s<tablesize) {
-      v=table[s];
-      if (v==EMPTY || v==val) return s;
-    }
-    s=start;
-    while (--s>=0) {
-      v=table[s];
-      if (v==EMPTY || v==val) return s;
-    }
-    return s;
-  }
+    int s = doc & mask;
+    while (table[s]!=EMPTY) {
+      // Adding an odd number to this power-of-two hash table is
+      // guaranteed to do a full traversal, so instead of re-hashing
+      // we jump straight to a "linear" traversal.
+      // The key is that we provide many different ways to do the
+      // traversal (tablesize/2) based on the last hash code (the doc).
+      // Rely on loop invariant code motion to eval ((doc>>7)|1) only once.
+      // otherwise, we would need to pull the first case out of the loop.
+      s = (s + ((doc>>7)|1)) & mask;
+    }
+    table[s]=doc;
+  }
+
+  public boolean exists(int doc) {
+    int s = doc & mask;
+    for(;;) {
+      int v = table[s];
+      if (v==EMPTY) return false;
+      if (v==doc) return true;
+      // see put() for algorithm details.
+      s = (s + ((doc>>7)|1)) & mask;
+    }
+  }
 
   /**
    *
   * @return The number of document ids in the set.
   */
   public int size() {
     return size;
   }
 
-  public boolean exists(int docid) {
-    int v = table[docid & mask];
-    if (v==EMPTY) return false;
-    else if (v==docid) return true;
-    else {
-      v = table[rehash(docid)];
-      if (v==docid) return true;
-      else return false;
-    }
-  }
-
   public DocIterator iterator() {
     return new DocIterator() {
       int pos=0;
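The comment in the new put() rests on a small number-theoretic fact: an odd step is coprime to a power-of-two table size, so repeatedly adding it modulo the size visits every slot before repeating. A standalone sketch verifying that claim (the class name is illustrative, not part of this commit):

    // Illustrative check: an odd step over a power-of-two table hits every slot.
    public class OddStepProbeCheck {
      public static void main(String[] args) {
        int tablesize = 16;                 // any power of two
        int mask = tablesize - 1;
        int doc = 1234;                     // arbitrary hash code
        int step = (doc >> 7) | 1;          // the same odd step put() and exists() use

        boolean[] seen = new boolean[tablesize];
        int s = doc & mask;
        for (int i = 0; i < tablesize; i++) {
          seen[s] = true;
          s = (s + step) & mask;            // identical probe sequence to put()
        }
        for (boolean hit : seen) {
          if (!hit) throw new AssertionError("slot missed");
        }
        System.out.println("all " + tablesize + " slots visited exactly once");
      }
    }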
@@ -180,7 +111,7 @@ public final class HashDocSet extends DocSetBase {
       { goNext(); }
 
       public boolean hasNext() {
-        return pos < tablesize;
+        return pos < table.length;
       }
 
       public Integer next() {
@@ -191,7 +122,7 @@ public final class HashDocSet extends DocSetBase {
       }
 
       void goNext() {
-        while (pos<tablesize && table[pos]==EMPTY) pos++;
+        while (pos<table.length && table[pos]==EMPTY) pos++;
       }
 
       // modify to return -1 at end of iteration?
@@ -208,9 +139,8 @@ public final class HashDocSet extends DocSetBase {
     };
   }
 
   public long memSize() {
-    return (tablesize<<2) + 20;
+    return (table.length<<2) + 20;
   }
 
   @Override
@@ -276,6 +206,48 @@ public final class HashDocSet extends DocSetBase {
   }
 
+  @Override
+  public DocSet andNot(DocSet other) {
+    int[] result = new int[size()];
+    int resultCount=0;
+    for (int i=0; i<table.length; i++) {
+      int id=table[i];
+      if (id >= 0 && !other.exists(id)) {
+        result[resultCount++]=id;
+      }
+    }
+    return new HashDocSet(result,0,resultCount);
+  }
+
+  @Override
+  public DocSet union(DocSet other) {
+    if (other instanceof HashDocSet) {
+      // set "a" to the smallest doc set
+      final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other;
+      final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this;
+
+      int[] result = new int[a.size()+b.size()];
+      int resultCount=0;
+      // iterate over the largest table first, adding w/o checking.
+      for (int i=0; i<b.table.length; i++) {
+        int id=b.table[i];
+        if (id>=0) result[resultCount++]=id;
+      }
+      // now iterate over smaller set, adding all not already in larger set.
+      for (int i=0; i<a.table.length; i++) {
+        int id=a.table[i];
+        if (id>=0 && !b.exists(id)) result[resultCount++]=id;
+      }
+      return new HashDocSet(result,0,resultCount);
+    } else {
+      return other.union(this);
+    }
+  }
+
+  // don't implement andNotSize() and unionSize() on purpose... they are implemented
+  // in DocSetBase in terms of intersectionSize().
 }
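A small sanity sketch of the merge above (hypothetical values; assumes only the API shown in this commit): the larger table is copied without membership checks, and the smaller set contributes only ids that fail b.exists(), so no id is emitted twice.

    // Minimal sketch of the HashDocSet-to-HashDocSet union path.
    HashDocSet a = new HashDocSet(new int[]{1, 2}, 0, 2);      // smaller set
    HashDocSet b = new HashDocSet(new int[]{2, 3, 4}, 0, 3);   // larger set

    DocSet u = a.union(b);    // {2, 3, 4} copied unchecked; only 1 survives !b.exists() -> size 4
    DocSet n = a.andNot(b);   // ids of a not present in b -> {1}, size 1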

TestDocSet.java

@@ -23,6 +23,7 @@ import java.util.Random;
 
 import org.apache.solr.util.OpenBitSet;
 import org.apache.solr.util.BitSetIterator;
+import org.apache.solr.util.BitUtil;
 
 /**
  * @author yonik
@@ -30,6 +31,7 @@ import org.apache.solr.util.BitSetIterator;
  */
 public class TestDocSet extends TestCase {
   Random rand = new Random();
+  float loadfactor;
 
   public OpenBitSet getRandomSet(int sz, int bitsToSet) {
     OpenBitSet bs = new OpenBitSet(sz);
@@ -105,4 +107,134 @@ public class TestDocSet extends TestCase {
     doMany(300, 5000);
   }
+  public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
+    int n = rand.nextInt(maxSetSize);
+    OpenBitSet obs = new OpenBitSet(maxDoc);
+    int[] a = new int[n];
+    for (int i=0; i<n; i++) {
+      for(;;) {
+        int idx = rand.nextInt(maxDoc);
+        if (obs.getAndSet(idx)) continue;
+        a[i]=idx;
+        break;
+      }
+    }
+    return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
+  }
+
+  public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
+    DocSet[] sets = new DocSet[nSets];
+    for (int i=0; i<nSets; i++) {
+      sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
+    }
+    return sets;
+  }
+  /**** needs code insertion into HashDocSet
+  public void testCollisions() {
+    loadfactor=.75f;
+    rand=new Random(12345);  // make deterministic
+    int maxSetsize=4000;
+    int nSets=256;
+    int iter=1;
+    int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int maxDoc : maxDocs) {
+      int cstart = HashDocSet.collisions;
+      DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+      for (DocSet s1 : sets) {
+        for (DocSet s2 : sets) {
+          if (s1!=s2) ret += s1.intersectionSize(s2);
+        }
+      }
+      int cend = HashDocSet.collisions;
+      System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+    System.out.println("collisions="+HashDocSet.collisions);
+  }
+  ***/
+  /***
+  public void testIntersectionSizePerformance() {
+    loadfactor=.75f;
+    rand=new Random(12345);  // make deterministic
+    int maxSetsize=4000;
+    int nSets=128;
+    int iter=10;
+    int maxDoc=1000000;
+    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int i=0; i<iter; i++) {
+      for (DocSet s1 : sets) {
+        for (DocSet s2 : sets) {
+          ret += s1.intersectionSize(s2);
+        }
+      }
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+  }
+
+  public void testExistsPerformance() {
+    loadfactor=.75f;
+    rand=new Random(12345);  // make deterministic
+    int maxSetsize=4000;
+    int nSets=512;
+    int iter=1;
+    int maxDoc=1000000;
+    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int i=0; i<iter; i++) {
+      for (DocSet s1 : sets) {
+        for (int j=0; j<maxDoc; j++) {
+          ret += s1.exists(j) ? 1 : 0;
+        }
+      }
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testExistsPerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+  }
+  ***/
+  /**** needs code insertion into HashDocSet
+  public void testExistsCollisions() {
+    loadfactor=.75f;
+    rand=new Random(12345);  // make deterministic
+    int maxSetsize=4000;
+    int nSets=512;
+    int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
+    int ret=0;
+    for (int maxDoc : maxDocs) {
+      int mask = (BitUtil.nextHighestPowerOfTwo(maxDoc)>>1)-1;
+      DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+      int cstart = HashDocSet.collisions;
+      for (DocSet s1 : sets) {
+        for (int j=0; j<maxDocs[0]; j++) {
+          int idx = rand.nextInt()&mask;
+          ret += s1.exists(idx) ? 1 : 0;
+        }
+      }
+      int cend = HashDocSet.collisions;
+      System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
+    }
+    if (ret==-1)System.out.println("wow!");
+    System.out.println("collisions="+HashDocSet.collisions);
+  }
+  ***/
 }