HashDocSet new hash, union, andNot: SOLR-114

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@498246 13f79535-47bb-0310-9956-ffa450edef68
2007-01-21 05:46:31 +00:00 · 2007-01-21 05:46:31 +00:00 · 33ad0e6975
parent 2135986f8d
commit 33ad0e6975
4 changed files with 233 additions and 101 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -52,7 +52,10 @@ Changes in runtime behavior
    user query, not boost or filter queries (klaas).
 Optimizations 
- 1.
+ 1. SOLR-114: HashDocSet specific implementations of union() and andNot()
    for a 20x performance improvement for those set operations, and a new
    hash algorithm speeds up exists() by 10% and intersectionSize() by 8%.
   (yonik)
 Bug Fixes
 1. SOLR-87: Parsing of synonym files did not correctly handle escaped
--- a/src/java/org/apache/solr/search/BitDocSet.java
+++ b/src/java/org/apache/solr/search/BitDocSet.java
@ -177,6 +177,31 @@ public class BitDocSet extends DocSetBase {
    }
  }
  @Override
   public DocSet andNot(DocSet other) {
    OpenBitSet newbits = (OpenBitSet)(bits.clone());
     if (other instanceof OpenBitSet) {
       newbits.andNot(((BitDocSet)other).bits);
     } else {
       DocIterator iter = other.iterator();
       while (iter.hasNext()) newbits.clear(iter.nextDoc());
     }
     return new BitDocSet(newbits);
  }
  @Override
   public DocSet union(DocSet other) {
     OpenBitSet newbits = (OpenBitSet)(bits.clone());
     if (other instanceof BitDocSet) {
       newbits.union(((BitDocSet)other).bits);
     } else {
       DocIterator iter = other.iterator();
       while (iter.hasNext()) newbits.set(iter.nextDoc());
     }
     return new BitDocSet(newbits);
  }
  public long memSize() {
    return (bits.getBits().length << 3) + 16;
  }
--- a/src/java/org/apache/solr/search/HashDocSet.java
+++ b/src/java/org/apache/solr/search/HashDocSet.java
@ -31,7 +31,6 @@ import org.apache.solr.util.BitUtil;
 * @since solr 0.9
 */
 public final class HashDocSet extends DocSetBase {
  // final static float inverseLoadfactor = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f);
  /** Default load factor to use for HashDocSets.  We keep track of the inverse
   *  since multiplication is so much faster than division.  The default
   *  is 1.0f / 0.75f
@ -45,27 +44,27 @@ public final class HashDocSet extends DocSetBase {
  // an alternative to having to init the array to EMPTY at the start is
  //
  private final static int EMPTY=-1;
  private final int tablesize;
  private final int[] table;
  private final int size;
  private final int mask;
  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len) {
    this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
  }
  /** Create a HashDocSet from a list of *unique* ids */  
  public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
    int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
    if (tsize < len * inverseLoadFactor) {
      tsize <<= 1;
    }
-    tablesize = tsize;
+    mask=tsize-1;
    mask=tablesize-1;
-    table = new int[tablesize];
+    table = new int[tsize];
-    for (int i=0; i<tablesize; i++) table[i]=EMPTY;
+    for (int i=tsize-1; i>=0; i--) table[i]=EMPTY;
    for (int i=offset; i<len; i++) {
      put(docs[i]);
@ -75,104 +74,36 @@ public final class HashDocSet extends DocSetBase {
  }
  void put(int doc) {
-    table[getSlot(doc)]=doc;
+    int s = doc & mask;
    while (table[s]!=EMPTY) {
      // Adding an odd number to this power-of-two hash table is
      // guaranteed to do a full traversal, so instead of re-hashing
      // we jump straight to a "linear" traversal.
      // The key is that we provide many different ways to do the
      // traversal (tablesize/2) based on the last hash code (the doc).
      // Rely on loop invariant code motion to eval ((doc>>7)|1) only once.
      // otherwise, we would need to pull the first case out of the loop.
      s = (s + ((doc>>7)|1)) & mask;
    }
    table[s]=doc;
  }
-  private int getSlot(int val) {
+  public boolean exists(int doc) {
-    int s,v;
+    int s = doc & mask;
-    s=val & mask;
+    for(;;) {
-    v=table[s];
+      int v = table[s];
-    // check for EMPTY first since that value is more likely
+      if (v==EMPTY) return false;
-    if (v==EMPTY || v==val) return s;
+      if (v==doc) return true;
-    s=rehash(val);
+      // see put() for algorithm details.
-    return s;
+      s = (s + ((doc>>7)|1)) & mask;
    }
  }
  // As the size of this int hashtable is expected to be small
  // (thousands at most), I did not try to keep the rehash function
  // reversible (important to avoid collisions in large hash tables).
  private int rehash(int val) {
    int h,s,v;
    final int comp=~val;
    // don't left shift too far... the only bits
    // that count in the answer are the ones on the right.
    // We want to put more of the bits on the left
    // into the answer.
    // Keep small tables in mind.  We may be only using
    // the first 5 or 6 bits.
    // on the first rehash, use complement instead of val to shift
    // so we don't end up with 0 again if val==0.
    h = val ^ (comp>>8);
    s = h & mask;
    v = table[s];
    if (v==EMPTY || v==val) return s;
    h ^= (v << 17) | (comp >>> 16);   // this is reversible
    s = h & mask;
    v = table[s];
    if (v==EMPTY || v==val) return s;
    h ^= (h << 8) | (comp >>> 25);    // this is reversible
    s = h & mask;
    v = table[s];
    if (v==EMPTY || v==val) return s;
    /**********************
     // Knuth, Thomas Wang, http://www.concentric.net/~Ttwang/tech/inthash.htm
     // This magic number has no common factors with 2^32, and magic/(2^32) approximates
     // the golden ratio.
    private static final int magic = (int)2654435761L;
    h = magic*val;
    s = h & mask;
    v=table[s];
    if (v==EMPTY || v==val) return s;
    // the mult with magic should have thoroughly mixed the bits.
    // add entropy to the right half from the left half.
    h ^= h>>>16;
    s = h & mask;
    v=table[s];
    if (v==EMPTY || v==val) return s;
    *************************/
    // linear scan now... ug.
    final int start=s;
    while (++s<tablesize) {
      v=table[s];
      if (v==EMPTY || v==val) return s;
    }
    s=start;
    while (--s>=0) {
      v=table[s];
      if (v==EMPTY || v==val) return s;
    }
    return s;
  }
  /**
   *
   * @return The number of document ids in the set.
   */
  public int size() {
    return size;
  }
  public boolean exists(int docid) {
    int v = table[docid & mask];
    if (v==EMPTY) return false;
    else if (v==docid) return true;
    else {
      v = table[rehash(docid)];
      if (v==docid) return true;
      else return false;
    }
  }
  public DocIterator iterator() {
    return new DocIterator() {
      int pos=0;
@ -180,7 +111,7 @@ public final class HashDocSet extends DocSetBase {
      { goNext(); }
      public boolean hasNext() {
-        return pos < tablesize;
+        return pos < table.length;
      }
      public Integer next() {
@ -191,7 +122,7 @@ public final class HashDocSet extends DocSetBase {
      }
      void goNext() {
-        while (pos<tablesize && table[pos]==EMPTY) pos++;
+        while (pos<table.length && table[pos]==EMPTY) pos++;
      }
      // modify to return -1 at end of iteration?
@ -208,9 +139,8 @@ public final class HashDocSet extends DocSetBase {
    };
  }
  public long memSize() {
-    return (tablesize<<2) + 20;
+    return (table.length<<2) + 20;
  }
  @Override
@ -276,6 +206,48 @@ public final class HashDocSet extends DocSetBase {
  }
  @Override
  public DocSet andNot(DocSet other) {
    int[] result = new int[size()];
    int resultCount=0;
    for (int i=0; i<table.length; i++) {
      int id=table[i];
      if (id >= 0 && !other.exists(id)) {
        result[resultCount++]=id;
      }
    }
    return new HashDocSet(result,0,resultCount);
  }
  @Override
  public DocSet union(DocSet other) {
   if (other instanceof HashDocSet) {
     // set "a" to the smallest doc set
     final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other;
     final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this;
     int[] result = new int[a.size()+b.size()];
     int resultCount=0;
     // iterate over the largest table first, adding w/o checking.
     for (int i=0; i<b.table.length; i++) {
       int id=b.table[i];
       if (id>=0) result[resultCount++]=id;
     }
     // now iterate over smaller set, adding all not already in larger set.
     for (int i=0; i<a.table.length; i++) {
       int id=a.table[i];
       if (id>=0 && !b.exists(id)) result[resultCount++]=id;
     }
     return new HashDocSet(result,0,resultCount);
   } else {
     return other.union(this);
   }
  }
  // don't implement andNotSize() and unionSize() on purpose... they are implemented
  // in BaseDocSet in terms of intersectionSize().
 }
--- a/src/test/org/apache/solr/search/TestDocSet.java
+++ b/src/test/org/apache/solr/search/TestDocSet.java
@ -23,6 +23,7 @@ import java.util.Random;
 import org.apache.solr.util.OpenBitSet;
 import org.apache.solr.util.BitSetIterator;
 import org.apache.solr.util.BitUtil;
 /**
 * @author yonik
@ -30,6 +31,7 @@ import org.apache.solr.util.BitSetIterator;
 */
 public class TestDocSet extends TestCase {
  Random rand = new Random();
  float loadfactor;
  public OpenBitSet getRandomSet(int sz, int bitsToSet) {
    OpenBitSet bs = new OpenBitSet(sz);
@ -105,4 +107,134 @@ public class TestDocSet extends TestCase {
    doMany(300, 5000);
  }
  public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
    int n = rand.nextInt(maxSetSize);
    OpenBitSet obs = new OpenBitSet(maxDoc);
    int[] a = new int[n];
    for (int i=0; i<n; i++) {
      for(;;) {
        int idx = rand.nextInt(maxDoc);
        if (obs.getAndSet(idx)) continue;
        a[i]=idx;
        break;
      }
    }
    return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
  }
  public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
    DocSet[] sets = new DocSet[nSets];
    for (int i=0; i<nSets; i++) {
      sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
    }
    return sets;
  }
  /**** needs code insertion into HashDocSet
  public void testCollisions() {
    loadfactor=.75f;
    rand=new Random(12345);  // make deterministic
    int maxSetsize=4000;
    int nSets=256;
    int iter=1;
    int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
    int ret=0;
    long start=System.currentTimeMillis();
    for (int maxDoc : maxDocs) {
      int cstart = HashDocSet.collisions;
      DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
      for (DocSet s1 : sets) {
        for (DocSet s2 : sets) {
          if (s1!=s2) ret += s1.intersectionSize(s2);
        }
      }
      int cend = HashDocSet.collisions;
      System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));      
    }
    long end=System.currentTimeMillis();
    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
    if (ret==-1)System.out.println("wow!");
    System.out.println("collisions="+HashDocSet.collisions);
  }
  ***/
  /***
  public void testIntersectionSizePerformance() {
    loadfactor=.75f;
    rand=new Random(12345);  // make deterministic
    int maxSetsize=4000;
    int nSets=128;
    int iter=10;
    int maxDoc=1000000;
    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
    int ret=0;
    long start=System.currentTimeMillis();
    for (int i=0; i<iter; i++) {
      for (DocSet s1 : sets) {
        for (DocSet s2 : sets) {
          ret += s1.intersectionSize(s2);
        }
      }
    }
    long end=System.currentTimeMillis();
    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
    if (ret==-1)System.out.println("wow!");
  }
  public void testExistsPerformance() {
    loadfactor=.75f;
    rand=new Random(12345);  // make deterministic
    int maxSetsize=4000;
    int nSets=512;
    int iter=1;
    int maxDoc=1000000;
    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
    int ret=0;
    long start=System.currentTimeMillis();
    for (int i=0; i<iter; i++) {
      for (DocSet s1 : sets) {
        for (int j=0; j<maxDoc; j++) {
          ret += s1.exists(j) ? 1 :0;
        }
      }
    }
    long end=System.currentTimeMillis();
    System.out.println("testExistsSizePerformance="+(end-start)+" ms");
    if (ret==-1)System.out.println("wow!");
  }
   ***/
   /**** needs code insertion into HashDocSet
   public void testExistsCollisions() {
    loadfactor=.75f;
    rand=new Random(12345);  // make deterministic
    int maxSetsize=4000;
    int nSets=512;
    int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
    int ret=0;
    for (int maxDoc : maxDocs) {
      int mask = (BitUtil.nextHighestPowerOfTwo(maxDoc)>>1)-1;
      DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
      int cstart = HashDocSet.collisions;      
      for (DocSet s1 : sets) {
        for (int j=0; j<maxDocs[0]; j++) {
          int idx = rand.nextInt()&mask;
          ret += s1.exists(idx) ? 1 :0;
        }
      }
      int cend = HashDocSet.collisions;
      System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
    }
    if (ret==-1)System.out.println("wow!");
    System.out.println("collisions="+HashDocSet.collisions);
  }
  ***/
 }