HashDocSet new hash, union, andNot: SOLR-114

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@498246 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2007-01-21 05:46:31 +00:00
parent 2135986f8d
commit 33ad0e6975
4 changed files with 233 additions and 101 deletions

CHANGES.txt

@@ -52,7 +52,10 @@ Changes in runtime behavior
     user query, not boost or filter queries (klaas).
 
 Optimizations
- 1.
+ 1. SOLR-114: HashDocSet specific implementations of union() and andNot()
+    for a 20x performance improvement for those set operations, and a new
+    hash algorithm speeds up exists() by 10% and intersectionSize() by 8%.
+    (yonik)
 
 Bug Fixes
  1. SOLR-87: Parsing of synonym files did not correctly handle escaped

BitDocSet.java

@@ -177,6 +177,31 @@ public class BitDocSet extends DocSetBase {
     }
   }
 
+  @Override
+  public DocSet andNot(DocSet other) {
+    OpenBitSet newbits = (OpenBitSet)(bits.clone());
+    if (other instanceof BitDocSet) {
+      newbits.andNot(((BitDocSet)other).bits);
+    } else {
+      DocIterator iter = other.iterator();
+      while (iter.hasNext()) newbits.clear(iter.nextDoc());
+    }
+    return new BitDocSet(newbits);
+  }
+
+  @Override
+  public DocSet union(DocSet other) {
+    OpenBitSet newbits = (OpenBitSet)(bits.clone());
+    if (other instanceof BitDocSet) {
+      newbits.union(((BitDocSet)other).bits);
+    } else {
+      DocIterator iter = other.iterator();
+      while (iter.hasNext()) newbits.set(iter.nextDoc());
+    }
+    return new BitDocSet(newbits);
+  }
+
   public long memSize() {
     return (bits.getBits().length << 3) + 16;
   }
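As a quick usage sketch of the dispatch above (hypothetical values; only constructors and methods that appear in this commit are assumed): union() and andNot() take the word-at-a-time OpenBitSet path only when the other set is also a BitDocSet, and fall back to per-document iteration for any other DocSet.

    // Minimal sketch, assuming the DocSet classes shown in this commit.
    OpenBitSet bits = new OpenBitSet(64);
    bits.set(1); bits.set(5); bits.set(9);
    BitDocSet a = new BitDocSet(bits);                       // {1, 5, 9}
    HashDocSet h = new HashDocSet(new int[]{5, 42}, 0, 2);   // {5, 42}

    DocSet u = a.union(h);    // h is not a BitDocSet, so the iterator branch sets 5 and 42
    DocSet d = a.andNot(h);   // iterator branch clears 5 from a clone of a's bits -> {1, 9}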

HashDocSet.java

@@ -31,7 +31,6 @@ import org.apache.solr.util.BitUtil;
  * @since solr 0.9
  */
 public final class HashDocSet extends DocSetBase {
-  // final static float inverseLoadfactor = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f);
 
   /** Default load factor to use for HashDocSets. We keep track of the inverse
    * since multiplication is so much faster than division. The default
    * is 1.0f / 0.75f
@@ -45,27 +44,27 @@ public final class HashDocSet extends DocSetBase {
   // an alternative to having to init the array to EMPTY at the start is
   //
   private final static int EMPTY=-1;
-  private final int tablesize;
   private final int[] table;
   private final int size;
   private final int mask;
 
   /** Create a HashDocSet from a list of *unique* ids */
   public HashDocSet(int[] docs, int offset, int len) {
     this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
   }
 
   /** Create a HashDocSet from a list of *unique* ids */
   public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
     int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
     if (tsize < len * inverseLoadFactor) {
       tsize <<= 1;
     }
-    tablesize = tsize;
-    mask=tablesize-1;
-    table = new int[tablesize];
-    for (int i=0; i<tablesize; i++) table[i]=EMPTY;
+    mask=tsize-1;
+
+    table = new int[tsize];
+    for (int i=tsize-1; i>=0; i--) table[i]=EMPTY;
 
     for (int i=offset; i<len; i++) {
       put(docs[i]);
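As a worked example of the sizing logic above: for len=100 unique ids with the default inverse load factor of 1/0.75 ≈ 1.333, nextHighestPowerOfTwo(100) gives 128; since 128 < 100 * 1.333 ≈ 133.3, the table doubles to 256. The mask is then 255 and the effective load factor is 100/256 ≈ 0.39, comfortably under 0.75.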
@@ -75,104 +74,36 @@ public final class HashDocSet extends DocSetBase {
   }
 
   void put(int doc) {
-    table[getSlot(doc)]=doc;
-  }
-
-  private int getSlot(int val) {
-    int s,v;
-    s=val & mask;
-    v=table[s];
-    // check for EMPTY first since that value is more likely
-    if (v==EMPTY || v==val) return s;
-    s=rehash(val);
-    return s;
-  }
-
-  // As the size of this int hashtable is expected to be small
-  // (thousands at most), I did not try to keep the rehash function
-  // reversible (important to avoid collisions in large hash tables).
-  private int rehash(int val) {
-    int h,s,v;
-    final int comp=~val;
-
-    // don't left shift too far... the only bits
-    // that count in the answer are the ones on the right.
-    // We want to put more of the bits on the left
-    // into the answer.
-    // Keep small tables in mind. We may be only using
-    // the first 5 or 6 bits.
-
-    // on the first rehash, use complement instead of val to shift
-    // so we don't end up with 0 again if val==0.
-    h = val ^ (comp>>8);
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    h ^= (v << 17) | (comp >>> 16);  // this is reversible
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    h ^= (h << 8) | (comp >>> 25);  // this is reversible
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    /**********************
-    // Knuth, Thomas Wang, http://www.concentric.net/~Ttwang/tech/inthash.htm
-    // This magic number has no common factors with 2^32, and magic/(2^32) approximates
-    // the golden ratio.
-    private static final int magic = (int)2654435761L;
-
-    h = magic*val;
-    s = h & mask;
-    v=table[s];
-    if (v==EMPTY || v==val) return s;
-
-    // the mult with magic should have thoroughly mixed the bits.
-    // add entropy to the right half from the left half.
-    h ^= h>>>16;
-    s = h & mask;
-    v=table[s];
-    if (v==EMPTY || v==val) return s;
-    *************************/
-
-    // linear scan now... ug.
-    final int start=s;
-    while (++s<tablesize) {
-      v=table[s];
-      if (v==EMPTY || v==val) return s;
-    }
-    s=start;
-    while (--s>=0) {
-      v=table[s];
-      if (v==EMPTY || v==val) return s;
-    }
-    return s;
-  }
+    int s = doc & mask;
+    while (table[s]!=EMPTY) {
+      // Adding an odd number to this power-of-two hash table is
+      // guaranteed to do a full traversal, so instead of re-hashing
+      // we jump straight to a "linear" traversal.
+      // The key is that we provide many different ways to do the
+      // traversal (tablesize/2) based on the last hash code (the doc).
+      // Rely on loop invariant code motion to eval ((doc>>7)|1) only once.
+      // otherwise, we would need to pull the first case out of the loop.
+      s = (s + ((doc>>7)|1)) & mask;
+    }
+    table[s]=doc;
+  }
+
+  public boolean exists(int doc) {
+    int s = doc & mask;
+    for(;;) {
+      int v = table[s];
+      if (v==EMPTY) return false;
+      if (v==doc) return true;
+      // see put() for algorithm details.
+      s = (s + ((doc>>7)|1)) & mask;
+    }
+  }
 
   /**
    *
   * @return The number of document ids in the set.
   */
   public int size() {
     return size;
   }
 
-  public boolean exists(int docid) {
-    int v = table[docid & mask];
-    if (v==EMPTY) return false;
-    else if (v==docid) return true;
-    else {
-      v = table[rehash(docid)];
-      if (v==docid) return true;
-      else return false;
-    }
-  }
-
   public DocIterator iterator() {
     return new DocIterator() {
       int pos=0;
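The comment in the new put() rests on a small number-theoretic fact: an odd step is coprime to a power-of-two table size, so repeatedly adding it modulo the size visits every slot before repeating. A standalone sketch verifying that claim (the class name is illustrative, not part of this commit):

    // Illustrative check: an odd step over a power-of-two table hits every slot.
    public class OddStepProbeCheck {
      public static void main(String[] args) {
        int tablesize = 16;                 // any power of two
        int mask = tablesize - 1;
        int doc = 1234;                     // arbitrary hash code
        int step = (doc >> 7) | 1;          // the same odd step put() and exists() use

        boolean[] seen = new boolean[tablesize];
        int s = doc & mask;
        for (int i = 0; i < tablesize; i++) {
          seen[s] = true;
          s = (s + step) & mask;            // identical probe sequence to put()
        }
        for (boolean hit : seen) {
          if (!hit) throw new AssertionError("slot missed");
        }
        System.out.println("all " + tablesize + " slots visited exactly once");
      }
    }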
@@ -180,7 +111,7 @@ public final class HashDocSet extends DocSetBase {
       { goNext(); }
 
       public boolean hasNext() {
-        return pos < tablesize;
+        return pos < table.length;
       }
 
       public Integer next() {
@@ -191,7 +122,7 @@ public final class HashDocSet extends DocSetBase {
       }
 
       void goNext() {
-        while (pos<tablesize && table[pos]==EMPTY) pos++;
+        while (pos<table.length && table[pos]==EMPTY) pos++;
       }
 
       // modify to return -1 at end of iteration?
@@ -208,9 +139,8 @@ public final class HashDocSet extends DocSetBase {
     };
   }
 
   public long memSize() {
-    return (tablesize<<2) + 20;
+    return (table.length<<2) + 20;
   }
 
   @Override
@@ -276,6 +206,48 @@ public final class HashDocSet extends DocSetBase {
   }
 
+  @Override
+  public DocSet andNot(DocSet other) {
+    int[] result = new int[size()];
+    int resultCount=0;
+    for (int i=0; i<table.length; i++) {
+      int id=table[i];
+      if (id >= 0 && !other.exists(id)) {
+        result[resultCount++]=id;
+      }
+    }
+    return new HashDocSet(result,0,resultCount);
+  }
+
+  @Override
+  public DocSet union(DocSet other) {
+    if (other instanceof HashDocSet) {
+      // set "a" to the smallest doc set
+      final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other;
+      final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this;
+
+      int[] result = new int[a.size()+b.size()];
+      int resultCount=0;
+      // iterate over the largest table first, adding w/o checking.
+      for (int i=0; i<b.table.length; i++) {
+        int id=b.table[i];
+        if (id>=0) result[resultCount++]=id;
+      }
+      // now iterate over smaller set, adding all not already in larger set.
+      for (int i=0; i<a.table.length; i++) {
+        int id=a.table[i];
+        if (id>=0 && !b.exists(id)) result[resultCount++]=id;
+      }
+      return new HashDocSet(result,0,resultCount);
+    } else {
+      return other.union(this);
+    }
+  }
+
+  // don't implement andNotSize() and unionSize() on purpose... they are implemented
+  // in DocSetBase in terms of intersectionSize().
 }
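A small sanity sketch of the merge above (hypothetical values; assumes only the API shown in this commit): the larger table is copied without membership checks, and the smaller set contributes only ids that fail b.exists(), so no id is emitted twice.

    // Minimal sketch of the HashDocSet-to-HashDocSet union path.
    HashDocSet a = new HashDocSet(new int[]{1, 2}, 0, 2);      // smaller set
    HashDocSet b = new HashDocSet(new int[]{2, 3, 4}, 0, 3);   // larger set

    DocSet u = a.union(b);    // {2, 3, 4} copied unchecked; only 1 survives !b.exists() -> size 4
    DocSet n = a.andNot(b);   // ids of a not present in b -> {1}, size 1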

TestDocSet.java

@@ -23,6 +23,7 @@ import java.util.Random;
 
 import org.apache.solr.util.OpenBitSet;
 import org.apache.solr.util.BitSetIterator;
+import org.apache.solr.util.BitUtil;
 
 /**
  * @author yonik
@@ -30,6 +31,7 @@ import org.apache.solr.util.BitSetIterator;
  */
 public class TestDocSet extends TestCase {
   Random rand = new Random();
+  float loadfactor;
 
   public OpenBitSet getRandomSet(int sz, int bitsToSet) {
     OpenBitSet bs = new OpenBitSet(sz);
@@ -105,4 +107,134 @@ public class TestDocSet extends TestCase {
     doMany(300, 5000);
   }
+  public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
+    int n = rand.nextInt(maxSetSize);
+    OpenBitSet obs = new OpenBitSet(maxDoc);
+    int[] a = new int[n];
+    for (int i=0; i<n; i++) {
+      for(;;) {
+        int idx = rand.nextInt(maxDoc);
+        if (obs.getAndSet(idx)) continue;
+        a[i]=idx;
+        break;
+      }
+    }
+    return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
+  }
+
+  public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
+    DocSet[] sets = new DocSet[nSets];
+    for (int i=0; i<nSets; i++) {
+      sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
+    }
+    return sets;
+  }
+  /**** needs code insertion into HashDocSet
+  public void testCollisions() {
+    loadfactor=.75f;
+    rand=new Random(12345);  // make deterministic
+    int maxSetsize=4000;
+    int nSets=256;
+    int iter=1;
+    int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int maxDoc : maxDocs) {
+      int cstart = HashDocSet.collisions;
+      DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+      for (DocSet s1 : sets) {
+        for (DocSet s2 : sets) {
+          if (s1!=s2) ret += s1.intersectionSize(s2);
+        }
+      }
+      int cend = HashDocSet.collisions;
+      System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+    System.out.println("collisions="+HashDocSet.collisions);
+  }
+  ***/
+  /***
+  public void testIntersectionSizePerformance() {
+    loadfactor=.75f;
+    rand=new Random(12345);  // make deterministic
+    int maxSetsize=4000;
+    int nSets=128;
+    int iter=10;
+    int maxDoc=1000000;
+    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int i=0; i<iter; i++) {
+      for (DocSet s1 : sets) {
+        for (DocSet s2 : sets) {
+          ret += s1.intersectionSize(s2);
+        }
+      }
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+  }
+
+  public void testExistsPerformance() {
+    loadfactor=.75f;
+    rand=new Random(12345);  // make deterministic
+    int maxSetsize=4000;
+    int nSets=512;
+    int iter=1;
+    int maxDoc=1000000;
+    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int i=0; i<iter; i++) {
+      for (DocSet s1 : sets) {
+        for (int j=0; j<maxDoc; j++) {
+          ret += s1.exists(j) ? 1 : 0;
+        }
+      }
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testExistsPerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+  }
+  ***/
+  /**** needs code insertion into HashDocSet
+  public void testExistsCollisions() {
+    loadfactor=.75f;
+    rand=new Random(12345);  // make deterministic
+    int maxSetsize=4000;
+    int nSets=512;
+    int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
+    int ret=0;
+    for (int maxDoc : maxDocs) {
+      int mask = (BitUtil.nextHighestPowerOfTwo(maxDoc)>>1)-1;
+      DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+      int cstart = HashDocSet.collisions;
+      for (DocSet s1 : sets) {
+        for (int j=0; j<maxDocs[0]; j++) {
+          int idx = rand.nextInt()&mask;
+          ret += s1.exists(idx) ? 1 : 0;
+        }
+      }
+      int cend = HashDocSet.collisions;
+      System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
+    }
+    if (ret==-1)System.out.println("wow!");
+    System.out.println("collisions="+HashDocSet.collisions);
+  }
+  ***/
 }