mirror of https://github.com/apache/lucene.git
HashDocSet new hash, union, andNot: SOLR-114
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@498246 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2135986f8d
commit
33ad0e6975
|
@ -52,7 +52,10 @@ Changes in runtime behavior
|
|||
user query, not boost or filter queries (klaas).
|
||||
|
||||
Optimizations
|
||||
1.
|
||||
1. SOLR-114: HashDocSet specific implementations of union() and andNot()
|
||||
for a 20x performance improvement for those set operations, and a new
|
||||
hash algorithm speeds up exists() by 10% and intersectionSize() by 8%.
|
||||
(yonik)
|
||||
|
||||
Bug Fixes
|
||||
1. SOLR-87: Parsing of synonym files did not correctly handle escaped
|
||||
|
|
|
@ -177,6 +177,31 @@ public class BitDocSet extends DocSetBase {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocSet andNot(DocSet other) {
|
||||
OpenBitSet newbits = (OpenBitSet)(bits.clone());
|
||||
if (other instanceof OpenBitSet) {
|
||||
newbits.andNot(((BitDocSet)other).bits);
|
||||
} else {
|
||||
DocIterator iter = other.iterator();
|
||||
while (iter.hasNext()) newbits.clear(iter.nextDoc());
|
||||
}
|
||||
return new BitDocSet(newbits);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocSet union(DocSet other) {
|
||||
OpenBitSet newbits = (OpenBitSet)(bits.clone());
|
||||
if (other instanceof BitDocSet) {
|
||||
newbits.union(((BitDocSet)other).bits);
|
||||
} else {
|
||||
DocIterator iter = other.iterator();
|
||||
while (iter.hasNext()) newbits.set(iter.nextDoc());
|
||||
}
|
||||
return new BitDocSet(newbits);
|
||||
}
|
||||
|
||||
|
||||
public long memSize() {
|
||||
return (bits.getBits().length << 3) + 16;
|
||||
}
|
||||
|
|
|
@ -31,7 +31,6 @@ import org.apache.solr.util.BitUtil;
|
|||
* @since solr 0.9
|
||||
*/
|
||||
public final class HashDocSet extends DocSetBase {
|
||||
// final static float inverseLoadfactor = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f);
|
||||
/** Default load factor to use for HashDocSets. We keep track of the inverse
|
||||
* since multiplication is so much faster than division. The default
|
||||
* is 1.0f / 0.75f
|
||||
|
@ -45,27 +44,27 @@ public final class HashDocSet extends DocSetBase {
|
|||
// an alternative to having to init the array to EMPTY at the start is
|
||||
//
|
||||
private final static int EMPTY=-1;
|
||||
private final int tablesize;
|
||||
private final int[] table;
|
||||
private final int size;
|
||||
|
||||
private final int mask;
|
||||
|
||||
/** Create a HashDocSet from a list of *unique* ids */
|
||||
public HashDocSet(int[] docs, int offset, int len) {
|
||||
this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
|
||||
}
|
||||
|
||||
/** Create a HashDocSet from a list of *unique* ids */
|
||||
public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
|
||||
int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
|
||||
if (tsize < len * inverseLoadFactor) {
|
||||
tsize <<= 1;
|
||||
}
|
||||
|
||||
tablesize = tsize;
|
||||
mask=tablesize-1;
|
||||
mask=tsize-1;
|
||||
|
||||
table = new int[tablesize];
|
||||
for (int i=0; i<tablesize; i++) table[i]=EMPTY;
|
||||
table = new int[tsize];
|
||||
for (int i=tsize-1; i>=0; i--) table[i]=EMPTY;
|
||||
|
||||
for (int i=offset; i<len; i++) {
|
||||
put(docs[i]);
|
||||
|
@ -75,104 +74,36 @@ public final class HashDocSet extends DocSetBase {
|
|||
}
|
||||
|
||||
void put(int doc) {
|
||||
table[getSlot(doc)]=doc;
|
||||
}
|
||||
|
||||
private int getSlot(int val) {
|
||||
int s,v;
|
||||
s=val & mask;
|
||||
v=table[s];
|
||||
// check for EMPTY first since that value is more likely
|
||||
if (v==EMPTY || v==val) return s;
|
||||
s=rehash(val);
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
// As the size of this int hashtable is expected to be small
|
||||
// (thousands at most), I did not try to keep the rehash function
|
||||
// reversible (important to avoid collisions in large hash tables).
|
||||
private int rehash(int val) {
|
||||
int h,s,v;
|
||||
final int comp=~val;
|
||||
|
||||
// don't left shift too far... the only bits
|
||||
// that count in the answer are the ones on the right.
|
||||
// We want to put more of the bits on the left
|
||||
// into the answer.
|
||||
// Keep small tables in mind. We may be only using
|
||||
// the first 5 or 6 bits.
|
||||
|
||||
// on the first rehash, use complement instead of val to shift
|
||||
// so we don't end up with 0 again if val==0.
|
||||
h = val ^ (comp>>8);
|
||||
s = h & mask;
|
||||
v = table[s];
|
||||
if (v==EMPTY || v==val) return s;
|
||||
|
||||
h ^= (v << 17) | (comp >>> 16); // this is reversible
|
||||
s = h & mask;
|
||||
v = table[s];
|
||||
if (v==EMPTY || v==val) return s;
|
||||
|
||||
h ^= (h << 8) | (comp >>> 25); // this is reversible
|
||||
s = h & mask;
|
||||
v = table[s];
|
||||
if (v==EMPTY || v==val) return s;
|
||||
|
||||
/**********************
|
||||
// Knuth, Thomas Wang, http://www.concentric.net/~Ttwang/tech/inthash.htm
|
||||
// This magic number has no common factors with 2^32, and magic/(2^32) approximates
|
||||
// the golden ratio.
|
||||
private static final int magic = (int)2654435761L;
|
||||
|
||||
h = magic*val;
|
||||
s = h & mask;
|
||||
v=table[s];
|
||||
if (v==EMPTY || v==val) return s;
|
||||
|
||||
// the mult with magic should have thoroughly mixed the bits.
|
||||
// add entropy to the right half from the left half.
|
||||
h ^= h>>>16;
|
||||
s = h & mask;
|
||||
v=table[s];
|
||||
if (v==EMPTY || v==val) return s;
|
||||
*************************/
|
||||
|
||||
// linear scan now... ug.
|
||||
final int start=s;
|
||||
while (++s<tablesize) {
|
||||
v=table[s];
|
||||
if (v==EMPTY || v==val) return s;
|
||||
int s = doc & mask;
|
||||
while (table[s]!=EMPTY) {
|
||||
// Adding an odd number to this power-of-two hash table is
|
||||
// guaranteed to do a full traversal, so instead of re-hashing
|
||||
// we jump straight to a "linear" traversal.
|
||||
// The key is that we provide many different ways to do the
|
||||
// traversal (tablesize/2) based on the last hash code (the doc).
|
||||
// Rely on loop invariant code motion to eval ((doc>>7)|1) only once.
|
||||
// otherwise, we would need to pull the first case out of the loop.
|
||||
s = (s + ((doc>>7)|1)) & mask;
|
||||
}
|
||||
s=start;
|
||||
while (--s>=0) {
|
||||
v=table[s];
|
||||
if (v==EMPTY || v==val) return s;
|
||||
table[s]=doc;
|
||||
}
|
||||
|
||||
public boolean exists(int doc) {
|
||||
int s = doc & mask;
|
||||
for(;;) {
|
||||
int v = table[s];
|
||||
if (v==EMPTY) return false;
|
||||
if (v==doc) return true;
|
||||
// see put() for algorithm details.
|
||||
s = (s + ((doc>>7)|1)) & mask;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @return The number of document ids in the set.
|
||||
*/
|
||||
public int size() {
|
||||
return size;
|
||||
}
|
||||
|
||||
public boolean exists(int docid) {
|
||||
int v = table[docid & mask];
|
||||
if (v==EMPTY) return false;
|
||||
else if (v==docid) return true;
|
||||
else {
|
||||
v = table[rehash(docid)];
|
||||
if (v==docid) return true;
|
||||
else return false;
|
||||
}
|
||||
}
|
||||
|
||||
public DocIterator iterator() {
|
||||
return new DocIterator() {
|
||||
int pos=0;
|
||||
|
@ -180,7 +111,7 @@ public final class HashDocSet extends DocSetBase {
|
|||
{ goNext(); }
|
||||
|
||||
public boolean hasNext() {
|
||||
return pos < tablesize;
|
||||
return pos < table.length;
|
||||
}
|
||||
|
||||
public Integer next() {
|
||||
|
@ -191,7 +122,7 @@ public final class HashDocSet extends DocSetBase {
|
|||
}
|
||||
|
||||
void goNext() {
|
||||
while (pos<tablesize && table[pos]==EMPTY) pos++;
|
||||
while (pos<table.length && table[pos]==EMPTY) pos++;
|
||||
}
|
||||
|
||||
// modify to return -1 at end of iteration?
|
||||
|
@ -208,9 +139,8 @@ public final class HashDocSet extends DocSetBase {
|
|||
};
|
||||
}
|
||||
|
||||
|
||||
public long memSize() {
|
||||
return (tablesize<<2) + 20;
|
||||
return (table.length<<2) + 20;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -276,6 +206,48 @@ public final class HashDocSet extends DocSetBase {
|
|||
}
|
||||
|
||||
|
||||
@Override
|
||||
public DocSet andNot(DocSet other) {
|
||||
int[] result = new int[size()];
|
||||
int resultCount=0;
|
||||
|
||||
for (int i=0; i<table.length; i++) {
|
||||
int id=table[i];
|
||||
if (id >= 0 && !other.exists(id)) {
|
||||
result[resultCount++]=id;
|
||||
}
|
||||
}
|
||||
return new HashDocSet(result,0,resultCount);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocSet union(DocSet other) {
|
||||
if (other instanceof HashDocSet) {
|
||||
// set "a" to the smallest doc set
|
||||
final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other;
|
||||
final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this;
|
||||
|
||||
int[] result = new int[a.size()+b.size()];
|
||||
int resultCount=0;
|
||||
// iterate over the largest table first, adding w/o checking.
|
||||
for (int i=0; i<b.table.length; i++) {
|
||||
int id=b.table[i];
|
||||
if (id>=0) result[resultCount++]=id;
|
||||
}
|
||||
|
||||
// now iterate over smaller set, adding all not already in larger set.
|
||||
for (int i=0; i<a.table.length; i++) {
|
||||
int id=a.table[i];
|
||||
if (id>=0 && !b.exists(id)) result[resultCount++]=id;
|
||||
}
|
||||
|
||||
return new HashDocSet(result,0,resultCount);
|
||||
} else {
|
||||
return other.union(this);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// don't implement andNotSize() and unionSize() on purpose... they are implemented
|
||||
// in BaseDocSet in terms of intersectionSize().
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Random;
|
|||
|
||||
import org.apache.solr.util.OpenBitSet;
|
||||
import org.apache.solr.util.BitSetIterator;
|
||||
import org.apache.solr.util.BitUtil;
|
||||
|
||||
/**
|
||||
* @author yonik
|
||||
|
@ -30,6 +31,7 @@ import org.apache.solr.util.BitSetIterator;
|
|||
*/
|
||||
public class TestDocSet extends TestCase {
|
||||
Random rand = new Random();
|
||||
float loadfactor;
|
||||
|
||||
public OpenBitSet getRandomSet(int sz, int bitsToSet) {
|
||||
OpenBitSet bs = new OpenBitSet(sz);
|
||||
|
@ -105,4 +107,134 @@ public class TestDocSet extends TestCase {
|
|||
doMany(300, 5000);
|
||||
}
|
||||
|
||||
|
||||
public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
|
||||
int n = rand.nextInt(maxSetSize);
|
||||
OpenBitSet obs = new OpenBitSet(maxDoc);
|
||||
int[] a = new int[n];
|
||||
for (int i=0; i<n; i++) {
|
||||
for(;;) {
|
||||
int idx = rand.nextInt(maxDoc);
|
||||
if (obs.getAndSet(idx)) continue;
|
||||
a[i]=idx;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
|
||||
}
|
||||
|
||||
public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
|
||||
DocSet[] sets = new DocSet[nSets];
|
||||
|
||||
for (int i=0; i<nSets; i++) {
|
||||
sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
|
||||
}
|
||||
|
||||
return sets;
|
||||
}
|
||||
|
||||
/**** needs code insertion into HashDocSet
|
||||
public void testCollisions() {
|
||||
loadfactor=.75f;
|
||||
rand=new Random(12345); // make deterministic
|
||||
int maxSetsize=4000;
|
||||
int nSets=256;
|
||||
int iter=1;
|
||||
int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
|
||||
int ret=0;
|
||||
long start=System.currentTimeMillis();
|
||||
for (int maxDoc : maxDocs) {
|
||||
int cstart = HashDocSet.collisions;
|
||||
DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
|
||||
for (DocSet s1 : sets) {
|
||||
for (DocSet s2 : sets) {
|
||||
if (s1!=s2) ret += s1.intersectionSize(s2);
|
||||
}
|
||||
}
|
||||
int cend = HashDocSet.collisions;
|
||||
System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
|
||||
}
|
||||
long end=System.currentTimeMillis();
|
||||
System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
|
||||
if (ret==-1)System.out.println("wow!");
|
||||
System.out.println("collisions="+HashDocSet.collisions);
|
||||
|
||||
}
|
||||
***/
|
||||
|
||||
/***
|
||||
public void testIntersectionSizePerformance() {
|
||||
loadfactor=.75f;
|
||||
rand=new Random(12345); // make deterministic
|
||||
int maxSetsize=4000;
|
||||
int nSets=128;
|
||||
int iter=10;
|
||||
int maxDoc=1000000;
|
||||
DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
|
||||
int ret=0;
|
||||
long start=System.currentTimeMillis();
|
||||
for (int i=0; i<iter; i++) {
|
||||
for (DocSet s1 : sets) {
|
||||
for (DocSet s2 : sets) {
|
||||
ret += s1.intersectionSize(s2);
|
||||
}
|
||||
}
|
||||
}
|
||||
long end=System.currentTimeMillis();
|
||||
System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
|
||||
if (ret==-1)System.out.println("wow!");
|
||||
}
|
||||
|
||||
|
||||
public void testExistsPerformance() {
|
||||
loadfactor=.75f;
|
||||
rand=new Random(12345); // make deterministic
|
||||
int maxSetsize=4000;
|
||||
int nSets=512;
|
||||
int iter=1;
|
||||
int maxDoc=1000000;
|
||||
DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
|
||||
int ret=0;
|
||||
long start=System.currentTimeMillis();
|
||||
for (int i=0; i<iter; i++) {
|
||||
for (DocSet s1 : sets) {
|
||||
for (int j=0; j<maxDoc; j++) {
|
||||
ret += s1.exists(j) ? 1 :0;
|
||||
}
|
||||
}
|
||||
}
|
||||
long end=System.currentTimeMillis();
|
||||
System.out.println("testExistsSizePerformance="+(end-start)+" ms");
|
||||
if (ret==-1)System.out.println("wow!");
|
||||
}
|
||||
***/
|
||||
|
||||
/**** needs code insertion into HashDocSet
|
||||
public void testExistsCollisions() {
|
||||
loadfactor=.75f;
|
||||
rand=new Random(12345); // make deterministic
|
||||
int maxSetsize=4000;
|
||||
int nSets=512;
|
||||
int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
|
||||
int ret=0;
|
||||
|
||||
for (int maxDoc : maxDocs) {
|
||||
int mask = (BitUtil.nextHighestPowerOfTwo(maxDoc)>>1)-1;
|
||||
DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
|
||||
int cstart = HashDocSet.collisions;
|
||||
for (DocSet s1 : sets) {
|
||||
for (int j=0; j<maxDocs[0]; j++) {
|
||||
int idx = rand.nextInt()&mask;
|
||||
ret += s1.exists(idx) ? 1 :0;
|
||||
}
|
||||
}
|
||||
int cend = HashDocSet.collisions;
|
||||
System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
|
||||
}
|
||||
if (ret==-1)System.out.println("wow!");
|
||||
System.out.println("collisions="+HashDocSet.collisions);
|
||||
}
|
||||
***/
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue