mirror of https://github.com/apache/lucene.git

commit 33ad0e6975 (parent 2135986f8d)

HashDocSet new hash, union, andNot: SOLR-114

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@498246 13f79535-47bb-0310-9956-ffa450edef68
@@ -52,7 +52,10 @@ Changes in runtime behavior
     user query, not boost or filter queries (klaas).
 
 Optimizations
- 1.
+ 1. SOLR-114: HashDocSet specific implementations of union() and andNot()
+    for a 20x performance improvement for those set operations, and a new
+    hash algorithm speeds up exists() by 10% and intersectionSize() by 8%.
+    (yonik)
 
 Bug Fixes
  1. SOLR-87: Parsing of synonym files did not correctly handle escaped
@@ -177,6 +177,31 @@ public class BitDocSet extends DocSetBase {
     }
   }
 
+  @Override
+  public DocSet andNot(DocSet other) {
+    OpenBitSet newbits = (OpenBitSet)(bits.clone());
+    if (other instanceof BitDocSet) {
+      newbits.andNot(((BitDocSet)other).bits);
+    } else {
+      DocIterator iter = other.iterator();
+      while (iter.hasNext()) newbits.clear(iter.nextDoc());
+    }
+    return new BitDocSet(newbits);
+  }
+
+  @Override
+  public DocSet union(DocSet other) {
+    OpenBitSet newbits = (OpenBitSet)(bits.clone());
+    if (other instanceof BitDocSet) {
+      newbits.union(((BitDocSet)other).bits);
+    } else {
+      DocIterator iter = other.iterator();
+      while (iter.hasNext()) newbits.set(iter.nextDoc());
+    }
+    return new BitDocSet(newbits);
+  }
+
   public long memSize() {
     return (bits.getBits().length << 3) + 16;
   }
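Both new BitDocSet methods follow one clone-then-modify pattern: copy the
receiver's bits, then either apply a single bulk bit operation when the other
set is also bit-backed, or walk the other set's DocIterator and flip bits one
at a time. A minimal, self-contained sketch of that pattern, using
java.util.BitSet in place of Solr's OpenBitSet so it runs without Solr on the
classpath (class and variable names here are illustrative only):

    import java.util.BitSet;

    public class BitSetOpsDemo {
      public static void main(String[] args) {
        BitSet a = new BitSet();
        a.set(1); a.set(3); a.set(5);
        BitSet b = new BitSet();
        b.set(3); b.set(4);

        // union: clone the receiver, then OR in the other set's bits
        BitSet union = (BitSet) a.clone();
        union.or(b);
        System.out.println("union  = " + union);   // {1, 3, 4, 5}

        // andNot: clone the receiver, then clear every bit set in the other
        BitSet andNot = (BitSet) a.clone();
        andNot.andNot(b);
        System.out.println("andNot = " + andNot);  // {1, 5}
      }
    }

BitSet.or and BitSet.andNot stand in for OpenBitSet.union and
OpenBitSet.andNot; the iterator fallback in the diff exists because a DocSet
argument is not guaranteed to be bit-backed.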
@@ -31,7 +31,6 @@ import org.apache.solr.util.BitUtil;
  * @since solr 0.9
  */
 public final class HashDocSet extends DocSetBase {
-  // final static float inverseLoadfactor = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f);
   /** Default load factor to use for HashDocSets. We keep track of the inverse
    * since multiplication is so much faster than division. The default
    * is 1.0f / 0.75f
@@ -45,27 +44,27 @@ public final class HashDocSet extends DocSetBase {
   // an alternative to having to init the array to EMPTY at the start is
   //
   private final static int EMPTY=-1;
-  private final int tablesize;
   private final int[] table;
   private final int size;
 
   private final int mask;
 
+  /** Create a HashDocSet from a list of *unique* ids */
   public HashDocSet(int[] docs, int offset, int len) {
     this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
   }
 
+  /** Create a HashDocSet from a list of *unique* ids */
   public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
     int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
     if (tsize < len * inverseLoadFactor) {
       tsize <<= 1;
     }
 
-    tablesize = tsize;
-    mask=tablesize-1;
+    mask=tsize-1;
 
-    table = new int[tablesize];
-    for (int i=0; i<tablesize; i++) table[i]=EMPTY;
+    table = new int[tsize];
+    for (int i=tsize-1; i>=0; i--) table[i]=EMPTY;
 
     for (int i=offset; i<len; i++) {
       put(docs[i]);
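The constructor sizes the open-addressed table so its load factor stays at or
below 0.75: round len up to the next power of two, then double once more if
that is still smaller than len times the inverse load factor. A standalone
sketch of that sizing rule, where Integer.highestOneBit stands in for Solr's
BitUtil.nextHighestPowerOfTwo (an assumption; any next-power-of-two helper
behaves the same):

    public class TableSizeDemo {
      // round v up to the nearest power of two (>= 1)
      static int nextHighestPowerOfTwo(int v) {
        return v <= 1 ? 1 : Integer.highestOneBit(v - 1) << 1;
      }

      public static void main(String[] args) {
        float inverseLoadFactor = 1.0f / 0.75f;  // DEFAULT_INVERSE_LOAD_FACTOR
        for (int len : new int[]{1, 3, 6, 100, 1000}) {
          int tsize = Math.max(nextHighestPowerOfTwo(len), 1);
          if (tsize < len * inverseLoadFactor) {
            tsize <<= 1;  // too full at this size; double the table
          }
          System.out.println("len=" + len + " -> tablesize=" + tsize);
        }
      }
    }

Keeping the size a power of two is what lets the class replace a modulo with
the cheaper (doc & mask) in put() and exists().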
@@ -75,104 +74,36 @@ public final class HashDocSet extends DocSetBase {
     }
   }
 
   void put(int doc) {
-    table[getSlot(doc)]=doc;
+    int s = doc & mask;
+    while (table[s]!=EMPTY) {
+      // Adding an odd number to this power-of-two hash table is
+      // guaranteed to do a full traversal, so instead of re-hashing
+      // we jump straight to a "linear" traversal.
+      // The key is that we provide many different ways to do the
+      // traversal (tablesize/2) based on the last hash code (the doc).
+      // Rely on loop invariant code motion to eval ((doc>>7)|1) only once.
+      // otherwise, we would need to pull the first case out of the loop.
+      s = (s + ((doc>>7)|1)) & mask;
+    }
+    table[s]=doc;
   }
 
-  private int getSlot(int val) {
-    int s,v;
-    s=val & mask;
-    v=table[s];
-    // check for EMPTY first since that value is more likely
-    if (v==EMPTY || v==val) return s;
-    s=rehash(val);
-    return s;
+  public boolean exists(int doc) {
+    int s = doc & mask;
+    for(;;) {
+      int v = table[s];
+      if (v==EMPTY) return false;
+      if (v==doc) return true;
+      // see put() for algorithm details.
+      s = (s + ((doc>>7)|1)) & mask;
+    }
   }
 
-  // As the size of this int hashtable is expected to be small
-  // (thousands at most), I did not try to keep the rehash function
-  // reversible (important to avoid collisions in large hash tables).
-  private int rehash(int val) {
-    int h,s,v;
-    final int comp=~val;
-
-    // don't left shift too far... the only bits
-    // that count in the answer are the ones on the right.
-    // We want to put more of the bits on the left
-    // into the answer.
-    // Keep small tables in mind. We may be only using
-    // the first 5 or 6 bits.
-
-    // on the first rehash, use complement instead of val to shift
-    // so we don't end up with 0 again if val==0.
-    h = val ^ (comp>>8);
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    h ^= (v << 17) | (comp >>> 16); // this is reversible
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    h ^= (h << 8) | (comp >>> 25); // this is reversible
-    s = h & mask;
-    v = table[s];
-    if (v==EMPTY || v==val) return s;
-
-    /**********************
-    // Knuth, Thomas Wang, http://www.concentric.net/~Ttwang/tech/inthash.htm
-    // This magic number has no common factors with 2^32, and magic/(2^32) approximates
-    // the golden ratio.
-    private static final int magic = (int)2654435761L;
-
-    h = magic*val;
-    s = h & mask;
-    v=table[s];
-    if (v==EMPTY || v==val) return s;
-
-    // the mult with magic should have thoroughly mixed the bits.
-    // add entropy to the right half from the left half.
-    h ^= h>>>16;
-    s = h & mask;
-    v=table[s];
-    if (v==EMPTY || v==val) return s;
-    *************************/
-
-    // linear scan now... ug.
-    final int start=s;
-    while (++s<tablesize) {
-      v=table[s];
-      if (v==EMPTY || v==val) return s;
-    }
-    s=start;
-    while (--s>=0) {
-      v=table[s];
-      if (v==EMPTY || v==val) return s;
-    }
-    return s;
-  }
-
-  /**
-   *
-   * @return The number of document ids in the set.
-   */
   public int size() {
     return size;
   }
 
-  public boolean exists(int docid) {
-    int v = table[docid & mask];
-    if (v==EMPTY) return false;
-    else if (v==docid) return true;
-    else {
-      v = table[rehash(docid)];
-      if (v==docid) return true;
-      else return false;
-    }
-  }
-
   public DocIterator iterator() {
     return new DocIterator() {
       int pos=0;
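The probe step in put() relies on a number-theoretic fact: in a table whose
size is a power of two, stepping by any odd increment visits every slot
exactly once before the sequence repeats, because an odd step is coprime to
the table size. ((doc>>7)|1) is always odd, so the loop must reach an EMPTY
slot and cannot cycle forever. A standalone check of that claim (all names
are illustrative):

    public class ProbeSequenceDemo {
      public static void main(String[] args) {
        int tablesize = 16;             // must be a power of two
        int mask = tablesize - 1;
        int doc = 12345;
        int step = (doc >> 7) | 1;      // always odd, as in HashDocSet.put()

        boolean[] seen = new boolean[tablesize];
        int s = doc & mask;
        for (int i = 0; i < tablesize; i++) {
          seen[s] = true;
          s = (s + step) & mask;        // same probe update as put()/exists()
        }

        boolean visitedAll = true;
        for (boolean b : seen) visitedAll &= b;
        System.out.println("visited all slots: " + visitedAll);  // true
      }
    }

Because the step is derived from the doc id itself, different ids tend to
follow different traversal orders, which is what lets this scheme replace the
removed rehash() chain.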
@@ -180,7 +111,7 @@ public final class HashDocSet extends DocSetBase {
       { goNext(); }
 
       public boolean hasNext() {
-        return pos < tablesize;
+        return pos < table.length;
       }
 
       public Integer next() {
@@ -191,7 +122,7 @@ public final class HashDocSet extends DocSetBase {
       }
 
       void goNext() {
-        while (pos<tablesize && table[pos]==EMPTY) pos++;
+        while (pos<table.length && table[pos]==EMPTY) pos++;
       }
 
       // modify to return -1 at end of iteration?
@@ -208,9 +139,8 @@ public final class HashDocSet extends DocSetBase {
     };
   }
 
-
   public long memSize() {
-    return (tablesize<<2) + 20;
+    return (table.length<<2) + 20;
   }
 
   @Override
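The memSize() estimate is arithmetic, not measurement: four bytes per int
slot (table.length << 2) plus a flat 20 bytes approximating object and array
header overhead (the commit's own constant; actual JVM overhead varies). A
quick check of the math:

    public class MemSizeDemo {
      public static void main(String[] args) {
        int tableLength = 1024;
        long bytes = ((long) tableLength << 2) + 20;
        System.out.println(bytes);  // 4116 = 4096 data bytes + 20 overhead
      }
    }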
@@ -276,6 +206,48 @@ public final class HashDocSet extends DocSetBase {
   }
 
 
+  @Override
+  public DocSet andNot(DocSet other) {
+    int[] result = new int[size()];
+    int resultCount=0;
+
+    for (int i=0; i<table.length; i++) {
+      int id=table[i];
+      if (id >= 0 && !other.exists(id)) {
+        result[resultCount++]=id;
+      }
+    }
+    return new HashDocSet(result,0,resultCount);
+  }
+
+  @Override
+  public DocSet union(DocSet other) {
+    if (other instanceof HashDocSet) {
+      // set "a" to the smallest doc set
+      final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other;
+      final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this;
+
+      int[] result = new int[a.size()+b.size()];
+      int resultCount=0;
+      // iterate over the largest table first, adding w/o checking.
+      for (int i=0; i<b.table.length; i++) {
+        int id=b.table[i];
+        if (id>=0) result[resultCount++]=id;
+      }
+
+      // now iterate over smaller set, adding all not already in larger set.
+      for (int i=0; i<a.table.length; i++) {
+        int id=a.table[i];
+        if (id>=0 && !b.exists(id)) result[resultCount++]=id;
+      }
+
+      return new HashDocSet(result,0,resultCount);
+    } else {
+      return other.union(this);
+    }
+  }
+
+
   // don't implement andNotSize() and unionSize() on purpose... they are implemented
   // in BaseDocSet in terms of intersectionSize().
 }
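HashDocSet.union() is deliberately asymmetric: every id of the larger set is
copied without any membership test, and only the smaller set pays for
exists() probes, so lookup work is bounded by the smaller set's size. A
minimal sketch of the same strategy using java.util.HashSet in place of
HashDocSet (names illustrative):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    public class UnionDemo {
      static Set<Integer> union(Set<Integer> x, Set<Integer> y) {
        Set<Integer> small = x.size() <= y.size() ? x : y;
        Set<Integer> large = (small == x) ? y : x;

        Set<Integer> result = new HashSet<>(large);  // bulk-copy larger set
        for (int id : small) {
          // HashDocSet needs this test because it appends unique ids to a
          // flat int[]; for HashSet the add alone would already dedupe.
          if (!large.contains(id)) result.add(id);
        }
        return result;
      }

      public static void main(String[] args) {
        Set<Integer> a = new HashSet<>(Arrays.asList(1, 2, 3));
        Set<Integer> b = new HashSet<>(Arrays.asList(3, 4));
        System.out.println(union(a, b));  // [1, 2, 3, 4]
      }
    }

andNot() needs no such trick: it can only shrink the receiver, so one pass
over this set with an exists() probe into the other is already minimal.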
@@ -23,6 +23,7 @@ import java.util.Random;
 import org.apache.solr.util.OpenBitSet;
 import org.apache.solr.util.BitSetIterator;
+import org.apache.solr.util.BitUtil;
 
 /**
  * @author yonik
@@ -30,6 +31,7 @@ import org.apache.solr.util.BitSetIterator;
  */
 public class TestDocSet extends TestCase {
   Random rand = new Random();
+  float loadfactor;
 
   public OpenBitSet getRandomSet(int sz, int bitsToSet) {
     OpenBitSet bs = new OpenBitSet(sz);
@@ -105,4 +107,134 @@ public class TestDocSet extends TestCase {
     doMany(300, 5000);
   }
 
+
+  public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
+    int n = rand.nextInt(maxSetSize);
+    OpenBitSet obs = new OpenBitSet(maxDoc);
+    int[] a = new int[n];
+    for (int i=0; i<n; i++) {
+      for(;;) {
+        int idx = rand.nextInt(maxDoc);
+        if (obs.getAndSet(idx)) continue;
+        a[i]=idx;
+        break;
+      }
+    }
+    return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
+  }
+
+  public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
+    DocSet[] sets = new DocSet[nSets];
+
+    for (int i=0; i<nSets; i++) {
+      sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
+    }
+
+    return sets;
+  }
+
+  /**** needs code insertion into HashDocSet
+  public void testCollisions() {
+    loadfactor=.75f;
+    rand=new Random(12345); // make deterministic
+    int maxSetsize=4000;
+    int nSets=256;
+    int iter=1;
+    int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int maxDoc : maxDocs) {
+      int cstart = HashDocSet.collisions;
+      DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+      for (DocSet s1 : sets) {
+        for (DocSet s2 : sets) {
+          if (s1!=s2) ret += s1.intersectionSize(s2);
+        }
+      }
+      int cend = HashDocSet.collisions;
+      System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+    System.out.println("collisions="+HashDocSet.collisions);
+  }
+  ***/
+
+  /***
+  public void testIntersectionSizePerformance() {
+    loadfactor=.75f;
+    rand=new Random(12345); // make deterministic
+    int maxSetsize=4000;
+    int nSets=128;
+    int iter=10;
+    int maxDoc=1000000;
+    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int i=0; i<iter; i++) {
+      for (DocSet s1 : sets) {
+        for (DocSet s2 : sets) {
+          ret += s1.intersectionSize(s2);
+        }
+      }
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+  }
+
+
+  public void testExistsPerformance() {
+    loadfactor=.75f;
+    rand=new Random(12345); // make deterministic
+    int maxSetsize=4000;
+    int nSets=512;
+    int iter=1;
+    int maxDoc=1000000;
+    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+    int ret=0;
+    long start=System.currentTimeMillis();
+    for (int i=0; i<iter; i++) {
+      for (DocSet s1 : sets) {
+        for (int j=0; j<maxDoc; j++) {
+          ret += s1.exists(j) ? 1 : 0;
+        }
+      }
+    }
+    long end=System.currentTimeMillis();
+    System.out.println("testExistsSizePerformance="+(end-start)+" ms");
+    if (ret==-1)System.out.println("wow!");
+  }
+  ***/
+
+  /**** needs code insertion into HashDocSet
+  public void testExistsCollisions() {
+    loadfactor=.75f;
+    rand=new Random(12345); // make deterministic
+    int maxSetsize=4000;
+    int nSets=512;
+    int[] maxDocs=new int[] {100000,500000,1000000,5000000,10000000};
+    int ret=0;
+
+    for (int maxDoc : maxDocs) {
+      int mask = (BitUtil.nextHighestPowerOfTwo(maxDoc)>>1)-1;
+      DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+      int cstart = HashDocSet.collisions;
+      for (DocSet s1 : sets) {
+        for (int j=0; j<maxDocs[0]; j++) {
+          int idx = rand.nextInt()&mask;
+          ret += s1.exists(idx) ? 1 : 0;
+        }
+      }
+      int cend = HashDocSet.collisions;
+      System.out.println("maxDoc="+maxDoc+"\tcollisions="+(cend-cstart));
+    }
+    if (ret==-1)System.out.println("wow!");
+    System.out.println("collisions="+HashDocSet.collisions);
+  }
+  ***/
+
+
 }