From 33ad0e6975ee958a233c681fc4448254bf3f9466 Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Sun, 21 Jan 2007 05:46:31 +0000 Subject: [PATCH] HashDocSet new hash, union, andNot: SOLR-114 git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@498246 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 +- .../org/apache/solr/search/BitDocSet.java | 25 +++ .../org/apache/solr/search/HashDocSet.java | 172 ++++++++---------- .../org/apache/solr/search/TestDocSet.java | 132 ++++++++++++++ 4 files changed, 233 insertions(+), 101 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 49fc177d477..69f214cc833 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -52,7 +52,10 @@ Changes in runtime behavior user query, not boost or filter queries (klaas). Optimizations - 1. + 1. SOLR-114: HashDocSet specific implementations of union() and andNot() + for a 20x performance improvement for those set operations, and a new + hash algorithm speeds up exists() by 10% and intersectionSize() by 8%. + (yonik) Bug Fixes 1. 
SOLR-87: Parsing of synonym files did not correctly handle escaped diff --git a/src/java/org/apache/solr/search/BitDocSet.java b/src/java/org/apache/solr/search/BitDocSet.java index fa685d53d4d..903b577d6b6 100644 --- a/src/java/org/apache/solr/search/BitDocSet.java +++ b/src/java/org/apache/solr/search/BitDocSet.java @@ -177,6 +177,31 @@ public class BitDocSet extends DocSetBase { } } + @Override + public DocSet andNot(DocSet other) { + OpenBitSet newbits = (OpenBitSet)(bits.clone()); + if (other instanceof BitDocSet) { + newbits.andNot(((BitDocSet)other).bits); + } else { + DocIterator iter = other.iterator(); + while (iter.hasNext()) newbits.clear(iter.nextDoc()); + } + return new BitDocSet(newbits); + } + + @Override + public DocSet union(DocSet other) { + OpenBitSet newbits = (OpenBitSet)(bits.clone()); + if (other instanceof BitDocSet) { + newbits.union(((BitDocSet)other).bits); + } else { + DocIterator iter = other.iterator(); + while (iter.hasNext()) newbits.set(iter.nextDoc()); + } + return new BitDocSet(newbits); + } + + public long memSize() { return (bits.getBits().length << 3) + 16; } diff --git a/src/java/org/apache/solr/search/HashDocSet.java b/src/java/org/apache/solr/search/HashDocSet.java index fc660892618..4b4e8344cc5 100644 --- a/src/java/org/apache/solr/search/HashDocSet.java +++ b/src/java/org/apache/solr/search/HashDocSet.java @@ -31,7 +31,6 @@ import org.apache.solr.util.BitUtil; * @since solr 0.9 */ public final class HashDocSet extends DocSetBase { - // final static float inverseLoadfactor = 1.0f / SolrConfig.config.getFloat("//HashDocSet/@loadFactor",0.75f); /** Default load factor to use for HashDocSets. We keep track of the inverse * since multiplication is so much faster than division. 
The default * is 1.0f / 0.75f @@ -45,27 +44,27 @@ public final class HashDocSet extends DocSetBase { // an alternative to having to init the array to EMPTY at the start is // private final static int EMPTY=-1; - private final int tablesize; private final int[] table; private final int size; private final int mask; + /** Create a HashDocSet from a list of *unique* ids */ public HashDocSet(int[] docs, int offset, int len) { this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR); } + /** Create a HashDocSet from a list of *unique* ids */ public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) { int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1); if (tsize < len * inverseLoadFactor) { tsize <<= 1; } - tablesize = tsize; - mask=tablesize-1; + mask=tsize-1; - table = new int[tablesize]; - for (int i=0; i=0; i--) table[i]=EMPTY; for (int i=offset; i>8); - s = h & mask; - v = table[s]; - if (v==EMPTY || v==val) return s; - - h ^= (v << 17) | (comp >>> 16); // this is reversible - s = h & mask; - v = table[s]; - if (v==EMPTY || v==val) return s; - - h ^= (h << 8) | (comp >>> 25); // this is reversible - s = h & mask; - v = table[s]; - if (v==EMPTY || v==val) return s; - - /********************** - // Knuth, Thomas Wang, http://www.concentric.net/~Ttwang/tech/inthash.htm - // This magic number has no common factors with 2^32, and magic/(2^32) approximates - // the golden ratio. - private static final int magic = (int)2654435761L; - - h = magic*val; - s = h & mask; - v=table[s]; - if (v==EMPTY || v==val) return s; - - // the mult with magic should have thoroughly mixed the bits. - // add entropy to the right half from the left half. - h ^= h>>>16; - s = h & mask; - v=table[s]; - if (v==EMPTY || v==val) return s; - *************************/ - - // linear scan now... ug. - final int start=s; - while (++s>7)|1) only once. + // otherwise, we would need to pull the first case out of the loop. 
+ s = (s + ((doc>>7)|1)) & mask; } - s=start; - while (--s>=0) { - v=table[s]; - if (v==EMPTY || v==val) return s; + table[s]=doc; + } + + public boolean exists(int doc) { + int s = doc & mask; + for(;;) { + int v = table[s]; + if (v==EMPTY) return false; + if (v==doc) return true; + // see put() for algorithm details. + s = (s + ((doc>>7)|1)) & mask; } - return s; } - /** - * - * @return The number of document ids in the set. - */ public int size() { return size; } - public boolean exists(int docid) { - int v = table[docid & mask]; - if (v==EMPTY) return false; - else if (v==docid) return true; - else { - v = table[rehash(docid)]; - if (v==docid) return true; - else return false; - } - } - public DocIterator iterator() { return new DocIterator() { int pos=0; @@ -180,7 +111,7 @@ public final class HashDocSet extends DocSetBase { { goNext(); } public boolean hasNext() { - return pos < tablesize; + return pos < table.length; } public Integer next() { @@ -191,7 +122,7 @@ public final class HashDocSet extends DocSetBase { } void goNext() { - while (pos= 0 && !other.exists(id)) { + result[resultCount++]=id; + } + } + return new HashDocSet(result,0,resultCount); + } + + @Override + public DocSet union(DocSet other) { + if (other instanceof HashDocSet) { + // set "a" to the smallest doc set + final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other; + final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this; + + int[] result = new int[a.size()+b.size()]; + int resultCount=0; + // iterate over the largest table first, adding w/o checking. + for (int i=0; i=0) result[resultCount++]=id; + } + + // now iterate over smaller set, adding all not already in larger set. 
+ for (int i=0; i=0 && !b.exists(id)) result[resultCount++]=id; + } + + return new HashDocSet(result,0,resultCount); + } else { + return other.union(this); + } } diff --git a/src/test/org/apache/solr/search/TestDocSet.java b/src/test/org/apache/solr/search/TestDocSet.java index 9855378fe30..35c6099ff76 100644 --- a/src/test/org/apache/solr/search/TestDocSet.java +++ b/src/test/org/apache/solr/search/TestDocSet.java @@ -23,6 +23,7 @@ import java.util.Random; import org.apache.solr.util.OpenBitSet; import org.apache.solr.util.BitSetIterator; +import org.apache.solr.util.BitUtil; /** * @author yonik @@ -30,6 +31,7 @@ import org.apache.solr.util.BitSetIterator; */ public class TestDocSet extends TestCase { Random rand = new Random(); + float loadfactor; public OpenBitSet getRandomSet(int sz, int bitsToSet) { OpenBitSet bs = new OpenBitSet(sz); @@ -105,4 +107,134 @@ public class TestDocSet extends TestCase { doMany(300, 5000); } + + public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) { + int n = rand.nextInt(maxSetSize); + OpenBitSet obs = new OpenBitSet(maxDoc); + int[] a = new int[n]; + for (int i=0; i>1)-1; + DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc); + int cstart = HashDocSet.collisions; + for (DocSet s1 : sets) { + for (int j=0; j