SOLR-1169: SortedIntDocSet

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@776750 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2009-05-20 16:11:11 +00:00
parent d161e54136
commit 06f159e3fb
7 changed files with 626 additions and 81 deletions

View File

@ -254,6 +254,11 @@ Optimizations
10. SOLR-1166: Speed up docset/filter generation by avoiding top-level
score() call and iterating over leaf readers with TermDocs. (yonik)
11. SOLR-1169: SortedIntDocSet - a new small set implementation
that saves memory over HashDocSet, is faster to construct,
is ordered for easier implementation of skipTo, and is faster
in the general case. (yonik)
Bug Fixes
----------------------

View File

@ -306,12 +306,6 @@
queryResultCache. -->
<queryResultMaxDocsCached>200</queryResultMaxDocsCached>
<!-- This entry enables an int hash representation for filters (DocSets)
when the number of items in the set is less than maxSize. For smaller
sets, this representation is more memory efficient, more efficient to
iterate over, and faster to take intersections. -->
<HashDocSet maxSize="3000" loadFactor="0.75"/>
<!-- a newSearcher event is fired whenever a new searcher is being prepared
and there is a current searcher handling requests (aka registered). -->
<!-- QuerySenderListener takes an array of NamedList and executes a

View File

@ -189,10 +189,9 @@ abstract class DocSetBase implements DocSet {
};
public DocSet intersection(DocSet other) {
// intersection is overloaded in HashDocSet to be more
// efficient, so if "other" is a HashDocSet, dispatch off
// of it instead.
if (other instanceof HashDocSet) {
// intersection is overloaded in the smaller DocSets to be more
// efficient, so dispatch off of it instead.
if (!(other instanceof BitDocSet)) {
return other.intersection(this);
}
@ -209,10 +208,9 @@ abstract class DocSetBase implements DocSet {
}
public int intersectionSize(DocSet other) {
// intersectionSize is overloaded in HashDocSet to be more
// efficient, so if "other" is a HashDocSet, dispatch off
// of it instead.
if (other instanceof HashDocSet) {
// intersectionSize is overloaded in the smaller DocSets to be more
// efficient, so dispatch off of it instead.
if (!(other instanceof BitDocSet)) {
return other.intersectionSize(this);
}
// less efficient way: do the intersection, then get its size

View File

@ -30,31 +30,26 @@ import java.io.IOException;
*/
final class DocSetHitCollector extends HitCollector {
final float HASHSET_INVERSE_LOAD_FACTOR;
final int HASHDOCSET_MAXSIZE;
int pos=0;
OpenBitSet bits;
final int maxDoc;
final int smallSetSize;
// in case there aren't that many hits, we may not want a very sparse
// bit array. Optimistically collect the first few docs in an array
// in case there are only a few.
final int[] scratch;
// todo - could pass in bitset and an operation also...
DocSetHitCollector(float inverseLoadFactor, int maxSize, int maxDoc) {
DocSetHitCollector(int smallSetSize, int maxDoc) {
this.smallSetSize = smallSetSize;
this.maxDoc = maxDoc;
HASHSET_INVERSE_LOAD_FACTOR = inverseLoadFactor;
HASHDOCSET_MAXSIZE = maxSize;
scratch = new int[HASHDOCSET_MAXSIZE];
this.scratch = new int[smallSetSize];
}
public void collect(int doc, float score) {
// optimistically collect the first docs in an array
// in case the total number will be small enough to represent
// as a HashDocSet() instead...
// as a small set like SortedIntDocSet instead...
// Storing in this array will be quicker to convert
// than scanning through a potentially huge bit vector.
// FUTURE: when search methods all start returning docs in order, maybe
@ -73,7 +68,8 @@ final class DocSetHitCollector extends HitCollector {
public DocSet getDocSet() {
if (pos<=scratch.length) {
return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
// assumes docs were collected in sorted order!
return new SortedIntDocSet(scratch, pos);
} else {
// set the bits for ids that were collected in the array
for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);
@ -84,33 +80,27 @@ final class DocSetHitCollector extends HitCollector {
class DocSetCollector extends Collector {
final float HASHSET_INVERSE_LOAD_FACTOR;
final int HASHDOCSET_MAXSIZE;
int pos=0;
OpenBitSet bits;
final int maxDoc;
int base=0;
final int smallSetSize;
int base;
// in case there aren't that many hits, we may not want a very sparse
// bit array. Optimistically collect the first few docs in an array
// in case there are only a few.
final int[] scratch;
// todo - could pass in bitset and an operation also...
DocSetCollector(float inverseLoadFactor, int maxSize, int maxDoc) {
DocSetCollector(int smallSetSize, int maxDoc) {
this.smallSetSize = smallSetSize;
this.maxDoc = maxDoc;
HASHSET_INVERSE_LOAD_FACTOR = inverseLoadFactor;
HASHDOCSET_MAXSIZE = maxSize;
scratch = new int[HASHDOCSET_MAXSIZE];
this.scratch = new int[smallSetSize];
}
public void collect(int doc) {
doc += base;
// optimistically collect the first docs in an array
// in case the total number will be small enough to represent
// as a HashDocSet() instead...
// as a small set like SortedIntDocSet instead...
// Storing in this array will be quicker to convert
// than scanning through a potentially huge bit vector.
// FUTURE: when search methods all start returning docs in order, maybe
@ -129,7 +119,8 @@ class DocSetCollector extends Collector {
public DocSet getDocSet() {
if (pos<=scratch.length) {
return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
// assumes docs were collected in sorted order!
return new SortedIntDocSet(scratch, pos);
} else {
// set the bits for ids that were collected in the array
for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);
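// Sizing sketch (hypothetical numbers; mirrors how SolrIndexSearcher
// constructs these collectors in the next file): with maxDoc = 1000000,
// new DocSetCollector(maxDoc >> 6, maxDoc) gives a scratch array of
// 15625 ints (~62.5 KB); only when more docs match does collection
// spill over into an OpenBitSet (~125 KB at that maxDoc).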

View File

@ -82,9 +82,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
private final LuceneQueryOptimizer optimizer;
private final float HASHSET_INVERSE_LOAD_FACTOR;
private final int HASHDOCSET_MAXSIZE;
// map of generic caches - not synchronized since it's read-only after the constructor.
private final HashMap<String, SolrCache> cacheMap;
private static final HashMap<String, SolrCache> noGenericCaches=new HashMap<String,SolrCache>(0);
@ -186,10 +183,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
}
optimizer = solrConfig.filtOptEnabled ? new LuceneQueryOptimizer(solrConfig.filtOptCacheSize,solrConfig.filtOptThreshold) : null;
// for DocSets
HASHSET_INVERSE_LOAD_FACTOR = solrConfig.hashSetInverseLoadFactor;
HASHDOCSET_MAXSIZE = solrConfig.hashDocSetMaxSize;
fieldNames = r.getFieldNames(IndexReader.FieldOption.ALL);
}
@ -628,7 +621,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
// query must be positive
protected DocSet getDocSetNC(Query query, DocSet filter) throws IOException {
if (filter==null) {
DocSetCollector hc = new DocSetCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
DocSetCollector hc = new DocSetCollector(maxDoc()>>6, maxDoc());
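// maxDoc()>>6 == maxDoc/64: only result sets smaller than ~1.6% of the
// index stay as a small sorted-int set (TestDocSet notes /32 as the memory
// break-even but /64 as better for performance).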
if (query instanceof TermQuery) {
Term t = ((TermQuery)query).getTerm();
SolrIndexReader[] readers = reader.getLeafReaders();
@ -656,7 +649,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
} else {
// FUTURE: if the filter is sorted by docid, could use skipTo (SkipQueryFilter)
final DocSetCollector hc = new DocSetCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
final DocSetCollector hc = new DocSetCollector(maxDoc()>>6, maxDoc());
final DocSet filt = filter;
super.search(query, null, new Collector() {
int base = 0;
@ -1131,7 +1124,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
float maxScore;
int[] ids;
float[] scores;
final DocSetHitCollector setHC = new DocSetHitCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
final DocSetHitCollector setHC = new DocSetHitCollector(maxDoc()>>6, maxDoc());
final HitCollector collector = ( cmd.getTimeAllowed() > 0 ) ? new TimeLimitedCollector( setHC, cmd.getTimeAllowed() ) : setHC;
Query query = QueryUtils.makeQueryable(cmd.getQuery());

View File

@ -0,0 +1,498 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;
import org.apache.lucene.util.OpenBitSet;
/**
* <code>SortedIntDocSet</code> represents a sorted set of Lucene Document Ids.
*/
public class SortedIntDocSet extends DocSetBase {
protected final int[] docs;
public SortedIntDocSet(int[] docs) {
this.docs = docs;
}
public SortedIntDocSet(int[] docs, int len) {
this(shrink(docs,len));
}
public int[] getDocs() { return docs; }
public int size() { return docs.length; }
public long memSize() {
return (docs.length<<2)+8;
}
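// Rough break-even (illustrative, assuming 4-byte ints as above): n docs
// cost ~4n bytes here, while an OpenBitSet costs ~maxDoc/8 bytes regardless
// of cardinality, so this representation wins on memory up to n == maxDoc/32.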
public static int[] zeroInts = new int[0];
public static SortedIntDocSet zero = new SortedIntDocSet(zeroInts);
public static int[] shrink(int[] arr, int newSize) {
if (arr.length == newSize) return arr;
int[] newArr = new int[newSize];
System.arraycopy(arr, 0, newArr, 0, newSize);
return newArr;
}
public static int intersectionSize(int[] smallerSortedList, int[] biggerSortedList) {
final int a[] = smallerSortedList;
final int b[] = biggerSortedList;
// The next doc we are looking for will be much closer to the last position we tried
// than it will be to the midpoint between last and high... so probe ahead using
// a function of the ratio of the sizes of the sets.
int step = (b.length/a.length)+1;
// Since the majority of probes should be misses, we'll already be above the last probe
// and shouldn't need to move more than the step size on average to step over our target (and thus
// lower the upper bound a lot)... but if we don't go over our target, it's a big miss... so double it.
step = step + step;
// FUTURE: come up with a density such that target * density == likely position?
// then check step on one side or the other?
// (density could be cached in the DocSet)... length/maxDoc
// FUTURE: try partitioning like a sort algorithm. Pick the midpoint of the big
// array, find where that should be in the small array, and then recurse with
// the top and bottom half of both arrays until they are small enough to use
// a fallback intersection method.
// NOTE: I tried this and it worked, but it was actually slower than this current
// highly optimized approach.
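// Worked example (hypothetical sizes): with a.length=100 and b.length=1600,
// step = (1600/100)+1 = 17, doubled to 34. A probe at b[low+34] that lands
// at or above the current target drops the upper bound so the binary search
// below covers ~34 elements instead of the whole remaining range.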
int icount = 0;
int low = 0;
int max = b.length-1;
for (int i=0; i<a.length; i++) {
int doca = a[i];
int high = max;
int probe = low + step; // 40% improvement!
// short linear probe to see if we can drop the high pointer in one big jump.
if (probe<high) {
if (b[probe]>=doca) {
// success! we cut down the upper bound by a lot in one step!
high=probe;
} else {
// relative failure... we get to move the low pointer, but not by much
low=probe+1;
// reprobe worth it? it appears so!
probe = low + step;
if (probe<high) {
if (b[probe]>=doca) {
high=probe;
} else {
low=probe+1;
}
}
}
}
// binary search the rest of the way
while (low <= high) {
int mid = (low+high) >>> 1;
int docb = b[mid];
if (docb < doca) {
low = mid+1;
}
else if (docb > doca) {
high = mid-1;
}
else {
icount++;
low = mid+1; // found it, so start at next element
break;
}
}
// Didn't find it... low is now positioned on the insertion point,
// which is higher than what we were looking for, so continue using
// the same low point.
}
return icount;
}
public int intersectionSize(DocSet other) {
if (!(other instanceof SortedIntDocSet)) {
// assume other implementations are better at random access than we are,
// true of BitDocSet and HashDocSet.
int icount = 0;
for (int i=0; i<docs.length; i++) {
if (other.exists(docs[i])) icount++;
}
return icount;
}
// make "a" the smaller set.
int[] otherDocs = ((SortedIntDocSet)other).docs;
final int[] a = docs.length < otherDocs.length ? docs : otherDocs;
final int[] b = docs.length < otherDocs.length ? otherDocs : docs;
if (a.length==0) return 0;
// if b is 8 times bigger than a, use the modified binary search.
if ((b.length>>3) >= a.length) {
return intersectionSize(a,b);
}
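// (Rationale, loosely: the linear walk below costs O(|a|+|b|) comparisons,
// while the probing search above costs roughly O(|a| * log(|b|/|a|)), so
// the 8x ratio is about where the latter starts to pay off.)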
// if they are close in size, just do a linear walk of both.
int icount=0;
int i=0,j=0;
int doca=a[i],docb=b[j];
for(;;) {
// switch on the sign bit somehow? Hopefully the JVM is smart enough to just test once.
// Since set a is less dense than set b, doca is likely to be greater than docb, so
// check that case first. This resulted in a 13% speedup.
if (doca > docb) {
if (++j >= b.length) break;
docb=b[j];
} else if (doca < docb) {
if (++i >= a.length) break;
doca=a[i];
} else {
icount++;
if (++i >= a.length) break;
doca=a[i];
if (++j >= b.length) break;
docb=b[j];
}
}
return icount;
}
/** puts the intersection of a and b into the target array and returns the size */
public static int intersection(int a[], int lena, int b[], int lenb, int[] target) {
if (lena > lenb) {
int ti=lena; lena=lenb; lenb=ti;
int[] ta=a; a=b; b=ta;
}
if (lena==0) return 0;
// if b is 8 times bigger than a, use the modified binary search.
if ((lenb>>3) >= lena) {
return intersectionBinarySearch(a, lena, b, lenb, target);
}
int icount=0;
int i=0,j=0;
int doca=a[i],docb=b[j];
for(;;) {
if (doca > docb) {
if (++j >= lenb) break;
docb=b[j];
} else if (doca < docb) {
if (++i >= lena) break;
doca=a[i];
} else {
target[icount++] = doca;
if (++i >= lena) break;
doca=a[i];
if (++j >= lenb) break;
docb=b[j];
}
}
return icount;
}
/** Puts the intersection of a and b into the target array and returns the size.
* lena should be smaller than lenb */
protected static int intersectionBinarySearch(int[] a, int lena, int[] b, int lenb, int[] target) {
int step = (lenb/lena)+1;
step = step + step;
int icount = 0;
int low = 0;
int max = lenb-1;
for (int i=0; i<lena; i++) {
int doca = a[i];
int high = max;
int probe = low + step; // 40% improvement!
// short linear probe to see if we can drop the high pointer in one big jump.
if (probe<high) {
if (b[probe]>=doca) {
// success! we cut down the upper bound by a lot in one step!
high=probe;
} else {
// relative failure... we get to move the low pointer, but not by much
low=probe+1;
// reprobe worth it? it appears so!
probe = low + step;
if (probe<high) {
if (b[probe]>=doca) {
high=probe;
} else {
low=probe+1;
}
}
}
}
// binary search
while (low <= high) {
int mid = (low+high) >>> 1;
int docb = b[mid];
if (docb < doca) {
low = mid+1;
}
else if (docb > doca) {
high = mid-1;
}
else {
target[icount++] = doca;
low = mid+1; // found it, so start at next element
break;
}
}
// Didn't find it... low is now positioned on the insertion point,
// which is higher than what we were looking for, so continue using
// the same low point.
}
return icount;
}
@Override
public DocSet intersection(DocSet other) {
if (!(other instanceof SortedIntDocSet)) {
int icount = 0;
int arr[] = new int[docs.length];
for (int i=0; i<docs.length; i++) {
int doc = docs[i];
if (other.exists(doc)) arr[icount++] = doc;
}
return new SortedIntDocSet(arr,icount);
}
int[] otherDocs = ((SortedIntDocSet)other).docs;
int maxsz = Math.min(docs.length, otherDocs.length);
int[] arr = new int[maxsz];
int sz = intersection(docs, docs.length, otherDocs, otherDocs.length, arr);
return new SortedIntDocSet(arr,sz);
}
protected static int andNotBinarySearch(int a[], int lena, int b[], int lenb, int[] target) {
int step = (lenb/lena)+1;
step = step + step;
int count = 0;
int low = 0;
int max = lenb-1;
outer:
for (int i=0; i<lena; i++) {
int doca = a[i];
int high = max;
int probe = low + step; // 40% improvement!
// short linear probe to see if we can drop the high pointer in one big jump.
if (probe<high) {
if (b[probe]>=doca) {
// success! we cut down the upper bound by a lot in one step!
high=probe;
} else {
// relative failure... we get to move the low pointer, but not by much
low=probe+1;
// reprobe worth it? it appears so!
probe = low + step;
if (probe<high) {
if (b[probe]>=doca) {
high=probe;
} else {
low=probe+1;
}
}
}
}
// binary search
while (low <= high) {
int mid = (low+high) >>> 1;
int docb = b[mid];
if (docb < doca) {
low = mid+1;
}
else if (docb > doca) {
high = mid-1;
}
else {
low = mid+1; // found it, so start at next element
continue outer;
}
}
// Didn't find it... low is now positioned on the insertion point,
// which is higher than what we were looking for, so continue using
// the same low point.
target[count++] = doca;
}
return count;
}
/** puts the intersection of a and not b into the target array and returns the size */
public static int andNot(int a[], int lena, int b[], int lenb, int[] target) {
if (lena==0) return 0;
if (lenb==0) {
System.arraycopy(a,0,target,0,lena);
return lena;
}
// if b is 8 times bigger than a, use the modified binary search.
if ((lenb>>3) >= lena) {
return andNotBinarySearch(a, lena, b, lenb, target);
}
int count=0;
int i=0,j=0;
int doca=a[i],docb=b[j];
for(;;) {
if (doca > docb) {
if (++j >= lenb) break;
docb=b[j];
} else if (doca < docb) {
target[count++] = doca;
if (++i >= lena) break;
doca=a[i];
} else {
if (++i >= lena) break;
doca=a[i];
if (++j >= lenb) break;
docb=b[j];
}
}
int leftover=lena - i;
if (leftover > 0) {
System.arraycopy(a,i,target,count,leftover);
count += leftover;
}
return count;
}
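// Worked example (hypothetical ids): a = {1, 5, 9}, b = {5}. b is not 8x
// larger than a, so the linear merge runs: target becomes {1, 9} and 2 is
// returned (the trailing 9 arrives via the leftover arraycopy after b is
// exhausted).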
@Override
public DocSet andNot(DocSet other) {
if (other.size()==0) return this;
if (!(other instanceof SortedIntDocSet)) {
int count = 0;
int arr[] = new int[docs.length];
for (int i=0; i<docs.length; i++) {
int doc = docs[i];
if (!other.exists(doc)) arr[count++] = doc;
}
return new SortedIntDocSet(arr,count);
}
int[] otherDocs = ((SortedIntDocSet)other).docs;
int[] arr = new int[docs.length];
int sz = andNot(docs, docs.length, otherDocs, otherDocs.length, arr);
return new SortedIntDocSet(arr,sz);
}
public boolean exists(int doc) {
// this could be faster by estimating where in the list the doc is likely to appear,
// but we should get away from using exists() anyway.
int low = 0;
int high = docs.length-1;
// binary search
while (low <= high) {
int mid = (low+high) >>> 1;
int docb = docs[mid];
if (docb < doc) {
low = mid+1;
}
else if (docb > doc) {
high = mid-1;
}
else {
return true;
}
}
return false;
}
public DocIterator iterator() {
return new DocIterator() {
int pos=0;
public boolean hasNext() {
return pos < docs.length;
}
public Integer next() {
return nextDoc();
}
/**
* The remove operation is not supported by this Iterator.
*/
public void remove() {
throw new UnsupportedOperationException("The remove operation is not supported by this Iterator.");
}
public int nextDoc() {
return docs[pos++];
}
public float score() {
return 0.0f;
}
};
}
@Override
public OpenBitSet getBits() {
int maxDoc = size() > 0 ? docs[size()-1] : 0;
OpenBitSet bs = new OpenBitSet(maxDoc+1);
for (int doc : docs) {
bs.fastSet(doc);
}
return bs;
}
}
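A minimal usage sketch of the static intersection helper above (hypothetical doc ids, chosen so that b is at least 8x the size of a and the probe-plus-binary-search path is taken; not part of the commit itself):

int[] a = {3, 40, 900};                     // smaller sorted set
int[] b = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // 24 sorted ids: 8x the size of a,
           20, 30, 40, 50, 60, 70, 80, 90, // so intersectionBinarySearch is
           100, 200, 300, 900, 910, 920};  // chosen over the linear merge
int[] target = new int[Math.min(a.length, b.length)];
int sz = SortedIntDocSet.intersection(a, a.length, b, b.length, target);
// sz == 3; target now holds {3, 40, 900}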

View File

@ -20,6 +20,7 @@ package org.apache.solr.search;
import junit.framework.TestCase;
import java.util.Random;
import java.util.Arrays;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetIterator;
@ -49,41 +50,75 @@ public class TestDocSet extends TestCase {
return new HashDocSet(docs,0,docs.length);
}
public DocSet getIntDocSet(OpenBitSet bs) {
int[] docs = new int[(int)bs.cardinality()];
OpenBitSetIterator iter = new OpenBitSetIterator(bs);
for (int i=0; i<docs.length; i++) {
docs[i] = iter.nextDoc();
}
return new SortedIntDocSet(docs);
}
public DocSet getBitDocSet(OpenBitSet bs) {
return new BitDocSet(bs);
}
public DocSet getDocSet(OpenBitSet bs) {
return rand.nextInt(2)==0 ? getHashDocSet(bs) : getBitDocSet(bs);
switch(rand.nextInt(3)) {
case 0: return getIntDocSet(bs);
case 1: return getHashDocSet(bs);
case 2: return getBitDocSet(bs);
}
return null;
}
public void checkEqual(OpenBitSet bs, DocSet set) {
for (int i=0; i<bs.capacity(); i++) {
assertEquals(bs.get(i), set.exists(i));
}
assertEquals(bs.cardinality(), set.size());
}
public void iter(DocSet d1, DocSet d2) {
// HashDocSet doesn't iterate in order.
if (d1 instanceof HashDocSet || d2 instanceof HashDocSet) return;
DocIterator i1 = d1.iterator();
DocIterator i2 = d2.iterator();
assert(i1.hasNext() == i2.hasNext());
for(;;) {
boolean b1 = i1.hasNext();
boolean b2 = i2.hasNext();
assertEquals(b1,b2);
if (!b1) break;
assertEquals(i1.nextDoc(), i2.nextDoc());
}
}
protected void doSingle(int maxSize) {
int sz = rand.nextInt(maxSize+1);
int sz2 = rand.nextInt(maxSize);
OpenBitSet a1 = getRandomSet(sz, rand.nextInt(sz+1));
OpenBitSet a2 = getRandomSet(sz, rand.nextInt(sz2+1));
OpenBitSet bs1 = getRandomSet(sz, rand.nextInt(sz+1));
OpenBitSet bs2 = getRandomSet(sz, rand.nextInt(sz2+1));
DocSet b1 = getDocSet(a1);
DocSet b2 = getDocSet(a2);
DocSet a1 = new BitDocSet(bs1);
DocSet a2 = new BitDocSet(bs2);
DocSet b1 = getDocSet(bs1);
DocSet b2 = getDocSet(bs2);
// System.out.println("b1="+b1+", b2="+b2);
checkEqual(bs1,b1);
checkEqual(bs2,b2);
assertEquals((int)a1.cardinality(), b1.size());
assertEquals((int)a2.cardinality(), b2.size());
iter(a1,b1);
iter(a2,b2);
checkEqual(a1,b1);
checkEqual(a2,b2);
OpenBitSet a_and = (OpenBitSet)a1.clone(); a_and.and(a2);
OpenBitSet a_or = (OpenBitSet)a1.clone(); a_or.or(a2);
// OpenBitSet a_xor = (OpenBitSet)a1.clone(); a_xor.xor(a2);
OpenBitSet a_andn = (OpenBitSet)a1.clone(); a_andn.andNot(a2);
OpenBitSet a_and = (OpenBitSet) bs1.clone(); a_and.and(bs2);
OpenBitSet a_or = (OpenBitSet) bs1.clone(); a_or.or(bs2);
// OpenBitSet a_xor = (OpenBitSet)bs1.clone(); a_xor.xor(bs2);
OpenBitSet a_andn = (OpenBitSet) bs1.clone(); a_andn.andNot(bs2);
checkEqual(a_and, b1.intersection(b2));
checkEqual(a_or, b1.union(b2));
@ -102,12 +137,15 @@ public class TestDocSet extends TestCase {
}
public void testRandomDocSets() {
doMany(300, 5000);
// Make the sizes big enough to go over certain limits (such as one set
// being 8 times the size of another in the int set, or going over 2 times
// 64 bits for the bit doc set). Smaller sets can hit more boundary conditions, though.
doMany(130, 10000);
//doMany(130, 1000000);
}
public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
int n = rand.nextInt(maxSetSize);
public DocSet getRandomDocSet(int n, int maxDoc) {
OpenBitSet obs = new OpenBitSet(maxDoc);
int[] a = new int[n];
for (int i=0; i<n; i++) {
@ -118,14 +156,29 @@ public class TestDocSet extends TestCase {
break;
}
}
return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
if (n <= smallSetCuttoff) {
if (smallSetType == 0) {
Arrays.sort(a);
return new SortedIntDocSet(a);
} else if (smallSetType == 1) {
Arrays.sort(a);
return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
}
}
return new BitDocSet(obs, n);
}
public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
public DocSet[] getRandomSets(int nSets, int minSetSize, int maxSetSize, int maxDoc) {
DocSet[] sets = new DocSet[nSets];
for (int i=0; i<nSets; i++) {
sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
int sz;
sz = rand.nextInt(maxSetSize-minSetSize+1)+minSetSize;
// different distribution
// sz = (maxSetSize+1)/(rand.nextInt(maxSetSize)+1) + minSetSize;
sets[i] = getRandomDocSet(sz,maxDoc);
}
return sets;
@ -160,30 +213,43 @@ public class TestDocSet extends TestCase {
}
***/
public static int smallSetType = 0; // 0==sortedint, 1==hash, 2==openbitset
public static int smallSetCuttoff=3000;
/***
public void testIntersectionSizePerformance() {
loadfactor=.75f;
rand=new Random(12345); // make deterministic
int maxSetsize=4000;
int nSets=128;
int iter=10;
loadfactor=.75f; // for HashDocSet
rand=new Random(1); // make deterministic
int minBigSetSize=1,maxBigSetSize=30000;
int minSmallSetSize=1,maxSmallSetSize=30000;
int nSets=1024;
int iter=1;
int maxDoc=1000000;
DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
smallSetCuttoff = maxDoc>>6; // break even for SortedIntSet is /32... but /64 is better for performance
// smallSetCuttoff = maxDoc;
DocSet[] bigsets = getRandomSets(nSets, minBigSetSize, maxBigSetSize, maxDoc);
DocSet[] smallsets = getRandomSets(nSets, minSmallSetSize, maxSmallSetSize, maxDoc);
int ret=0;
long start=System.currentTimeMillis();
for (int i=0; i<iter; i++) {
for (DocSet s1 : sets) {
for (DocSet s2 : sets) {
for (DocSet s1 : bigsets) {
for (DocSet s2 : smallsets) {
ret += s1.intersectionSize(s2);
}
}
}
long end=System.currentTimeMillis();
System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
if (ret==-1)System.out.println("wow!");
System.out.println("intersectionSizePerformance="+(end-start)+" ms");
System.out.println("ret="+ret);
}
***/
/****
public void testExistsPerformance() {
loadfactor=.75f;
rand=new Random(12345); // make deterministic