mirror of https://github.com/apache/lucene.git

commit 06f159e3fb (parent d161e54136)

SOLR-1169: SortedIntDocSet

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@776750 13f79535-47bb-0310-9956-ffa450edef68
@@ -254,6 +254,11 @@ Optimizations

10. SOLR-1166: Speed up docset/filter generation by avoiding top-level
    score() call and iterating over leaf readers with TermDocs. (yonik)

11. SOLR-1169: SortedIntDocSet - a new small set implementation
    that saves memory over HashDocSet, is faster to construct,
    is ordered for easier implementation of skipTo, and is faster
    in the general case. (yonik)

Bug Fixes
----------------------

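As a rough illustration of the changelog entry above, the sketch below builds two of the new sorted-int sets and intersects them. It is not part of the commit; it assumes the org.apache.solr.search classes added and changed here are on the classpath, and the doc ids are made-up values.

import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SortedIntDocSet;

public class SortedIntDocSetExample {
  public static void main(String[] args) {
    // ids must already be sorted; the collectors changed in this commit deliver them in order.
    SortedIntDocSet s1 = new SortedIntDocSet(new int[] {1, 5, 9, 42, 100});
    SortedIntDocSet s2 = new SortedIntDocSet(new int[] {5, 9, 77, 100, 101, 200});

    System.out.println("memSize=" + s1.memSize());            // roughly 4 bytes per id plus a small constant
    System.out.println("common=" + s1.intersectionSize(s2));  // 3 -> {5, 9, 100}

    DocSet both = s1.intersection(s2);                         // also a SortedIntDocSet
    DocIterator it = both.iterator();
    while (it.hasNext()) {
      System.out.println("doc=" + it.nextDoc());               // iterates in increasing doc id order
    }
  }
}
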
@@ -306,12 +306,6 @@

         queryResultCache. -->
    <queryResultMaxDocsCached>200</queryResultMaxDocsCached>

    <!-- This entry enables an int hash representation for filters (DocSets)
         when the number of items in the set is less than maxSize.  For smaller
         sets, this representation is more memory efficient, more efficient to
         iterate over, and faster to take intersections. -->
    <HashDocSet maxSize="3000" loadFactor="0.75"/>

    <!-- a newSearcher event is fired whenever a new searcher is being prepared
         and there is a current searcher handling requests (aka registered). -->
    <!-- QuerySenderListener takes an array of NamedList and executes a

@@ -189,10 +189,9 @@ abstract class DocSetBase implements DocSet {
    };

  public DocSet intersection(DocSet other) {
    // intersection is overloaded in HashDocSet to be more
    // efficient, so if "other" is a HashDocSet, dispatch off
    // of it instead.
    if (other instanceof HashDocSet) {
    // intersection is overloaded in the smaller DocSets to be more
    // efficient, so dispatch off of it instead.
    if (!(other instanceof BitDocSet)) {
      return other.intersection(this);
    }

@@ -209,10 +208,9 @@ abstract class DocSetBase implements DocSet {
  }

  public int intersectionSize(DocSet other) {
    // intersectionSize is overloaded in HashDocSet to be more
    // efficient, so if "other" is a HashDocSet, dispatch off
    // of it instead.
    if (other instanceof HashDocSet) {
    // intersection is overloaded in the smaller DocSets to be more
    // efficient, so dispatch off of it instead.
    if (!(other instanceof BitDocSet)) {
      return other.intersectionSize(this);
    }
    // less efficient way: do the intersection then get its size

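The two hunks above change the generic DocSetBase fallback so that any operand other than a BitDocSet gets to run the operation; combined with the SortedIntDocSet added later in this commit, the small sorted set drives the intersection even when the other side is a bit-vector set. A minimal sketch of the effect, assuming BitDocSet still inherits this base implementation (the ids are arbitrary example values):

import org.apache.lucene.util.OpenBitSet;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SortedIntDocSet;

public class DispatchExample {
  public static void main(String[] args) {
    OpenBitSet bits = new OpenBitSet(1000);
    for (int i = 0; i < 1000; i += 3) bits.fastSet(i);        // a fairly dense set: 0, 3, 6, ...
    DocSet big = new BitDocSet(bits);
    DocSet small = new SortedIntDocSet(new int[] {3, 4, 9});  // a tiny sorted set

    // With the change above, the base-class intersection hands the work to the non-BitDocSet
    // operand, so the small set walks its three ids and probes the bit vector instead of the
    // dense set scanning every bit.
    System.out.println(big.intersection(small).size());       // 2 -> {3, 9}
  }
}
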
@@ -30,31 +30,26 @@ import java.io.IOException;
 */

final class DocSetHitCollector extends HitCollector {

  final float HASHSET_INVERSE_LOAD_FACTOR;
  final int HASHDOCSET_MAXSIZE;

  int pos=0;
  OpenBitSet bits;
  final int maxDoc;
  final int smallSetSize;

  // in case there aren't that many hits, we may not want a very sparse
  // bit array.  Optimistically collect the first few docs in an array
  // in case there are only a few.
  final int[] scratch;

  // todo - could pass in bitset and an operation also...
  DocSetHitCollector(float inverseLoadFactor, int maxSize, int maxDoc) {
  DocSetHitCollector(int smallSetSize, int maxDoc) {
    this.smallSetSize = smallSetSize;
    this.maxDoc = maxDoc;
    HASHSET_INVERSE_LOAD_FACTOR = inverseLoadFactor;
    HASHDOCSET_MAXSIZE = maxSize;
    scratch = new int[HASHDOCSET_MAXSIZE];
    this.scratch = new int[smallSetSize];
  }

  public void collect(int doc, float score) {
    // optimistically collect the first docs in an array
    // in case the total number will be small enough to represent
    // as a HashDocSet() instead...
    // as a small set like SortedIntDocSet instead...
    // Storing in this array will be quicker to convert
    // than scanning through a potentially huge bit vector.
    // FUTURE: when search methods all start returning docs in order, maybe

@@ -73,7 +68,8 @@ final class DocSetHitCollector extends HitCollector {

  public DocSet getDocSet() {
    if (pos<=scratch.length) {
      return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
      // assumes docs were collected in sorted order!
      return new SortedIntDocSet(scratch, pos);
    } else {
      // set the bits for ids that were collected in the array
      for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);

@@ -84,33 +80,27 @@ final class DocSetHitCollector extends HitCollector {


class DocSetCollector extends Collector {

  final float HASHSET_INVERSE_LOAD_FACTOR;
  final int HASHDOCSET_MAXSIZE;

  int pos=0;
  OpenBitSet bits;
  final int maxDoc;
  int base=0;
  final int smallSetSize;
  int base;

  // in case there aren't that many hits, we may not want a very sparse
  // bit array.  Optimistically collect the first few docs in an array
  // in case there are only a few.
  final int[] scratch;

  // todo - could pass in bitset and an operation also...
  DocSetCollector(float inverseLoadFactor, int maxSize, int maxDoc) {
  DocSetCollector(int smallSetSize, int maxDoc) {
    this.smallSetSize = smallSetSize;
    this.maxDoc = maxDoc;
    HASHSET_INVERSE_LOAD_FACTOR = inverseLoadFactor;
    HASHDOCSET_MAXSIZE = maxSize;
    scratch = new int[HASHDOCSET_MAXSIZE];
    this.scratch = new int[smallSetSize];
  }

  public void collect(int doc) {
    doc += base;
    // optimistically collect the first docs in an array
    // in case the total number will be small enough to represent
    // as a HashDocSet() instead...
    // as a small set like SortedIntDocSet instead...
    // Storing in this array will be quicker to convert
    // than scanning through a potentially huge bit vector.
    // FUTURE: when search methods all start returning docs in order, maybe

@@ -129,7 +119,8 @@ class DocSetCollector extends Collector {

  public DocSet getDocSet() {
    if (pos<=scratch.length) {
      return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
      // assumes docs were collected in sorted order!
      return new SortedIntDocSet(scratch, pos);
    } else {
      // set the bits for ids that were collected in the array
      for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);

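Both getDocSet() hunks above implement the same strategy as the collect() methods earlier in this file: gather the first smallSetSize doc ids in a plain int[] and only fall back to a bit vector once that overflows. A self-contained sketch of the idea, independent of the Lucene/Solr classes (names and sizes here are illustrative only):

import java.util.Arrays;

public class SmallThenBitsCollector {
  private final int[] scratch;   // optimistic small-set buffer
  private long[] bits;           // lazily allocated bit vector
  private final int maxDoc;
  private int pos;

  SmallThenBitsCollector(int smallSetSize, int maxDoc) {
    this.scratch = new int[smallSetSize];
    this.maxDoc = maxDoc;
  }

  void collect(int doc) {        // docs are assumed to arrive in increasing order, as in the Solr collectors
    if (pos < scratch.length) {
      scratch[pos++] = doc;
      return;
    }
    if (bits == null) {          // first overflow: allocate the bit vector and replay the buffer
      bits = new long[(maxDoc >> 6) + 1];
      for (int i = 0; i < scratch.length; i++) setBit(scratch[i]);
    }
    setBit(doc);
    pos++;
  }

  private void setBit(int doc) { bits[doc >> 6] |= 1L << (doc & 63); }

  /** Either the small sorted array, or null meaning "use the bit vector". */
  int[] smallSetOrNull() {
    return pos <= scratch.length ? Arrays.copyOf(scratch, pos) : null;
  }
}
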
@@ -82,9 +82,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {

  private final LuceneQueryOptimizer optimizer;

  private final float HASHSET_INVERSE_LOAD_FACTOR;
  private final int HASHDOCSET_MAXSIZE;

  // map of generic caches - not synchronized since it's read-only after the constructor.
  private final HashMap<String, SolrCache> cacheMap;
  private static final HashMap<String, SolrCache> noGenericCaches=new HashMap<String,SolrCache>(0);

@@ -186,10 +183,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
    }
    optimizer = solrConfig.filtOptEnabled ? new LuceneQueryOptimizer(solrConfig.filtOptCacheSize,solrConfig.filtOptThreshold) : null;

    // for DocSets
    HASHSET_INVERSE_LOAD_FACTOR = solrConfig.hashSetInverseLoadFactor;
    HASHDOCSET_MAXSIZE = solrConfig.hashDocSetMaxSize;

    fieldNames = r.getFieldNames(IndexReader.FieldOption.ALL);
  }

@@ -628,7 +621,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
  // query must be positive
  protected DocSet getDocSetNC(Query query, DocSet filter) throws IOException {
    if (filter==null) {
      DocSetCollector hc = new DocSetCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
      DocSetCollector hc = new DocSetCollector(maxDoc()>>6, maxDoc());
      if (query instanceof TermQuery) {
        Term t = ((TermQuery)query).getTerm();
        SolrIndexReader[] readers = reader.getLeafReaders();

@@ -656,7 +649,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {

    } else {
      // FUTURE: if the filter is sorted by docid, could use skipTo (SkipQueryFilter)
      final DocSetCollector hc = new DocSetCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
      final DocSetCollector hc = new DocSetCollector(maxDoc()>>6, maxDoc());
      final DocSet filt = filter;
      super.search(query, null, new Collector() {
        int base = 0;

@@ -1131,7 +1124,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
    float maxScore;
    int[] ids;
    float[] scores;
    final DocSetHitCollector setHC = new DocSetHitCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
    final DocSetHitCollector setHC = new DocSetHitCollector(maxDoc()>>6, maxDoc());
    final HitCollector collector = ( cmd.getTimeAllowed() > 0 ) ? new TimeLimitedCollector( setHC, cmd.getTimeAllowed() ) : setHC;

    Query query = QueryUtils.makeQueryable(cmd.getQuery());

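The three call-site changes above replace the configurable HashDocSet parameters with a hard-coded small-set size of maxDoc()>>6. The arithmetic behind that constant (also noted in the test changes below: break-even is around maxDoc/32, but maxDoc/64 performs better) follows from memSize() in the new class: a SortedIntDocSet costs about four bytes per collected id, while a bit vector over the whole index costs about maxDoc/8 bytes no matter how few docs match. A tiny sanity check, with an assumed index size:

public class SmallSetCutoff {
  public static void main(String[] args) {
    int maxDoc = 1000000;                            // assumed index size, for illustration only
    int cutoff = maxDoc >> 6;                        // 15625 docs, the smallSetSize passed above
    long sortedIntBytes = ((long) cutoff << 2) + 8;  // ~61 KB at the cutoff (the memSize() formula)
    long bitVectorBytes = maxDoc / 8;                // ~122 KB regardless of how many docs match
    System.out.println(cutoff + " " + sortedIntBytes + " " + bitVectorBytes);
  }
}
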
@@ -0,0 +1,498 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import org.apache.lucene.util.OpenBitSet;

/**
 * <code>SortedIntDocSet</code> represents a sorted set of Lucene Document Ids.
 */
public class SortedIntDocSet extends DocSetBase {
  protected final int[] docs;

  public SortedIntDocSet(int[] docs) {
    this.docs = docs;
  }

  public SortedIntDocSet(int[] docs, int len) {
    this(shrink(docs,len));
  }

  public int[] getDocs() { return docs; }

  public int size() { return docs.length; }

  public long memSize() {
    return (docs.length<<2)+8;
  }

  public static int[] zeroInts = new int[0];
  public static SortedIntDocSet zero = new SortedIntDocSet(zeroInts);

  public static int[] shrink(int[] arr, int newSize) {
    if (arr.length == newSize) return arr;
    int[] newArr = new int[newSize];
    System.arraycopy(arr, 0, newArr, 0, newSize);
    return newArr;
  }

  public static int intersectionSize(int[] smallerSortedList, int[] biggerSortedList) {
    final int a[] = smallerSortedList;
    final int b[] = biggerSortedList;

    // The next doc we are looking for will be much closer to the last position we tried
    // than it will be to the midpoint between last and high... so probe ahead using
    // a function of the ratio of the sizes of the sets.
    int step = (b.length/a.length)+1;

    // Since the majority of probes should be misses, we'll already be above the last probe
    // and shouldn't need to move larger than the step size on average to step over our target (and thus lower
    // the high upper bound a lot.)... but if we don't go over our target, it's a big miss... so double it.
    step = step + step;

    // FUTURE: come up with a density such that target * density == likely position?
    // then check step on one side or the other?
    // (density could be cached in the DocSet)... length/maxDoc

    // FUTURE: try partitioning like a sort algorithm.  Pick the midpoint of the big
    // array, find where that should be in the small array, and then recurse with
    // the top and bottom half of both arrays until they are small enough to use
    // a fallback intersection method.
    // NOTE: I tried this and it worked, but it was actually slower than this current
    // highly optimized approach.

    int icount = 0;
    int low = 0;
    int max = b.length-1;

    for (int i=0; i<a.length; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success!  we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }

      // binary search the rest of the way
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          icount++;
          low = mid+1;  // found it, so start at next element
          break;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
    }

    return icount;
  }

  public int intersectionSize(DocSet other) {
    if (!(other instanceof SortedIntDocSet)) {
      // assume other implementations are better at random access than we are,
      // true of BitDocSet and HashDocSet.
      int icount = 0;
      for (int i=0; i<docs.length; i++) {
        if (other.exists(docs[i])) icount++;
      }
      return icount;
    }

    // make "a" the smaller set.
    int[] otherDocs = ((SortedIntDocSet)other).docs;
    final int[] a = docs.length < otherDocs.length ? docs : otherDocs;
    final int[] b = docs.length < otherDocs.length ? otherDocs : docs;

    if (a.length==0) return 0;

    // if b is 8 times bigger than a, use the modified binary search.
    if ((b.length>>3) >= a.length) {
      return intersectionSize(a,b);
    }

    // if they are close in size, just do a linear walk of both.
    int icount=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      // switch on the sign bit somehow?  Hopefully the JVM is smart enough to just test once.

      // Since set a is less dense than set b, doca is likely to be greater than docb so
      // check that case first.  This resulted in a 13% speedup.
      if (doca > docb) {
        if (++j >= b.length) break;
        docb=b[j];
      } else if (doca < docb) {
        if (++i >= a.length) break;
        doca=a[i];
      } else {
        icount++;
        if (++i >= a.length) break;
        doca=a[i];
        if (++j >= b.length) break;
        docb=b[j];
      }
    }
    return icount;
  }


  /** puts the intersection of a and b into the target array and returns the size */
  public static int intersection(int a[], int lena, int b[], int lenb, int[] target) {
    if (lena > lenb) {
      int ti=lena; lena=lenb; lenb=ti;
      int[] ta=a; a=b; b=ta;
    }

    if (lena==0) return 0;


    // if b is 8 times bigger than a, use the modified binary search.
    if ((lenb>>3) >= lena) {
      return intersectionBinarySearch(a, lena, b, lenb, target);
    }

    int icount=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      if (doca > docb) {
        if (++j >= lenb) break;
        docb=b[j];
      } else if (doca < docb) {
        if (++i >= lena) break;
        doca=a[i];
      } else {
        target[icount++] = doca;
        if (++i >= lena) break;
        doca=a[i];
        if (++j >= lenb) break;
        docb=b[j];
      }
    }
    return icount;
  }

  /** Puts the intersection of a and b into the target array and returns the size.
   * lena should be smaller than lenb */
  protected static int intersectionBinarySearch(int[] a, int lena, int[] b, int lenb, int[] target) {
    int step = (lenb/lena)+1;
    step = step + step;


    int icount = 0;
    int low = 0;
    int max = lenb-1;

    for (int i=0; i<lena; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success!  we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }


      // binary search
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          target[icount++] = doca;
          low = mid+1;  // found it, so start at next element
          break;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
    }

    return icount;
  }

  @Override
  public DocSet intersection(DocSet other) {
    if (!(other instanceof SortedIntDocSet)) {
      int icount = 0;
      int arr[] = new int[docs.length];
      for (int i=0; i<docs.length; i++) {
        int doc = docs[i];
        if (other.exists(doc)) arr[icount++] = doc;
      }
      return new SortedIntDocSet(arr,icount);
    }

    int[] otherDocs = ((SortedIntDocSet)other).docs;
    int maxsz = Math.min(docs.length, otherDocs.length);
    int[] arr = new int[maxsz];
    int sz = intersection(docs, docs.length, otherDocs, otherDocs.length, arr);
    return new SortedIntDocSet(arr,sz);
  }


  protected static int andNotBinarySearch(int a[], int lena, int b[], int lenb, int[] target) {
    int step = (lenb/lena)+1;
    step = step + step;


    int count = 0;
    int low = 0;
    int max = lenb-1;

    outer:
    for (int i=0; i<lena; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success!  we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }


      // binary search
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          low = mid+1;  // found it, so start at next element
          continue outer;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
      target[count++] = doca;
    }

    return count;
  }

  /** puts the intersection of a and not b into the target array and returns the size */
  public static int andNot(int a[], int lena, int b[], int lenb, int[] target) {
    if (lena==0) return 0;
    if (lenb==0) {
      System.arraycopy(a,0,target,0,lena);
      return lena;
    }

    // if b is 8 times bigger than a, use the modified binary search.
    if ((lenb>>3) >= lena) {
      return andNotBinarySearch(a, lena, b, lenb, target);
    }

    int count=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      if (doca > docb) {
        if (++j >= lenb) break;
        docb=b[j];
      } else if (doca < docb) {
        target[count++] = doca;
        if (++i >= lena) break;
        doca=a[i];
      } else {
        if (++i >= lena) break;
        doca=a[i];
        if (++j >= lenb) break;
        docb=b[j];
      }
    }

    int leftover=lena - i;

    if (leftover > 0) {
      System.arraycopy(a,i,target,count,leftover);
      count += leftover;
    }

    return count;
  }

  @Override
  public DocSet andNot(DocSet other) {
    if (other.size()==0) return this;

    if (!(other instanceof SortedIntDocSet)) {
      int count = 0;
      int arr[] = new int[docs.length];
      for (int i=0; i<docs.length; i++) {
        int doc = docs[i];
        if (!other.exists(doc)) arr[count++] = doc;
      }
      return new SortedIntDocSet(arr,count);
    }

    int[] otherDocs = ((SortedIntDocSet)other).docs;
    int[] arr = new int[docs.length];
    int sz = andNot(docs, docs.length, otherDocs, otherDocs.length, arr);
    return new SortedIntDocSet(arr,sz);
  }


  public boolean exists(int doc) {
    // this could be faster by estimating where in the list the doc is likely to appear,
    // but we should get away from using exists() anyway.
    int low = 0;
    int high = docs.length-1;
    // binary search
    while (low <= high) {
      int mid = (low+high) >>> 1;
      int docb = docs[mid];

      if (docb < doc) {
        low = mid+1;
      }
      else if (docb > doc) {
        high = mid-1;
      }
      else {
        return true;
      }
    }
    return false;
  }


  public DocIterator iterator() {
    return new DocIterator() {
      int pos=0;
      public boolean hasNext() {
        return pos < docs.length;
      }

      public Integer next() {
        return nextDoc();
      }

      /**
       * The remove operation is not supported by this Iterator.
       */
      public void remove() {
        throw new UnsupportedOperationException("The remove operation is not supported by this Iterator.");
      }

      public int nextDoc() {
        return docs[pos++];
      }

      public float score() {
        return 0.0f;
      }
    };
  }

  @Override
  public OpenBitSet getBits() {
    int maxDoc = size() > 0 ? docs[size()-1] : 0;
    OpenBitSet bs = new OpenBitSet(maxDoc+1);
    for (int doc : docs) {
      bs.fastSet(doc);
    }
    return bs;
  }

}

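A quick check of the static helpers defined above, used outside of any DocSet; this is a sketch rather than part of the commit, and the arrays are arbitrary sorted example values:

import java.util.Arrays;
import org.apache.solr.search.SortedIntDocSet;

public class SortedIntOpsExample {
  public static void main(String[] args) {
    int[] a = {2, 4, 8, 16, 32};
    int[] b = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40};

    int[] inter = new int[Math.min(a.length, b.length)];
    int n = SortedIntDocSet.intersection(a, a.length, b, b.length, inter);
    System.out.println(Arrays.toString(Arrays.copyOf(inter, n)));  // [2, 4, 8]

    int[] diff = new int[a.length];
    int m = SortedIntDocSet.andNot(a, a.length, b, b.length, diff);
    System.out.println(Arrays.toString(Arrays.copyOf(diff, m)));   // [16, 32]
  }
}
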
@@ -20,6 +20,7 @@ package org.apache.solr.search;
import junit.framework.TestCase;

import java.util.Random;
import java.util.Arrays;

import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetIterator;

@@ -49,41 +50,75 @@ public class TestDocSet extends TestCase {
    return new HashDocSet(docs,0,docs.length);
  }

  public DocSet getIntDocSet(OpenBitSet bs) {
    int[] docs = new int[(int)bs.cardinality()];
    OpenBitSetIterator iter = new OpenBitSetIterator(bs);
    for (int i=0; i<docs.length; i++) {
      docs[i] = iter.nextDoc();
    }
    return new SortedIntDocSet(docs);
  }


  public DocSet getBitDocSet(OpenBitSet bs) {
    return new BitDocSet(bs);
  }

  public DocSet getDocSet(OpenBitSet bs) {
    return rand.nextInt(2)==0 ? getHashDocSet(bs) : getBitDocSet(bs);
    switch(rand.nextInt(3)) {
      case 0: return getIntDocSet(bs);
      case 1: return getHashDocSet(bs);
      case 2: return getBitDocSet(bs);
    }
    return null;
  }

  public void checkEqual(OpenBitSet bs, DocSet set) {
    for (int i=0; i<bs.capacity(); i++) {
      assertEquals(bs.get(i), set.exists(i));
    }
    assertEquals(bs.cardinality(), set.size());
  }

  public void iter(DocSet d1, DocSet d2) {
    // HashDocSet doesn't iterate in order.
    if (d1 instanceof HashDocSet || d2 instanceof HashDocSet) return;

    DocIterator i1 = d1.iterator();
    DocIterator i2 = d2.iterator();

    assert(i1.hasNext() == i2.hasNext());

    for(;;) {
      boolean b1 = i1.hasNext();
      boolean b2 = i2.hasNext();
      assertEquals(b1,b2);
      if (!b1) break;
      assertEquals(i1.nextDoc(), i2.nextDoc());
    }
  }

  protected void doSingle(int maxSize) {
    int sz = rand.nextInt(maxSize+1);
    int sz2 = rand.nextInt(maxSize);
    OpenBitSet a1 = getRandomSet(sz, rand.nextInt(sz+1));
    OpenBitSet a2 = getRandomSet(sz, rand.nextInt(sz2+1));
    OpenBitSet bs1 = getRandomSet(sz, rand.nextInt(sz+1));
    OpenBitSet bs2 = getRandomSet(sz, rand.nextInt(sz2+1));

    DocSet b1 = getDocSet(a1);
    DocSet b2 = getDocSet(a2);
    DocSet a1 = new BitDocSet(bs1);
    DocSet a2 = new BitDocSet(bs2);
    DocSet b1 = getDocSet(bs1);
    DocSet b2 = getDocSet(bs2);

    // System.out.println("b1="+b1+", b2="+b2);
    checkEqual(bs1,b1);
    checkEqual(bs2,b2);

    assertEquals((int)a1.cardinality(), b1.size());
    assertEquals((int)a2.cardinality(), b2.size());
    iter(a1,b1);
    iter(a2,b2);

    checkEqual(a1,b1);
    checkEqual(a2,b2);

    OpenBitSet a_and = (OpenBitSet)a1.clone(); a_and.and(a2);
    OpenBitSet a_or = (OpenBitSet)a1.clone(); a_or.or(a2);
    // OpenBitSet a_xor = (OpenBitSet)a1.clone(); a_xor.xor(a2);
    OpenBitSet a_andn = (OpenBitSet)a1.clone(); a_andn.andNot(a2);
    OpenBitSet a_and = (OpenBitSet) bs1.clone(); a_and.and(bs2);
    OpenBitSet a_or = (OpenBitSet) bs1.clone(); a_or.or(bs2);
    // OpenBitSet a_xor = (OpenBitSet)bs1.clone(); a_xor.xor(bs2);
    OpenBitSet a_andn = (OpenBitSet) bs1.clone(); a_andn.andNot(bs2);

    checkEqual(a_and, b1.intersection(b2));
    checkEqual(a_or, b1.union(b2));

@@ -102,12 +137,15 @@
  }

  public void testRandomDocSets() {
    doMany(300, 5000);
    // Make the size big enough to go over certain limits (such as one set
    // being 8 times the size of another in the int set, or going over 2 times
    // 64 bits for the bit doc set).  Smaller sets can hit more boundary conditions though.

    doMany(130, 10000);
    //doMany(130, 1000000);
  }


  public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
    int n = rand.nextInt(maxSetSize);
  public DocSet getRandomDocSet(int n, int maxDoc) {
    OpenBitSet obs = new OpenBitSet(maxDoc);
    int[] a = new int[n];
    for (int i=0; i<n; i++) {

@@ -118,14 +156,29 @@
        break;
      }
    }
    return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);

    if (n <= smallSetCuttoff) {
      if (smallSetType == 0) {
        Arrays.sort(a);
        return new SortedIntDocSet(a);
      } else if (smallSetType == 1) {
        Arrays.sort(a);
        return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
      }
    }

    return new BitDocSet(obs, n);
  }

  public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
  public DocSet[] getRandomSets(int nSets, int minSetSize, int maxSetSize, int maxDoc) {
    DocSet[] sets = new DocSet[nSets];

    for (int i=0; i<nSets; i++) {
      sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
      int sz;
      sz = rand.nextInt(maxSetSize-minSetSize+1)+minSetSize;
      // different distribution
      // sz = (maxSetSize+1)/(rand.nextInt(maxSetSize)+1) + minSetSize;
      sets[i] = getRandomDocSet(sz,maxDoc);
    }

    return sets;

@@ -160,30 +213,43 @@
  }
  ***/

  public static int smallSetType = 0;  // 0==sortedint, 1==hash, 2==openbitset
  public static int smallSetCuttoff=3000;

  /***
  public void testIntersectionSizePerformance() {
    loadfactor=.75f;
    rand=new Random(12345);  // make deterministic
    int maxSetsize=4000;
    int nSets=128;
    int iter=10;
    loadfactor=.75f;      // for HashDocSet
    rand=new Random(1);   // make deterministic

    int minBigSetSize=1,maxBigSetSize=30000;
    int minSmallSetSize=1,maxSmallSetSize=30000;
    int nSets=1024;
    int iter=1;
    int maxDoc=1000000;
    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);


    smallSetCuttoff = maxDoc>>6; // break even for SortedIntSet is /32... but /64 is better for performance
    // smallSetCuttoff = maxDoc;


    DocSet[] bigsets = getRandomSets(nSets, minBigSetSize, maxBigSetSize, maxDoc);
    DocSet[] smallsets = getRandomSets(nSets, minSmallSetSize, maxSmallSetSize, maxDoc);
    int ret=0;
    long start=System.currentTimeMillis();
    for (int i=0; i<iter; i++) {
      for (DocSet s1 : sets) {
        for (DocSet s2 : sets) {
      for (DocSet s1 : bigsets) {
        for (DocSet s2 : smallsets) {
          ret += s1.intersectionSize(s2);
        }
      }
    }
    long end=System.currentTimeMillis();
    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
    if (ret==-1)System.out.println("wow!");
    System.out.println("intersectionSizePerformance="+(end-start)+" ms");
    System.out.println("ret="+ret);
  }
  ***/


  /****
  public void testExistsPerformance() {
    loadfactor=.75f;
    rand=new Random(12345);  // make deterministic