mirror of https://github.com/apache/lucene.git

SOLR-1169: SortedIntDocSet

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@776750 13f79535-47bb-0310-9956-ffa450edef68

parent d161e54136
commit 06f159e3fb
@@ -254,6 +254,11 @@ Optimizations
 10. SOLR-1166: Speed up docset/filter generation by avoiding top-level
     score() call and iterating over leaf readers with TermDocs. (yonik)
 
+11. SOLR-1169: SortedIntDocSet - a new small set implementation
+    that saves memory over HashDocSet, is faster to construct,
+    is ordered for easier implementation of skipTo, and is faster
+    in the general case. (yonik)
+
 Bug Fixes
 ----------------------
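To make the new entry concrete, here is a brief illustrative sketch (the doc ids are invented, and it assumes the org.apache.solr.search classes touched by this commit are on the classpath) contrasting the two small-set representations:

    // Illustrative only. SortedIntDocSet stores the int[] of doc ids directly and
    // expects them in ascending order; HashDocSet hashes them and iterates unordered.
    int[] ids = {3, 17, 42, 99};                           // hypothetical matching doc ids, sorted
    DocSet small  = new SortedIntDocSet(ids);              // ~4 bytes per doc plus a small object header
    DocSet hashed = new HashDocSet(ids, 0, ids.length);    // the representation it replaces as the default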
@@ -306,12 +306,6 @@
          queryResultCache.  -->
     <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
 
-    <!-- This entry enables an int hash representation for filters (DocSets)
-         when the number of items in the set is less than maxSize.  For smaller
-         sets, this representation is more memory efficient, more efficient to
-         iterate over, and faster to take intersections.  -->
-    <HashDocSet maxSize="3000" loadFactor="0.75"/>
-
     <!-- a newSearcher event is fired whenever a new searcher is being prepared
          and there is a current searcher handling requests (aka registered). -->
     <!-- QuerySenderListener takes an array of NamedList and executes a
@@ -189,10 +189,9 @@ abstract class DocSetBase implements DocSet {
   };
 
   public DocSet intersection(DocSet other) {
-    // intersection is overloaded in HashDocSet to be more
-    // efficient, so if "other" is a HashDocSet, dispatch off
-    // of it instead.
-    if (other instanceof HashDocSet) {
+    // intersection is overloaded in the smaller DocSets to be more
+    // efficient, so dispatch off of it instead.
+    if (!(other instanceof BitDocSet)) {
       return other.intersection(this);
     }
 
@@ -209,10 +208,9 @@ abstract class DocSetBase implements DocSet {
   }
 
   public int intersectionSize(DocSet other) {
-    // intersectionSize is overloaded in HashDocSet to be more
-    // efficient, so if "other" is a HashDocSet, dispatch off
-    // of it instead.
-    if (other instanceof HashDocSet) {
+    // intersectionSize is overloaded in the smaller DocSets to be more
+    // efficient, so dispatch off of it instead.
+    if (!(other instanceof BitDocSet)) {
       return other.intersectionSize(this);
     }
     // less efficient way: do the intersection then get its size
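Both changes above implement the same double-dispatch idea: the generic base implementation hands the operation to the more specialized (smaller) set whenever the other operand is not a bit set, since the small set can walk its own ids cheaply and probe the other set. A minimal self-contained sketch of that pattern, with hypothetical class names rather than the real Solr ones:

    // Hedged sketch of the dispatch used in DocSetBase; IdSet/SmallSet/LargeSet are illustrative.
    interface IdSet {
      boolean exists(int doc);
      int intersectionSize(IdSet other);
    }

    final class SmallSet implements IdSet {
      private final int[] docs;                  // sorted doc ids
      SmallSet(int[] docs) { this.docs = docs; }
      public boolean exists(int doc) { return java.util.Arrays.binarySearch(docs, doc) >= 0; }
      public int intersectionSize(IdSet other) {
        int count = 0;
        for (int d : docs) if (other.exists(d)) count++;   // walk own ids, probe the other set
        return count;
      }
    }

    final class LargeSet implements IdSet {
      private final java.util.BitSet bits;
      LargeSet(java.util.BitSet bits) { this.bits = bits; }
      public boolean exists(int doc) { return bits.get(doc); }
      public int intersectionSize(IdSet other) {
        // Let the smaller, specialized set drive the loop, as DocSetBase does above.
        if (other instanceof SmallSet) return other.intersectionSize(this);
        int count = 0;
        for (int d = bits.nextSetBit(0); d >= 0; d = bits.nextSetBit(d + 1)) {
          if (other.exists(d)) count++;
        }
        return count;
      }
    }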
@@ -30,31 +30,26 @@ import java.io.IOException;
  */
 
 final class DocSetHitCollector extends HitCollector {
 
-  final float HASHSET_INVERSE_LOAD_FACTOR;
-  final int HASHDOCSET_MAXSIZE;
-
   int pos=0;
   OpenBitSet bits;
   final int maxDoc;
+  final int smallSetSize;
 
   // in case there aren't that many hits, we may not want a very sparse
   // bit array.  Optimistically collect the first few docs in an array
   // in case there are only a few.
   final int[] scratch;
 
-  // todo - could pass in bitset and an operation also...
-  DocSetHitCollector(float inverseLoadFactor, int maxSize, int maxDoc) {
+  DocSetHitCollector(int smallSetSize, int maxDoc) {
+    this.smallSetSize = smallSetSize;
     this.maxDoc = maxDoc;
-    HASHSET_INVERSE_LOAD_FACTOR = inverseLoadFactor;
-    HASHDOCSET_MAXSIZE = maxSize;
-    scratch = new int[HASHDOCSET_MAXSIZE];
+    this.scratch = new int[smallSetSize];
   }
 
   public void collect(int doc, float score) {
     // optimistically collect the first docs in an array
     // in case the total number will be small enough to represent
-    // as a HashDocSet() instead...
+    // as a small set like SortedIntDocSet instead...
     // Storing in this array will be quicker to convert
     // than scanning through a potentially huge bit vector.
     // FUTURE: when search methods all start returning docs in order, maybe
@@ -73,7 +68,8 @@ final class DocSetHitCollector extends HitCollector {
 
   public DocSet getDocSet() {
     if (pos<=scratch.length) {
-      return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
+      // assumes docs were collected in sorted order!
+      return new SortedIntDocSet(scratch, pos);
     } else {
       // set the bits for ids that were collected in the array
       for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);
@@ -84,33 +80,27 @@ final class DocSetHitCollector extends HitCollector {
 
 
 class DocSetCollector extends Collector {
 
-  final float HASHSET_INVERSE_LOAD_FACTOR;
-  final int HASHDOCSET_MAXSIZE;
-
   int pos=0;
   OpenBitSet bits;
   final int maxDoc;
-  int base=0;
+  final int smallSetSize;
+  int base;
 
   // in case there aren't that many hits, we may not want a very sparse
   // bit array.  Optimistically collect the first few docs in an array
   // in case there are only a few.
   final int[] scratch;
 
-  // todo - could pass in bitset and an operation also...
-  DocSetCollector(float inverseLoadFactor, int maxSize, int maxDoc) {
+  DocSetCollector(int smallSetSize, int maxDoc) {
+    this.smallSetSize = smallSetSize;
     this.maxDoc = maxDoc;
-    HASHSET_INVERSE_LOAD_FACTOR = inverseLoadFactor;
-    HASHDOCSET_MAXSIZE = maxSize;
-    scratch = new int[HASHDOCSET_MAXSIZE];
+    this.scratch = new int[smallSetSize];
   }
 
   public void collect(int doc) {
     doc += base;
     // optimistically collect the first docs in an array
     // in case the total number will be small enough to represent
-    // as a HashDocSet() instead...
+    // as a small set like SortedIntDocSet instead...
     // Storing in this array will be quicker to convert
     // than scanning through a potentially huge bit vector.
     // FUTURE: when search methods all start returning docs in order, maybe
@@ -129,7 +119,8 @@ class DocSetCollector extends Collector {
 
   public DocSet getDocSet() {
     if (pos<=scratch.length) {
-      return new HashDocSet(scratch,0,pos,HASHSET_INVERSE_LOAD_FACTOR);
+      // assumes docs were collected in sorted order!
+      return new SortedIntDocSet(scratch, pos);
     } else {
       // set the bits for ids that were collected in the array
       for (int i=0; i<scratch.length; i++) bits.fastSet(scratch[i]);
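Both collectors use the same strategy: gather hits into a small int[] first and spill into a bit set only if the result grows past smallSetSize. A standalone sketch of that idea (java.util.BitSet stands in for Lucene's OpenBitSet here, and the class and method names are illustrative, not the committed ones):

    import java.util.Arrays;
    import java.util.BitSet;

    // Collects doc ids into a scratch array; allocates a bit set only on overflow.
    class SmallThenBitsCollector {
      private final int[] scratch;
      private final int maxDoc;
      private BitSet bits;          // lazily allocated
      private int pos = 0;

      SmallThenBitsCollector(int smallSetSize, int maxDoc) {
        this.scratch = new int[smallSetSize];
        this.maxDoc = maxDoc;
      }

      void collect(int doc) {
        if (pos < scratch.length) {
          scratch[pos++] = doc;                 // cheap path while the result is small
        } else {
          if (bits == null) bits = new BitSet(maxDoc);
          bits.set(doc);
          pos++;
        }
      }

      int[] smallResult() {                     // valid only when pos <= scratch.length
        return Arrays.copyOf(scratch, pos);     // already in docid order
      }

      BitSet largeResult() {                    // valid only when the scratch array overflowed
        for (int d : scratch) bits.set(d);      // fold the early hits back into the bit set
        return bits;
      }
    }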
@@ -82,9 +82,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
 
   private final LuceneQueryOptimizer optimizer;
 
-  private final float HASHSET_INVERSE_LOAD_FACTOR;
-  private final int HASHDOCSET_MAXSIZE;
-
   // map of generic caches - not synchronized since it's read-only after the constructor.
   private final HashMap<String, SolrCache> cacheMap;
   private static final HashMap<String, SolrCache> noGenericCaches=new HashMap<String,SolrCache>(0);
@@ -186,10 +183,6 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
     }
     optimizer = solrConfig.filtOptEnabled ? new LuceneQueryOptimizer(solrConfig.filtOptCacheSize,solrConfig.filtOptThreshold) : null;
 
-    // for DocSets
-    HASHSET_INVERSE_LOAD_FACTOR = solrConfig.hashSetInverseLoadFactor;
-    HASHDOCSET_MAXSIZE = solrConfig.hashDocSetMaxSize;
-
     fieldNames = r.getFieldNames(IndexReader.FieldOption.ALL);
   }
 
@@ -628,7 +621,7 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean {
   // query must be positive
   protected DocSet getDocSetNC(Query query, DocSet filter) throws IOException {
     if (filter==null) {
-      DocSetCollector hc = new DocSetCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
+      DocSetCollector hc = new DocSetCollector(maxDoc()>>6, maxDoc());
       if (query instanceof TermQuery) {
         Term t = ((TermQuery)query).getTerm();
         SolrIndexReader[] readers = reader.getLeafReaders();
@@ -656,7 +649,7 @@
 
     } else {
       // FUTURE: if the filter is sorted by docid, could use skipTo (SkipQueryFilter)
-      final DocSetCollector hc = new DocSetCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
+      final DocSetCollector hc = new DocSetCollector(maxDoc()>>6, maxDoc());
       final DocSet filt = filter;
       super.search(query, null, new Collector() {
         int base = 0;
@@ -1131,7 +1124,7 @@
     float maxScore;
     int[] ids;
     float[] scores;
-    final DocSetHitCollector setHC = new DocSetHitCollector(HASHSET_INVERSE_LOAD_FACTOR, HASHDOCSET_MAXSIZE, maxDoc());
+    final DocSetHitCollector setHC = new DocSetHitCollector(maxDoc()>>6, maxDoc());
     final HitCollector collector = ( cmd.getTimeAllowed() > 0 ) ? new TimeLimitedCollector( setHC, cmd.getTimeAllowed() ) : setHC;
 
     Query query = QueryUtils.makeQueryable(cmd.getQuery());
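The collectors are now sized from the index rather than from the removed HashDocSet settings: maxDoc()>>6 is maxDoc/64. A sorted int set costs about four bytes per collected doc while a bit set costs maxDoc/8 bytes no matter how few docs match, so the memory break-even point is near maxDoc/32 docs; the test comment later in this commit notes that /64 works out better for speed. A tiny illustration of the arithmetic, using a hypothetical index size:

    // Hypothetical numbers; only the 4-bytes-per-int and 1-bit-per-doc costs are inherent.
    public class DocSetSizing {
      public static void main(String[] args) {
        int maxDoc = 1000000;                      // assumed index size
        int smallSetSize = maxDoc >> 6;            // 15625 docs kept in the scratch int[]
        long sortedIntBytes = 4L * smallSetSize;   // ~61 KB if the result fits the small set
        long bitSetBytes = maxDoc / 8;             // ~122 KB for a bit set, regardless of hit count
        System.out.println(smallSetSize + " docs, " + sortedIntBytes + " vs " + bitSetBytes + " bytes");
      }
    }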
@@ -0,0 +1,498 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import org.apache.lucene.util.OpenBitSet;

/**
 * <code>SortedIntDocSet</code> represents a sorted set of Lucene Document Ids.
 */
public class SortedIntDocSet extends DocSetBase {
  protected final int[] docs;

  public SortedIntDocSet(int[] docs) {
    this.docs = docs;
  }

  public SortedIntDocSet(int[] docs, int len) {
    this(shrink(docs,len));
  }

  public int[] getDocs() { return docs; }

  public int size() { return docs.length; }

  public long memSize() {
    return (docs.length<<2)+8;
  }

  public static int[] zeroInts = new int[0];
  public static SortedIntDocSet zero = new SortedIntDocSet(zeroInts);

  public static int[] shrink(int[] arr, int newSize) {
    if (arr.length == newSize) return arr;
    int[] newArr = new int[newSize];
    System.arraycopy(arr, 0, newArr, 0, newSize);
    return newArr;
  }

  public static int intersectionSize(int[] smallerSortedList, int[] biggerSortedList) {
    final int a[] = smallerSortedList;
    final int b[] = biggerSortedList;

    // The next doc we are looking for will be much closer to the last position we tried
    // than it will be to the midpoint between last and high... so probe ahead using
    // a function of the ratio of the sizes of the sets.
    int step = (b.length/a.length)+1;

    // Since the majority of probes should be misses, we'll already be above the last probe
    // and shouldn't need to move larger than the step size on average to step over our target (and thus lower
    // the high upper bound a lot.)... but if we don't go over our target, it's a big miss... so double it.
    step = step + step;

    // FUTURE: come up with a density such that target * density == likely position?
    // then check step on one side or the other?
    // (density could be cached in the DocSet)... length/maxDoc

    // FUTURE: try partitioning like a sort algorithm.  Pick the midpoint of the big
    // array, find where that should be in the small array, and then recurse with
    // the top and bottom half of both arrays until they are small enough to use
    // a fallback intersection method.
    // NOTE: I tried this and it worked, but it was actually slower than this current
    // highly optimized approach.

    int icount = 0;
    int low = 0;
    int max = b.length-1;

    for (int i=0; i<a.length; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success! we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }

      // binary search the rest of the way
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          icount++;
          low = mid+1;  // found it, so start at next element
          break;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
    }

    return icount;
  }

  public int intersectionSize(DocSet other) {
    if (!(other instanceof SortedIntDocSet)) {
      // assume other implementations are better at random access than we are,
      // true of BitDocSet and HashDocSet.
      int icount = 0;
      for (int i=0; i<docs.length; i++) {
        if (other.exists(docs[i])) icount++;
      }
      return icount;
    }

    // make "a" the smaller set.
    int[] otherDocs = ((SortedIntDocSet)other).docs;
    final int[] a = docs.length < otherDocs.length ? docs : otherDocs;
    final int[] b = docs.length < otherDocs.length ? otherDocs : docs;

    if (a.length==0) return 0;

    // if b is 8 times bigger than a, use the modified binary search.
    if ((b.length>>3) >= a.length) {
      return intersectionSize(a,b);
    }

    // if they are close in size, just do a linear walk of both.
    int icount=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      // switch on the sign bit somehow?  Hopefully the JVM is smart enough to just test once.

      // Since set a is less dense than set b, doca is likely to be greater than docb so
      // check that case first.  This resulted in a 13% speedup.
      if (doca > docb) {
        if (++j >= b.length) break;
        docb=b[j];
      } else if (doca < docb) {
        if (++i >= a.length) break;
        doca=a[i];
      } else {
        icount++;
        if (++i >= a.length) break;
        doca=a[i];
        if (++j >= b.length) break;
        docb=b[j];
      }
    }
    return icount;
  }


  /** puts the intersection of a and b into the target array and returns the size */
  public static int intersection(int a[], int lena, int b[], int lenb, int[] target) {
    if (lena > lenb) {
      int ti=lena; lena=lenb; lenb=ti;
      int[] ta=a; a=b; b=ta;
    }

    if (lena==0) return 0;

    // if b is 8 times bigger than a, use the modified binary search.
    if ((lenb>>3) >= lena) {
      return intersectionBinarySearch(a, lena, b, lenb, target);
    }

    int icount=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      if (doca > docb) {
        if (++j >= lenb) break;
        docb=b[j];
      } else if (doca < docb) {
        if (++i >= lena) break;
        doca=a[i];
      } else {
        target[icount++] = doca;
        if (++i >= lena) break;
        doca=a[i];
        if (++j >= lenb) break;
        docb=b[j];
      }
    }
    return icount;
  }

  /** Puts the intersection of a and b into the target array and returns the size.
   * lena should be smaller than lenb */
  protected static int intersectionBinarySearch(int[] a, int lena, int[] b, int lenb, int[] target) {
    int step = (lenb/lena)+1;
    step = step + step;

    int icount = 0;
    int low = 0;
    int max = lenb-1;

    for (int i=0; i<lena; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success! we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }

      // binary search
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          target[icount++] = doca;
          low = mid+1;  // found it, so start at next element
          break;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
    }

    return icount;
  }

  @Override
  public DocSet intersection(DocSet other) {
    if (!(other instanceof SortedIntDocSet)) {
      int icount = 0;
      int arr[] = new int[docs.length];
      for (int i=0; i<docs.length; i++) {
        int doc = docs[i];
        if (other.exists(doc)) arr[icount++] = doc;
      }
      return new SortedIntDocSet(arr,icount);
    }

    int[] otherDocs = ((SortedIntDocSet)other).docs;
    int maxsz = Math.min(docs.length, otherDocs.length);
    int[] arr = new int[maxsz];
    int sz = intersection(docs, docs.length, otherDocs, otherDocs.length, arr);
    return new SortedIntDocSet(arr,sz);
  }


  protected static int andNotBinarySearch(int a[], int lena, int b[], int lenb, int[] target) {
    int step = (lenb/lena)+1;
    step = step + step;

    int count = 0;
    int low = 0;
    int max = lenb-1;

    outer:
    for (int i=0; i<lena; i++) {
      int doca = a[i];

      int high = max;

      int probe = low + step;     // 40% improvement!

      // short linear probe to see if we can drop the high pointer in one big jump.
      if (probe<high) {
        if (b[probe]>=doca) {
          // success! we cut down the upper bound by a lot in one step!
          high=probe;
        } else {
          // relative failure... we get to move the low pointer, but not by much
          low=probe+1;

          // reprobe worth it? it appears so!
          probe = low + step;
          if (probe<high) {
            if (b[probe]>=doca) {
              high=probe;
            } else {
              low=probe+1;
            }
          }
        }
      }

      // binary search
      while (low <= high) {
        int mid = (low+high) >>> 1;
        int docb = b[mid];

        if (docb < doca) {
          low = mid+1;
        }
        else if (docb > doca) {
          high = mid-1;
        }
        else {
          low = mid+1;  // found it, so start at next element
          continue outer;
        }
      }
      // Didn't find it... low is now positioned on the insertion point,
      // which is higher than what we were looking for, so continue using
      // the same low point.
      target[count++] = doca;
    }

    return count;
  }

  /** puts the intersection of a and not b into the target array and returns the size */
  public static int andNot(int a[], int lena, int b[], int lenb, int[] target) {
    if (lena==0) return 0;
    if (lenb==0) {
      System.arraycopy(a,0,target,0,lena);
      return lena;
    }

    // if b is 8 times bigger than a, use the modified binary search.
    if ((lenb>>3) >= lena) {
      return andNotBinarySearch(a, lena, b, lenb, target);
    }

    int count=0;
    int i=0,j=0;
    int doca=a[i],docb=b[j];
    for(;;) {
      if (doca > docb) {
        if (++j >= lenb) break;
        docb=b[j];
      } else if (doca < docb) {
        target[count++] = doca;
        if (++i >= lena) break;
        doca=a[i];
      } else {
        if (++i >= lena) break;
        doca=a[i];
        if (++j >= lenb) break;
        docb=b[j];
      }
    }

    int leftover=lena - i;

    if (leftover > 0) {
      System.arraycopy(a,i,target,count,leftover);
      count += leftover;
    }

    return count;
  }

  @Override
  public DocSet andNot(DocSet other) {
    if (other.size()==0) return this;

    if (!(other instanceof SortedIntDocSet)) {
      int count = 0;
      int arr[] = new int[docs.length];
      for (int i=0; i<docs.length; i++) {
        int doc = docs[i];
        if (!other.exists(doc)) arr[count++] = doc;
      }
      return new SortedIntDocSet(arr,count);
    }

    int[] otherDocs = ((SortedIntDocSet)other).docs;
    int[] arr = new int[docs.length];
    int sz = andNot(docs, docs.length, otherDocs, otherDocs.length, arr);
    return new SortedIntDocSet(arr,sz);
  }


  public boolean exists(int doc) {
    // this could be faster by estimating where in the list the doc is likely to appear,
    // but we should get away from using exists() anyway.
    int low = 0;
    int high = docs.length-1;
    // binary search
    while (low <= high) {
      int mid = (low+high) >>> 1;
      int docb = docs[mid];

      if (docb < doc) {
        low = mid+1;
      }
      else if (docb > doc) {
        high = mid-1;
      }
      else {
        return true;
      }
    }
    return false;
  }


  public DocIterator iterator() {
    return new DocIterator() {
      int pos=0;
      public boolean hasNext() {
        return pos < docs.length;
      }

      public Integer next() {
        return nextDoc();
      }

      /**
       * The remove operation is not supported by this Iterator.
       */
      public void remove() {
        throw new UnsupportedOperationException("The remove operation is not supported by this Iterator.");
      }

      public int nextDoc() {
        return docs[pos++];
      }

      public float score() {
        return 0.0f;
      }
    };
  }

  @Override
  public OpenBitSet getBits() {
    int maxDoc = size() > 0 ? docs[size()-1] : 0;
    OpenBitSet bs = new OpenBitSet(maxDoc+1);
    for (int doc : docs) {
      bs.fastSet(doc);
    }
    return bs;
  }

}
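A short usage sketch of the static helpers defined above (the arrays are invented for illustration; both inputs must already be sorted, and the target array needs room for at least the smaller input):

    // Illustrative calls of the helpers added in SortedIntDocSet.
    int[] a = {1, 5, 9, 20};
    int[] b = {2, 5, 8, 9, 14, 20, 33, 40};
    int[] target = new int[Math.min(a.length, b.length)];

    int n = SortedIntDocSet.intersection(a, a.length, b, b.length, target);
    // n == 3 and target[0..n) == {5, 9, 20}

    int sz = SortedIntDocSet.intersectionSize(a, b);  // 3; first argument must be the smaller list

    int[] rest = new int[a.length];
    int m = SortedIntDocSet.andNot(a, a.length, b, b.length, rest);
    // m == 1 and rest[0..m) == {1}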
@@ -20,6 +20,7 @@ package org.apache.solr.search;
 import junit.framework.TestCase;
 
 import java.util.Random;
+import java.util.Arrays;
 
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.util.OpenBitSetIterator;
|
@ -49,41 +50,75 @@ public class TestDocSet extends TestCase {
|
||||||
return new HashDocSet(docs,0,docs.length);
|
return new HashDocSet(docs,0,docs.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public DocSet getIntDocSet(OpenBitSet bs) {
|
||||||
|
int[] docs = new int[(int)bs.cardinality()];
|
||||||
|
OpenBitSetIterator iter = new OpenBitSetIterator(bs);
|
||||||
|
for (int i=0; i<docs.length; i++) {
|
||||||
|
docs[i] = iter.nextDoc();
|
||||||
|
}
|
||||||
|
return new SortedIntDocSet(docs);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public DocSet getBitDocSet(OpenBitSet bs) {
|
public DocSet getBitDocSet(OpenBitSet bs) {
|
||||||
return new BitDocSet(bs);
|
return new BitDocSet(bs);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DocSet getDocSet(OpenBitSet bs) {
|
public DocSet getDocSet(OpenBitSet bs) {
|
||||||
return rand.nextInt(2)==0 ? getHashDocSet(bs) : getBitDocSet(bs);
|
switch(rand.nextInt(3)) {
|
||||||
|
case 0: return getIntDocSet(bs);
|
||||||
|
case 1: return getHashDocSet(bs);
|
||||||
|
case 2: return getBitDocSet(bs);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void checkEqual(OpenBitSet bs, DocSet set) {
|
public void checkEqual(OpenBitSet bs, DocSet set) {
|
||||||
for (int i=0; i<bs.capacity(); i++) {
|
for (int i=0; i<bs.capacity(); i++) {
|
||||||
assertEquals(bs.get(i), set.exists(i));
|
assertEquals(bs.get(i), set.exists(i));
|
||||||
}
|
}
|
||||||
|
assertEquals(bs.cardinality(), set.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void iter(DocSet d1, DocSet d2) {
|
||||||
|
// HashDocSet doesn't iterate in order.
|
||||||
|
if (d1 instanceof HashDocSet || d2 instanceof HashDocSet) return;
|
||||||
|
|
||||||
|
DocIterator i1 = d1.iterator();
|
||||||
|
DocIterator i2 = d2.iterator();
|
||||||
|
|
||||||
|
assert(i1.hasNext() == i2.hasNext());
|
||||||
|
|
||||||
|
for(;;) {
|
||||||
|
boolean b1 = i1.hasNext();
|
||||||
|
boolean b2 = i2.hasNext();
|
||||||
|
assertEquals(b1,b2);
|
||||||
|
if (!b1) break;
|
||||||
|
assertEquals(i1.nextDoc(), i2.nextDoc());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void doSingle(int maxSize) {
|
protected void doSingle(int maxSize) {
|
||||||
int sz = rand.nextInt(maxSize+1);
|
int sz = rand.nextInt(maxSize+1);
|
||||||
int sz2 = rand.nextInt(maxSize);
|
int sz2 = rand.nextInt(maxSize);
|
||||||
OpenBitSet a1 = getRandomSet(sz, rand.nextInt(sz+1));
|
OpenBitSet bs1 = getRandomSet(sz, rand.nextInt(sz+1));
|
||||||
OpenBitSet a2 = getRandomSet(sz, rand.nextInt(sz2+1));
|
OpenBitSet bs2 = getRandomSet(sz, rand.nextInt(sz2+1));
|
||||||
|
|
||||||
DocSet b1 = getDocSet(a1);
|
DocSet a1 = new BitDocSet(bs1);
|
||||||
DocSet b2 = getDocSet(a2);
|
DocSet a2 = new BitDocSet(bs2);
|
||||||
|
DocSet b1 = getDocSet(bs1);
|
||||||
|
DocSet b2 = getDocSet(bs2);
|
||||||
|
|
||||||
// System.out.println("b1="+b1+", b2="+b2);
|
checkEqual(bs1,b1);
|
||||||
|
checkEqual(bs2,b2);
|
||||||
|
|
||||||
assertEquals((int)a1.cardinality(), b1.size());
|
iter(a1,b1);
|
||||||
assertEquals((int)a2.cardinality(), b2.size());
|
iter(a2,b2);
|
||||||
|
|
||||||
checkEqual(a1,b1);
|
OpenBitSet a_and = (OpenBitSet) bs1.clone(); a_and.and(bs2);
|
||||||
checkEqual(a2,b2);
|
OpenBitSet a_or = (OpenBitSet) bs1.clone(); a_or.or(bs2);
|
||||||
|
// OpenBitSet a_xor = (OpenBitSet)bs1.clone(); a_xor.xor(bs2);
|
||||||
OpenBitSet a_and = (OpenBitSet)a1.clone(); a_and.and(a2);
|
OpenBitSet a_andn = (OpenBitSet) bs1.clone(); a_andn.andNot(bs2);
|
||||||
OpenBitSet a_or = (OpenBitSet)a1.clone(); a_or.or(a2);
|
|
||||||
// OpenBitSet a_xor = (OpenBitSet)a1.clone(); a_xor.xor(a2);
|
|
||||||
OpenBitSet a_andn = (OpenBitSet)a1.clone(); a_andn.andNot(a2);
|
|
||||||
|
|
||||||
checkEqual(a_and, b1.intersection(b2));
|
checkEqual(a_and, b1.intersection(b2));
|
||||||
checkEqual(a_or, b1.union(b2));
|
checkEqual(a_or, b1.union(b2));
|
||||||
|
@@ -102,12 +137,15 @@ public class TestDocSet extends TestCase {
   }
 
   public void testRandomDocSets() {
-    doMany(300, 5000);
+    // Make the size big enough to go over certain limits (such as one set
+    // being 8 times the size of another in the int set, or going over 2 times
+    // 64 bits for the bit doc set.  Smaller sets can hit more boundary conditions though.)
+    doMany(130, 10000);
+    //doMany(130, 1000000);
   }
 
-  public HashDocSet getRandomHashDocset(int maxSetSize, int maxDoc) {
-    int n = rand.nextInt(maxSetSize);
+  public DocSet getRandomDocSet(int n, int maxDoc) {
     OpenBitSet obs = new OpenBitSet(maxDoc);
     int[] a = new int[n];
     for (int i=0; i<n; i++) {
@@ -118,14 +156,29 @@
         break;
       }
     }
-    return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
+
+    if (n <= smallSetCuttoff) {
+      if (smallSetType ==0) {
+        Arrays.sort(a);
+        return new SortedIntDocSet(a);
+      } else if (smallSetType ==1) {
+        Arrays.sort(a);
+        return loadfactor!=0 ? new HashDocSet(a,0,n,1/loadfactor) : new HashDocSet(a,0,n);
+      }
+    }
+
+    return new BitDocSet(obs, n);
   }
 
-  public DocSet[] getRandomHashSets(int nSets, int maxSetSize, int maxDoc) {
+  public DocSet[] getRandomSets(int nSets, int minSetSize, int maxSetSize, int maxDoc) {
     DocSet[] sets = new DocSet[nSets];
 
     for (int i=0; i<nSets; i++) {
-      sets[i] = getRandomHashDocset(maxSetSize,maxDoc);
+      int sz;
+      sz = rand.nextInt(maxSetSize-minSetSize+1)+minSetSize;
+      // different distribution
+      // sz = (maxSetSize+1)/(rand.nextInt(maxSetSize)+1) + minSetSize;
+      sets[i] = getRandomDocSet(sz,maxDoc);
     }
 
     return sets;
@@ -160,30 +213,43 @@
   }
   ***/
 
+  public static int smallSetType = 0;  // 0==sortedint, 1==hash, 2==openbitset
+  public static int smallSetCuttoff=3000;
+
   /***
   public void testIntersectionSizePerformance() {
-    loadfactor=.75f;
-    rand=new Random(12345);  // make deterministic
-    int maxSetsize=4000;
-    int nSets=128;
-    int iter=10;
+    loadfactor=.75f;     // for HashDocSet
+    rand=new Random(1);  // make deterministic
+
+    int minBigSetSize=1,maxBigSetSize=30000;
+    int minSmallSetSize=1,maxSmallSetSize=30000;
+    int nSets=1024;
+    int iter=1;
     int maxDoc=1000000;
-    DocSet[] sets = getRandomHashSets(nSets,maxSetsize, maxDoc);
+
+    smallSetCuttoff = maxDoc>>6; // break even for SortedIntSet is /32... but /64 is better for performance
+    // smallSetCuttoff = maxDoc;
+
+    DocSet[] bigsets = getRandomSets(nSets, minBigSetSize, maxBigSetSize, maxDoc);
+    DocSet[] smallsets = getRandomSets(nSets, minSmallSetSize, maxSmallSetSize, maxDoc);
     int ret=0;
     long start=System.currentTimeMillis();
     for (int i=0; i<iter; i++) {
-      for (DocSet s1 : sets) {
-        for (DocSet s2 : sets) {
+      for (DocSet s1 : bigsets) {
+        for (DocSet s2 : smallsets) {
           ret += s1.intersectionSize(s2);
         }
       }
     }
     long end=System.currentTimeMillis();
-    System.out.println("testIntersectionSizePerformance="+(end-start)+" ms");
-    if (ret==-1)System.out.println("wow!");
+    System.out.println("intersectionSizePerformance="+(end-start)+" ms");
+    System.out.println("ret="+ret);
   }
+  ***/
 
+  /****
   public void testExistsPerformance() {
     loadfactor=.75f;
     rand=new Random(12345);  // make deterministic