SOLR-7214: optimize multi-valued counts-only case

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1669712 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2015-03-28 00:02:40 +00:00
parent 0e2def6f5c
commit a4274dff9a
4 changed files with 133 additions and 255 deletions
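For orientation: the change routes the common counts-only case — no sub-accumulators, and a term range spanning the whole field — through a bulk path (getCountsInArray) that writes straight into the count array, instead of the generic per-document collect loop. A minimal standalone sketch of that distinction, with invented names rather than the real Solr classes; the actual dispatch is visible in collectDocs in the UnInvertedField diff below.

    public class CountsOnlySketch {
      interface SlotCollector { void collect(int doc, int slot); }

      // Generic path: one callback per (doc, term) value; each callback may update
      // several accumulators, which is wasted work when only counts are wanted.
      static void collectGeneric(int[][] termOrdsPerDoc, SlotCollector collector) {
        for (int doc = 0; doc < termOrdsPerDoc.length; doc++) {
          for (int ord : termOrdsPerDoc[doc]) {
            collector.collect(doc, ord);
          }
        }
      }

      // Counts-only path: increment the slot counts directly, no per-value callback.
      static void collectCountsOnly(int[][] termOrdsPerDoc, int[] counts) {
        for (int[] ords : termOrdsPerDoc) {
          for (int ord : ords) {
            counts[ord]++;
          }
        }
      }

      public static void main(String[] args) {
        int[][] docs = { {0, 2}, {2}, {1, 2} };   // term ordinals per document
        int[] counts = new int[3];
        collectCountsOnly(docs, counts);
        System.out.println(java.util.Arrays.toString(counts)); // [1, 1, 3]
      }
    }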

View File

@@ -134,10 +134,12 @@ public class FacetField extends FacetRequest {
abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
SchemaField sf;
SlotAcc sortAcc;
int effectiveMincount;
FacetFieldProcessor(FacetContext fcontext, FacetField freq, SchemaField sf) {
super(fcontext, freq);
this.sf = sf;
this.effectiveMincount = (int)(fcontext.isShard() ? Math.min(1 , freq.mincount) : freq.mincount);
}
@Override
@@ -252,7 +254,6 @@ abstract class FacetFieldProcessorFCBase extends FacetFieldProcessor {
};
Slot bottom = null;
int effectiveMincount = (int)(fcontext.isShard() ? Math.min(1 , freq.mincount) : freq.mincount);
for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
if (countAcc.getCount(i) < effectiveMincount) {
continue;
@@ -304,8 +305,8 @@ abstract class FacetFieldProcessorFCBase extends FacetFieldProcessor {
if (freq.allBuckets) {
SimpleOrderedMap<Object> allBuckets = new SimpleOrderedMap<>();
for (SlotAcc acc : accs) {
countAcc.setValues(allBuckets, allBucketsSlot);
for (SlotAcc acc : accs) {
acc.setValues(allBuckets, allBucketsSlot);
}
res.add("allBuckets", allBuckets);
@@ -648,7 +649,6 @@ class FacetFieldProcessorStream extends FacetFieldProcessor implements Closeable
}
public SimpleOrderedMap<Object> _nextBucket() throws IOException {
int effectiveMincount = (int)(fcontext.isShard() ? Math.min(1 , freq.mincount) : freq.mincount);
DocSet termSet = null;
try {
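The new effectiveMincount field caches the shard-aware pruning threshold used in the three hunks above. A one-method restatement of the expression, with the usual distributed-faceting rationale stated as an assumption: a shard must not prune at the full mincount, because the coordinator applies the real threshold only after merging counts from all shards.

    // Sketch restating the expression from the constructor above (rationale in the comment is assumed).
    static int effectiveMincount(boolean isShard, long mincount) {
      // On a shard, cap the threshold at 1: a term below the requested mincount on this
      // shard may still reach it once counts from all shards are merged.
      return (int) (isShard ? Math.min(1, mincount) : mincount);
    }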

View File

@@ -290,6 +290,11 @@ class CountSlotAcc extends IntSlotAcc {
return result[slot];
}
// internal and expert
int[] getCountArray() {
return result;
}
@Override
public void reset() {
super.reset();
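getCountArray() intentionally exposes the backing int[] (hence the "internal and expert" note) so a caller can fill counts in bulk; UnInvertedField.getCountsInArray below is the caller this commit adds. A standalone sketch of the pattern, using invented class names rather than the Solr accumulators:

    class IntSlotCounter {
      private final int[] result;
      IntSlotCounter(int numSlots) { result = new int[numSlots]; }

      int getCount(int slot) { return result[slot]; }

      // internal and expert: callers may fill this array directly
      int[] getCountArray() { return result; }
    }

    class BulkFillDemo {
      public static void main(String[] args) {
        IntSlotCounter acc = new IntSlotCounter(4);
        int[] raw = acc.getCountArray();
        int[] observedOrds = {0, 2, 2, 3};          // pretend per-document term ordinals
        for (int ord : observedOrds) raw[ord]++;    // bulk path writes directly
        System.out.println(acc.getCount(2));        // 2 — visible through the accumulator
      }
    }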

View File

@@ -28,37 +28,23 @@ import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.uninverting.DocTermOrds;
import org.apache.lucene.uninverting.UninvertingReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.FieldFacetStats;
import org.apache.solr.handler.component.StatsField;
import org.apache.solr.handler.component.StatsValues;
import org.apache.solr.handler.component.StatsValuesFactory;
import org.apache.solr.request.SimpleFacets;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.LongPriorityQueue;
import org.apache.solr.util.PrimUtils;
/**
*
@@ -94,6 +80,7 @@ public class UnInvertedField extends DocTermOrds {
private static int TNUM_OFFSET=2;
static class TopTerm {
Query termQuery;
BytesRef term;
int termNum;
@@ -127,7 +114,7 @@ public class UnInvertedField extends DocTermOrds {
if (termNum >= maxTermCounts.length) {
// resize by doubling - for very large number of unique terms, expanding
// by 4K and resultant GC will dominate uninvert times. Resize at end if material
int[] newMaxTermCounts = new int[maxTermCounts.length*2];
int[] newMaxTermCounts = new int[ Math.min(Integer.MAX_VALUE-16, maxTermCounts.length*2) ];
System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
maxTermCounts = newMaxTermCounts;
}
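The resize now clamps the doubled length just below Integer.MAX_VALUE so growth can never request an illegal array size. A sketch of the same policy; the long arithmetic here is an extra assumption of mine to keep the doubled length from overflowing before the clamp is applied:

    // Capped grow-by-doubling sketch (not the Solr code itself).
    static int[] grow(int[] current, int requiredIndex) {
      if (requiredIndex < current.length) return current;
      long doubled = Math.max(requiredIndex + 1L, current.length * 2L);
      int newSize = (int) Math.min(Integer.MAX_VALUE - 16L, doubled);
      int[] resized = new int[newSize];
      System.arraycopy(current, 0, resized, 0, current.length);
      return resized;
    }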
@@ -138,6 +125,8 @@ public class UnInvertedField extends DocTermOrds {
TopTerm topTerm = new TopTerm();
topTerm.term = BytesRef.deepCopyOf(term);
topTerm.termNum = termNum;
topTerm.termQuery = new TermQuery(new Term(field, topTerm.term));
bigTerms.put(topTerm.termNum, topTerm);
if (deState == null) {
@@ -188,7 +177,6 @@ public class UnInvertedField extends DocTermOrds {
// small.
searcher.maxDoc()/20 + 2,
DEFAULT_INDEX_INTERVAL_BITS);
//System.out.println("maxTermDocFreq=" + maxTermDocFreq + " maxDoc=" + searcher.maxDoc());
final String prefix = TrieField.getMainValuePrefix(searcher.getSchema().getFieldType(field));
this.searcher = searcher;
@@ -235,7 +223,7 @@ public class UnInvertedField extends DocTermOrds {
bigTermNums = new int[bigTerms.size()];
int i=0;
for (TopTerm tt : bigTerms.values()) {
bigTermSets[i] = searcher.getDocSet(new TermQuery(new Term(field, tt.term)));
bigTermSets[i] = searcher.getDocSet(tt.termQuery);
bigTermNums[i] = tt.termNum;
i++;
}
@@ -315,130 +303,69 @@ public class UnInvertedField extends DocTermOrds {
}
public NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, Integer mincount, boolean missing, String sort, String prefix) throws IOException {
use.incrementAndGet();
FieldType ft = searcher.getSchema().getFieldType(field);
NamedList<Integer> res = new NamedList<>(); // order is important
DocSet docs = baseDocs;
private void getCountsInArray(FacetFieldProcessorUIF processor, int[] counts) throws IOException {
DocSet docs = processor.fcontext.base;
int baseSize = docs.size();
int maxDoc = searcher.maxDoc();
//System.out.println("GET COUNTS field=" + field + " baseSize=" + baseSize + " minCount=" + mincount + " maxDoc=" + maxDoc + " numTermsInField=" + numTermsInField);
if (baseSize >= mincount) {
if (baseSize < processor.effectiveMincount) {
return;
}
final int[] index = this.index;
// tricky: we add one more element than we need because we will reuse this array later
// for ordering term ords before converting to term labels.
final int[] counts = new int[numTermsInField + 1];
//
// If there is prefix, find its start and end term numbers
//
int startTerm = 0;
int endTerm = numTermsInField; // one past the end
TermsEnum te = getOrdTermsEnum(searcher.getLeafReader());
if (te != null && prefix != null && prefix.length() > 0) {
final BytesRefBuilder prefixBr = new BytesRefBuilder();
prefixBr.copyChars(prefix);
if (te.seekCeil(prefixBr.get()) == TermsEnum.SeekStatus.END) {
startTerm = numTermsInField;
} else {
startTerm = (int) te.ord();
}
prefixBr.append(UnicodeUtil.BIG_TERM);
if (te.seekCeil(prefixBr.get()) == TermsEnum.SeekStatus.END) {
endTerm = numTermsInField;
} else {
endTerm = (int) te.ord();
}
}
/***********
// Alternative 2: get the docSet of the prefix (could take a while) and
// then do the intersection with the baseDocSet first.
if (prefix != null && prefix.length() > 0) {
docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs);
// The issue with this method are problems of returning 0 counts for terms w/o
// the prefix. We can't just filter out those terms later because it may
// mean that we didn't collect enough terms in the queue (in the sorted case).
}
***********/
boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0
&& startTerm==0 && endTerm==numTermsInField
&& docs instanceof BitDocSet;
boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet;
if (doNegative) {
FixedBitSet bs = ((BitDocSet)docs).getBits().clone();
FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
bs.flip(0, maxDoc);
// TODO: when iterator across negative elements is available, use that
// instead of creating a new bitset and inverting.
docs = new BitDocSet(bs, maxDoc - baseSize);
// simply negating will mean that we have deleted docs in the set.
// that should be OK, as their entries in our table should be empty.
//System.out.println(" NEG");
}
// For the biggest terms, do straight set intersections
for (TopTerm tt : bigTerms.values()) {
//System.out.println(" do big termNum=" + tt.termNum + " term=" + tt.term.utf8ToString());
// TODO: counts could be deferred if sorted==false
if (tt.termNum >= startTerm && tt.termNum < endTerm) {
counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(field, tt.term)), docs);
//System.out.println(" count=" + counts[tt.termNum]);
} else {
//System.out.println("SKIP term=" + tt.termNum);
}
// TODO: counts could be deferred if sorting by index order
counts[tt.termNum] = searcher.numDocs(tt.termQuery, docs);
}
// TODO: we could short-circuit counting altogether for sorted faceting
// where we already have enough terms from the bigTerms
// TODO: we could shrink the size of the collection array, and
// additionally break when the termNumber got above endTerm, but
// it would require two extra conditionals in the inner loop (although
// they would be predictable for the non-prefix case).
// Perhaps a different copy of the code would be warranted.
if (termInstances > 0) {
DocIterator iter = docs.iterator();
while (iter.hasNext()) {
int doc = iter.nextDoc();
//System.out.println("iter doc=" + doc);
int code = index[doc];
if ((code & 0xff)==1) {
//System.out.println(" ptr");
int pos = code>>>8;
if ((code & 0xff) == 1) {
int pos = code >>> 8;
int whichArray = (doc >>> 16) & 0xff;
byte[] arr = tnums[whichArray];
int tnum = 0;
for(;;) {
for (; ; ) {
int delta = 0;
for(;;) {
for (; ; ) {
byte b = arr[pos++];
delta = (delta << 7) | (b & 0x7f);
if ((b & 0x80) == 0) break;
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
//System.out.println(" tnum=" + tnum);
counts[tnum]++;
}
} else {
//System.out.println(" inlined");
int tnum = 0;
int delta = 0;
for (;;) {
for (; ; ) {
delta = (delta << 7) | (code & 0x7f);
if ((code & 0x80)==0) {
if (delta==0) break;
if ((code & 0x80) == 0) {
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
//System.out.println(" tnum=" + tnum);
counts[tnum]++;
delta = 0;
}
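The two inner loops above decode the packed term-number lists: each term number is stored as a delta from the previous one, offset by TNUM_OFFSET (2) so that an encoded 0 can terminate the list, and each delta is written as 7-bit groups with the high bit as a continuation flag. A standalone decoding sketch of that byte format (invented names; the inlined-in-the-index variant handled by the else branch is omitted):

    class TnumDecodeSketch {
      static final int TNUM_OFFSET = 2;

      static java.util.List<Integer> decode(byte[] arr, int pos) {
        java.util.List<Integer> termNums = new java.util.ArrayList<>();
        int tnum = 0;
        for (;;) {
          int delta = 0;
          for (;;) {
            byte b = arr[pos++];
            delta = (delta << 7) | (b & 0x7f);   // accumulate 7 bits at a time
            if ((b & 0x80) == 0) break;          // high bit clear: last byte of this vInt
          }
          if (delta == 0) break;                 // encoded 0 marks the end of the list
          tnum += delta - TNUM_OFFSET;           // undo the offset, apply the delta
          termNums.add(tnum);
        }
        return termNums;
      }

      public static void main(String[] args) {
        // term numbers 3, 5, 6 encoded as (delta + 2) values 5, 4, 3 and a trailing 0
        byte[] encoded = {5, 4, 3, 0};
        System.out.println(decode(encoded, 0));  // [3, 5, 6]
      }
    }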
@@ -447,139 +374,77 @@ public class UnInvertedField extends DocTermOrds {
}
}
}
final CharsRefBuilder charsRef = new CharsRefBuilder();
int off=offset;
int lim=limit>=0 ? limit : Integer.MAX_VALUE;
if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
maxsize = Math.min(maxsize, numTermsInField);
LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE);
int min=mincount-1; // the smallest value in the top 'N' values
//System.out.println("START=" + startTerm + " END=" + endTerm);
for (int i=startTerm; i<endTerm; i++) {
int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
if (c>min) {
// NOTE: we use c>min rather than c>=min as an optimization because we are going in
// index order, so we already know that the keys are ordered. This can be very
// important if a lot of the counts are repeated (like zero counts would be).
// smaller term numbers sort higher, so subtract the term number instead
long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i);
boolean displaced = queue.insert(pair);
if (displaced) min=(int)(queue.top() >>> 32);
if (doNegative) {
for (int i=0; i<numTermsInField; i++) {
counts[i] = maxTermCounts[i] - counts[i];
}
}
// now select the right page from the results
// if we are deep paging, we don't have to order the highest "offset" counts.
int collectCount = Math.max(0, queue.size() - off);
assert collectCount <= lim;
// the start and end indexes of our list "sorted" (starting with the highest value)
int sortedIdxStart = queue.size() - (collectCount - 1);
int sortedIdxEnd = queue.size() + 1;
final long[] sorted = queue.sort(collectCount);
final int[] indirect = counts; // reuse the counts array for the index into the tnums array
assert indirect.length >= sortedIdxEnd;
for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
long pair = sorted[i];
int c = (int)(pair >>> 32);
int tnum = Integer.MAX_VALUE - (int)pair;
indirect[i] = i; // store the index for indirect sorting
sorted[i] = tnum; // reuse the "sorted" array to store the term numbers for indirect sorting
// add a null label for now... we'll fill it in later.
res.add(null, c);
if (processor.allBucketsSlot >= 0) {
int all = 0; // overflow potential
for (int i=0; i<numTermsInField; i++) {
all += counts[i];
}
// now sort the indexes by the term numbers
PrimUtils.sort(sortedIdxStart, sortedIdxEnd, indirect, new PrimUtils.IntComparator() {
@Override
public int compare(int a, int b) {
return (int)sorted[a] - (int)sorted[b];
}
@Override
public boolean lessThan(int a, int b) {
return sorted[a] < sorted[b];
}
@Override
public boolean equals(int a, int b) {
return sorted[a] == sorted[b];
}
});
// convert the term numbers to term values and set
// as the label
//System.out.println("sortStart=" + sortedIdxStart + " end=" + sortedIdxEnd);
for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
int idx = indirect[i];
int tnum = (int)sorted[idx];
final String label = getReadableValue(getTermValue(te, tnum), ft, charsRef);
//System.out.println(" label=" + label);
res.setName(idx - sortedIdxStart, label);
}
} else {
// add results in index order
int i=startTerm;
if (mincount<=0) {
// if mincount<=0, then we won't discard any terms and we know exactly
// where to start.
i=startTerm+off;
off=0;
}
for (; i<endTerm; i++) {
int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
if (c<mincount || --off>=0) continue;
if (--lim<0) break;
final String label = getReadableValue(getTermValue(te, i), ft, charsRef);
res.add(label, c);
counts[processor.allBucketsSlot] = all;
}
}
public void collectDocs(FacetFieldProcessorUIF processor) throws IOException {
if (processor.accs.length == 0 && processor.startTermIndex == 0 && processor.endTermIndex >= numTermsInField)
{
int[] arr = processor.countAcc.getCountArray();
getCountsInArray(processor, arr);
/*** debugging
int sz = processor.countAcc.getCountArray().length;
CountSlotAcc acc = processor.countAcc;
CountSlotAcc acc2 = new CountSlotAcc(processor.fcontext, sz);
processor.countAcc = acc2;
collectDocsGeneric(processor); // hopefully we can call this again?
for (int i=0; i<sz; i++) {
if (acc.getCount(i) != acc2.getCount(i)) {
System.out.println("ERROR! ERROR! i=" + i + " counts=" + acc.getCount(i) + " " + acc2.getCount(i));
CountSlotAcc acc3 = new CountSlotAcc(processor.fcontext, sz); // put breakpoint here and re-execute
processor.countAcc = acc3;
int[] arr3 = processor.countAcc.getCountArray();
getCountsInArray(processor, arr3);
}
}
***/
return;
}
if (missing) {
// TODO: a faster solution for this?
res.add(null, SimpleFacets.getFieldMissingCount(searcher, baseDocs, field));
collectDocsGeneric(processor);
}
//System.out.println(" res=" + res);
return res;
}
// called from FieldFacetProcessor
// TODO: do a callback version that can be specialized!
public void collectDocs(FacetFieldProcessorUIF processor) throws IOException {
public void collectDocsGeneric(FacetFieldProcessorUIF processor) throws IOException {
use.incrementAndGet();
DocSet docs = processor.fcontext.base;
int startTermIndex = processor.startTermIndex;
int endTermIndex = processor.endTermIndex;
int nTerms = processor.nTerms;
DocSet docs = processor.fcontext.base;
int uniqueTerms = 0;
for (TopTerm tt : bigTerms.values()) {
if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
// handle the biggest terms
try ( DocSet intersection = searcher.getDocSet(new TermQuery(new Term(field, tt.term)), docs); )
try ( DocSet intersection = searcher.getDocSet(tt.termQuery, docs); )
{
int collected = processor.collect(intersection, tt.termNum - startTermIndex);
processor.countAcc.incrementCount(tt.termNum - startTermIndex, collected);
if (processor.allBucketsSlot >= 0) {
processor.collect(intersection, processor.allBucketsSlot);
processor.countAcc.incrementCount(processor.allBucketsSlot, collected);
}
if (collected > 0) {
uniqueTerms++;
}
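One trick in getCountsInArray worth spelling out: when the base doc set covers more than half of maxDoc (doNegative), counting runs over the flipped bitset — the smaller complement — and the real counts are recovered afterwards as maxTermCounts[i] - counts[i], so the counting loop touches fewer documents. A toy illustration of that final subtraction step:

    public class ComplementCountSketch {
      // counts over matching docs = per-term totals - counts over the non-matching docs
      static int[] countsViaComplement(int[] totalPerTerm, int[] complementCounts) {
        int[] counts = new int[totalPerTerm.length];
        for (int i = 0; i < counts.length; i++) {
          counts[i] = totalPerTerm[i] - complementCounts[i];
        }
        return counts;
      }

      public static void main(String[] args) {
        int[] totals = {5, 1, 4};            // analogue of maxTermCounts, recorded at uninvert time
        int[] overComplement = {2, 0, 3};    // counted over the flipped (non-matching) doc set
        System.out.println(java.util.Arrays.toString(countsViaComplement(totals, overComplement)));
        // [3, 1, 1]
      }
    }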

View File

@@ -27,6 +27,8 @@ import java.util.Random;
import com.tdunning.math.stats.AVLTreeDigest;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.solr.JSONTestUtil;
import org.apache.solr.SolrTestCaseHS;
import org.apache.solr.common.SolrInputDocument;
@@ -619,6 +621,14 @@ public class TestJsonFacets extends SolrTestCaseHS {
" }"
);
// test allBucket multi-valued
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{x:{terms:{field:'${multi_ss}',allBuckets:true}}}"
)
, "facets=={ count:6, " +
"x:{ buckets:[{val:a, count:3}, {val:b, count:3}] , allBuckets:{count:6} } }"
);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// test converting legacy facets
@@ -697,8 +707,8 @@ public class TestJsonFacets extends SolrTestCaseHS {
doStats( client, params() );
}
/***
public void testPercentiles() {
public void XtestPercentiles() {
AVLTreeDigest catA = new AVLTreeDigest(100);
catA.add(4);
catA.add(2);
@@ -728,11 +738,10 @@
}
return sb.toString();
}
***/
/*** test code to ensure TDigest is working as we expect.
@Test
public void testTDigest() throws Exception {
/*** test code to ensure TDigest is working as we expect. */
public void XtestTDigest() throws Exception {
AVLTreeDigest t1 = new AVLTreeDigest(100);
t1.add(10, 1);
t1.add(90, 1);
@@ -779,5 +788,4 @@ public class TestJsonFacets extends SolrTestCaseHS {
System.out.println(top.quantile(0.5));
System.out.println(top.quantile(0.9));
}
******/
}