From a4274dff9acd2eb3f30d2d96dc6224f645cb44f8 Mon Sep 17 00:00:00 2001
From: Yonik Seeley
Date: Sat, 28 Mar 2015 00:02:40 +0000
Subject: [PATCH] SOLR-7214: optimize multi-valued counts-only case

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1669712 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/solr/search/facet/FacetField.java  |   6 +-
 .../org/apache/solr/search/facet/SlotAcc.java |   5 +
 .../solr/search/facet/UnInvertedField.java    | 355 ++++++------
 .../solr/search/facet/TestJsonFacets.java     |  22 +-
 4 files changed, 133 insertions(+), 255 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetField.java b/solr/core/src/java/org/apache/solr/search/facet/FacetField.java
index daede77f9c3..c81cabded9a 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/FacetField.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetField.java
@@ -134,10 +134,12 @@ public class FacetField extends FacetRequest {
 abstract class FacetFieldProcessor extends FacetProcessor {
   SchemaField sf;
   SlotAcc sortAcc;
+  int effectiveMincount;

   FacetFieldProcessor(FacetContext fcontext, FacetField freq, SchemaField sf) {
     super(fcontext, freq);
     this.sf = sf;
+    this.effectiveMincount = (int)(fcontext.isShard() ? Math.min(1 , freq.mincount) : freq.mincount);
   }

   @Override
@@ -252,7 +254,6 @@ abstract class FacetFieldProcessorFCBase extends FacetFieldProcessor {
     };

     Slot bottom = null;
-    int effectiveMincount = (int)(fcontext.isShard() ? Math.min(1 , freq.mincount) : freq.mincount);
     for (int i = (startTermIndex == -1) ? 1 : 0; i < nTerms; i++) {
       if (countAcc.getCount(i) < effectiveMincount) {
         continue;
       }
@@ -304,8 +305,8 @@ abstract class FacetFieldProcessorFCBase extends FacetFieldProcessor {

     if (freq.allBuckets) {
       SimpleOrderedMap allBuckets = new SimpleOrderedMap<>();
+      countAcc.setValues(allBuckets, allBucketsSlot);
       for (SlotAcc acc : accs) {
-        countAcc.setValues(allBuckets, allBucketsSlot);
         acc.setValues(allBuckets, allBucketsSlot);
       }
       res.add("allBuckets", allBuckets);
@@ -648,7 +649,6 @@ class FacetFieldProcessorStream extends FacetFieldProcessor implements Closeable
   }

   public SimpleOrderedMap _nextBucket() throws IOException {
-    int effectiveMincount = (int)(fcontext.isShard() ? Math.min(1 , freq.mincount) : freq.mincount);

     DocSet termSet = null;
     try {
diff --git a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
index 9485388843e..aa6750e4690 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java
@@ -290,6 +290,11 @@ class CountSlotAcc extends IntSlotAcc {
     return result[slot];
   }

+  // internal and expert
+  int[] getCountArray() {
+    return result;
+  }
+
   @Override
   public void reset() {
     super.reset();
diff --git a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
index 6b4ac8b201e..d239918475e 100644
--- a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java
@@ -28,37 +28,23 @@ import java.util.concurrent.atomic.AtomicLong;

 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.uninverting.DocTermOrds;
-import org.apache.lucene.uninverting.UninvertingReader;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.solr.common.SolrException;
-import org.apache.solr.common.params.FacetParams;
-import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
-import org.apache.solr.handler.component.FieldFacetStats;
-import org.apache.solr.handler.component.StatsField;
-import org.apache.solr.handler.component.StatsValues;
-import org.apache.solr.handler.component.StatsValuesFactory;
-import org.apache.solr.request.SimpleFacets;
 import org.apache.solr.schema.FieldType;
-import org.apache.solr.schema.SchemaField;
 import org.apache.solr.schema.TrieField;
 import org.apache.solr.search.BitDocSet;
 import org.apache.solr.search.DocIterator;
 import org.apache.solr.search.DocSet;
 import org.apache.solr.search.SolrCache;
 import org.apache.solr.search.SolrIndexSearcher;
-import org.apache.solr.util.LongPriorityQueue;
-import org.apache.solr.util.PrimUtils;

 /**
  *
@@ -94,6 +80,7 @@ public class UnInvertedField extends DocTermOrds {
   private static int TNUM_OFFSET=2;

   static class TopTerm {
+    Query termQuery;
     BytesRef term;
     int termNum;

@@ -127,7 +114,7 @@ public class UnInvertedField extends DocTermOrds {
     if (termNum >= maxTermCounts.length) {
       // resize by doubling - for very large number of unique terms, expanding
       // by 4K and resultant GC will dominate uninvert times.  Resize at end if material
-      int[] newMaxTermCounts = new int[maxTermCounts.length*2];
+      int[] newMaxTermCounts = new int[ Math.min(Integer.MAX_VALUE-16, maxTermCounts.length*2) ];
       System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
       maxTermCounts = newMaxTermCounts;
     }
@@ -138,6 +125,8 @@ public class UnInvertedField extends DocTermOrds {
     TopTerm topTerm = new TopTerm();
     topTerm.term = BytesRef.deepCopyOf(term);
     topTerm.termNum = termNum;
+    topTerm.termQuery = new TermQuery(new Term(field, topTerm.term));
+
     bigTerms.put(topTerm.termNum, topTerm);

     if (deState == null) {
@@ -188,7 +177,6 @@ public class UnInvertedField extends DocTermOrds {
           // small.
           searcher.maxDoc()/20 + 2,
           DEFAULT_INDEX_INTERVAL_BITS);
-    //System.out.println("maxTermDocFreq=" + maxTermDocFreq + " maxDoc=" + searcher.maxDoc());

     final String prefix = TrieField.getMainValuePrefix(searcher.getSchema().getFieldType(field));
     this.searcher = searcher;
@@ -235,7 +223,7 @@ public class UnInvertedField extends DocTermOrds {
     bigTermNums = new int[bigTerms.size()];
     int i=0;
     for (TopTerm tt : bigTerms.values()) {
-      bigTermSets[i] = searcher.getDocSet(new TermQuery(new Term(field, tt.term)));
+      bigTermSets[i] = searcher.getDocSet(tt.termQuery);
       bigTermNums[i] = tt.termNum;
       i++;
     }
@@ -315,271 +303,148 @@ public class UnInvertedField extends DocTermOrds {
   }

-  public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, Integer mincount, boolean missing, String sort, String prefix) throws IOException {
-    use.incrementAndGet();
-    FieldType ft = searcher.getSchema().getFieldType(field);
-
-    NamedList res = new NamedList<>();  // order is important
-
-    DocSet docs = baseDocs;
+  private void getCountsInArray(FacetFieldProcessorUIF processor, int[] counts) throws IOException {
+    DocSet docs = processor.fcontext.base;
     int baseSize = docs.size();
     int maxDoc = searcher.maxDoc();

-    //System.out.println("GET COUNTS field=" + field + " baseSize=" + baseSize + " minCount=" + mincount + " maxDoc=" + maxDoc + " numTermsInField=" + numTermsInField);
-    if (baseSize >= mincount) {
+    if (baseSize < processor.effectiveMincount) {
+      return;
+    }

-      final int[] index = this.index;
-      // tricky: we add more more element than we need because we will reuse this array later
-      // for ordering term ords before converting to term labels.
-      final int[] counts = new int[numTermsInField + 1];
+    final int[] index = this.index;

-      //
-      // If there is prefix, find its start and end term numbers
-      //
-      int startTerm = 0;
-      int endTerm = numTermsInField;  // one past the end
+    boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet;

-      TermsEnum te = getOrdTermsEnum(searcher.getLeafReader());
-      if (te != null && prefix != null && prefix.length() > 0) {
-        final BytesRefBuilder prefixBr = new BytesRefBuilder();
-        prefixBr.copyChars(prefix);
-        if (te.seekCeil(prefixBr.get()) == TermsEnum.SeekStatus.END) {
-          startTerm = numTermsInField;
+    if (doNegative) {
+      FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
+      bs.flip(0, maxDoc);
+      // TODO: when iterator across negative elements is available, use that
+      // instead of creating a new bitset and inverting.
+      docs = new BitDocSet(bs, maxDoc - baseSize);
+      // simply negating will mean that we have deleted docs in the set.
+      // that should be OK, as their entries in our table should be empty.
+    }
+
+    // For the biggest terms, do straight set intersections
+    for (TopTerm tt : bigTerms.values()) {
+      // TODO: counts could be deferred if sorting by index order
+      counts[tt.termNum] = searcher.numDocs(tt.termQuery, docs);
+    }
+
+    // TODO: we could short-circuit counting altogether for sorted faceting
+    // where we already have enough terms from the bigTerms
+
+    if (termInstances > 0) {
+      DocIterator iter = docs.iterator();
+      while (iter.hasNext()) {
+        int doc = iter.nextDoc();
+        int code = index[doc];
+
+        if ((code & 0xff) == 1) {
+          int pos = code >>> 8;
+          int whichArray = (doc >>> 16) & 0xff;
+          byte[] arr = tnums[whichArray];
+          int tnum = 0;
+          for (; ; ) {
+            int delta = 0;
+            for (; ; ) {
+              byte b = arr[pos++];
+              delta = (delta << 7) | (b & 0x7f);
+              if ((b & 0x80) == 0) break;
+            }
+            if (delta == 0) break;
+            tnum += delta - TNUM_OFFSET;
+            counts[tnum]++;
+          }
         } else {
-          startTerm = (int) te.ord();
-        }
-        prefixBr.append(UnicodeUtil.BIG_TERM);
-        if (te.seekCeil(prefixBr.get()) == TermsEnum.SeekStatus.END) {
-          endTerm = numTermsInField;
-        } else {
-          endTerm = (int) te.ord();
-        }
-      }
-
-      /***********
-      // Alternative 2: get the docSet of the prefix (could take a while) and
-      // then do the intersection with the baseDocSet first.
-      if (prefix != null && prefix.length() > 0) {
-        docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs);
-        // The issue with this method are problems of returning 0 counts for terms w/o
-        // the prefix.  We can't just filter out those terms later because it may
-        // mean that we didn't collect enough terms in the queue (in the sorted case).
-      }
-      ***********/
-
-      boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0
-              && startTerm==0 && endTerm==numTermsInField
-              && docs instanceof BitDocSet;
-
-      if (doNegative) {
-        FixedBitSet bs = ((BitDocSet)docs).getBits().clone();
-        bs.flip(0, maxDoc);
-        // TODO: when iterator across negative elements is available, use that
-        // instead of creating a new bitset and inverting.
-        docs = new BitDocSet(bs, maxDoc - baseSize);
-        // simply negating will mean that we have deleted docs in the set.
-        // that should be OK, as their entries in our table should be empty.
-        //System.out.println("  NEG");
-      }
-
-      // For the biggest terms, do straight set intersections
-      for (TopTerm tt : bigTerms.values()) {
-        //System.out.println("  do big termNum=" + tt.termNum + " term=" + tt.term.utf8ToString());
-        // TODO: counts could be deferred if sorted==false
-        if (tt.termNum >= startTerm && tt.termNum < endTerm) {
-          counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(field, tt.term)), docs);
-          //System.out.println("  count=" + counts[tt.termNum]);
-        } else {
-          //System.out.println("SKIP term=" + tt.termNum);
-        }
-      }
-
-      // TODO: we could short-circuit counting altogether for sorted faceting
-      // where we already have enough terms from the bigTerms
-
-      // TODO: we could shrink the size of the collection array, and
-      // additionally break when the termNumber got above endTerm, but
-      // it would require two extra conditionals in the inner loop (although
-      // they would be predictable for the non-prefix case).
-      // Perhaps a different copy of the code would be warranted.
-
-      if (termInstances > 0) {
-        DocIterator iter = docs.iterator();
-        while (iter.hasNext()) {
-          int doc = iter.nextDoc();
-          //System.out.println("iter doc=" + doc);
-          int code = index[doc];
-
-          if ((code & 0xff)==1) {
-            //System.out.println("  ptr");
-            int pos = code>>>8;
-            int whichArray = (doc >>> 16) & 0xff;
-            byte[] arr = tnums[whichArray];
-            int tnum = 0;
-            for(;;) {
-              int delta = 0;
-              for(;;) {
-                byte b = arr[pos++];
-                delta = (delta << 7) | (b & 0x7f);
-                if ((b & 0x80) == 0) break;
-              }
+          int tnum = 0;
+          int delta = 0;
+          for (; ; ) {
+            delta = (delta << 7) | (code & 0x7f);
+            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
-              //System.out.println("  tnum=" + tnum);
              counts[tnum]++;
+              delta = 0;
            }
-          } else {
-            //System.out.println("  inlined");
-            int tnum = 0;
-            int delta = 0;
-            for (;;) {
-              delta = (delta << 7) | (code & 0x7f);
-              if ((code & 0x80)==0) {
-                if (delta==0) break;
-                tnum += delta - TNUM_OFFSET;
-                //System.out.println("  tnum=" + tnum);
-                counts[tnum]++;
-                delta = 0;
-              }
-              code >>>= 8;
+            code >>>= 8;
          }
        }
      }
    }
-      final CharsRefBuilder charsRef = new CharsRefBuilder();
-
-      int off=offset;
-      int lim=limit>=0 ? limit : Integer.MAX_VALUE;
-
-      if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
-        int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
-        maxsize = Math.min(maxsize, numTermsInField);
-        LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize,1000), maxsize, Long.MIN_VALUE);
-
-        int min=mincount-1;  // the smallest value in the top 'N' values
-        //System.out.println("START=" + startTerm + " END=" + endTerm);
-        for (int i=startTerm; i<endTerm; i++) {
-          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
-          if (c>min) {
-            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
-            // index order, so we already know that the keys are ordered.  This can be very
-            // important if a lot of the counts are repeated (like zero counts would be).
-
-            // smaller term numbers sort higher, so subtract the term number instead
-            long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i);
-            boolean displaced = queue.insert(pair);
-            if (displaced) min=(int)(queue.top() >>> 32);
-          }
-        }
-
-        // now select the right page from the results
-
-        // if we are deep paging, we don't have to order the highest "offset" counts.
-        int collectCount = Math.max(0, queue.size() - off);
-        assert collectCount <= lim;
-
-        // the start and end indexes of our list "sorted" (starting with the highest value)
-        int sortedIdxStart = queue.size() - (collectCount - 1);
-        int sortedIdxEnd = queue.size() + 1;
-        final long[] sorted = queue.sort(collectCount);
-
-        final int[] indirect = counts;  // reuse the counts array for the index into the tnums array
-        assert indirect.length >= sortedIdxEnd;
-
-        for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
-          long pair = sorted[i];
-          int c = (int)(pair >>> 32);
-          int tnum = Integer.MAX_VALUE - (int)pair;
-
-          indirect[i] = i;   // store the index for indirect sorting
-          sorted[i] = tnum;  // reuse the "sorted" array to store the term numbers for indirect sorting
-
-          // add a null label for now... we'll fill it in later.
-          res.add(null, c);
-        }
-
-        // now sort the indexes by the term numbers
-        PrimUtils.sort(sortedIdxStart, sortedIdxEnd, indirect, new PrimUtils.IntComparator() {
-          @Override
-          public int compare(int a, int b) {
-            return (int)sorted[a] - (int)sorted[b];
-          }
-
-          @Override
-          public boolean lessThan(int a, int b) {
-            return sorted[a] < sorted[b];
-          }
-
-          @Override
-          public boolean equals(int a, int b) {
-            return sorted[a] == sorted[b];
-          }
-        });
-
-        // convert the term numbers to term values and set
-        // as the label
-        //System.out.println("sortStart=" + sortedIdxStart + " end=" + sortedIdxEnd);
-        for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
      if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {  // handle the biggest terms
-        try ( DocSet intersection = searcher.getDocSet(new TermQuery(new Term(field, tt.term)), docs); )
+        try ( DocSet intersection = searcher.getDocSet(tt.termQuery, docs); )
        {
          int collected = processor.collect(intersection, tt.termNum - startTermIndex);
          processor.countAcc.incrementCount(tt.termNum - startTermIndex, collected);
+          if (processor.allBucketsSlot >= 0) {
+            processor.collect(intersection, processor.allBucketsSlot);
+            processor.countAcc.incrementCount(processor.allBucketsSlot, collected);
+          }
          if (collected > 0) {
            uniqueTerms++;
          }
diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
index 0df9a48f2a9..1c0350aacc4 100644
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@@ -27,6 +27,8 @@ import java.util.Random;

 import com.tdunning.math.stats.AVLTreeDigest;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.packed.GrowableWriter;
+import org.apache.lucene.util.packed.PackedInts;
 import org.apache.solr.JSONTestUtil;
 import org.apache.solr.SolrTestCaseHS;
 import org.apache.solr.common.SolrInputDocument;
@@ -619,6 +621,14 @@ public class TestJsonFacets extends SolrTestCaseHS {
            " }"
    );

+    // test allBucket multi-valued
+    client.testJQ(params(p, "q", "*:*"
+            , "json.facet", "{x:{terms:{field:'${multi_ss}',allBuckets:true}}}"
+        )
+        , "facets=={ count:6, " +
+            "x:{ buckets:[{val:a, count:3}, {val:b, count:3}] , allBuckets:{count:6} } }"
+    );
+
    //////////////////////////////////////////////////////////////////////////////////////////////////////////
    // test converting legacy facets

@@ -697,8 +707,8 @@ public class TestJsonFacets extends SolrTestCaseHS {
    doStats( client, params() );
  }

-  /***
-  public void testPercentiles() {
+
+  public void XtestPercentiles() {
    AVLTreeDigest catA = new AVLTreeDigest(100);
    catA.add(4);
    catA.add(2);
@@ -728,11 +738,10 @@ public class TestJsonFacets extends SolrTestCaseHS {
    }
    return sb.toString();
  }
-  ***/

-  /*** test code to ensure TDigest is working as we expect.
-  @Test
-  public void testTDigest() throws Exception {
+  /*** test code to ensure TDigest is working as we expect. */
+
+  public void XtestTDigest() throws Exception {
    AVLTreeDigest t1 = new AVLTreeDigest(100);
    t1.add(10, 1);
    t1.add(90, 1);
@@ -779,5 +788,4 @@ public class TestJsonFacets extends SolrTestCaseHS {
    System.out.println(top.quantile(0.5));
    System.out.println(top.quantile(0.9));
  }
-  ******/
 }
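Editor's note (not part of the commit): a minimal, self-contained sketch of the per-document term-number stream that the new getCountsInArray() loop decodes. The patch stores each document's term numbers as ascending deltas, offset by TNUM_OFFSET and packed as 7-bits-per-byte values with the high bit set on all but the last byte; a delta of 0 terminates the list. The class, method names, and encoder below are invented for illustration only; the decode loop mirrors the patch.

import java.util.Arrays;

public class TermNumStreamSketch {

  static final int TNUM_OFFSET = 2;  // same sentinel offset the patch uses

  // Write one value as big-endian 7-bit groups; high bit set on all but the last byte.
  static int writeVInt(byte[] out, int pos, int v) {
    int shift = 28;                              // an int needs at most five 7-bit groups
    while (shift > 0 && (v >>> shift) == 0) shift -= 7;
    while (shift > 0) {
      out[pos++] = (byte) (0x80 | ((v >>> shift) & 0x7f));
      shift -= 7;
    }
    out[pos++] = (byte) (v & 0x7f);              // last byte: high bit clear
    return pos;
  }

  // Encode ascending term numbers for one document: deltas plus TNUM_OFFSET, 0-terminated.
  static byte[] encode(int... termNums) {
    byte[] out = new byte[termNums.length * 5 + 1];
    int pos = 0, prev = 0;
    for (int tnum : termNums) {
      pos = writeVInt(out, pos, (tnum - prev) + TNUM_OFFSET);
      prev = tnum;
    }
    out[pos++] = 0;                              // a stored delta of 0 ends the list
    return Arrays.copyOf(out, pos);
  }

  // Mirrors the counting loop in getCountsInArray(): decode each delta and bump the count.
  static void countTerms(byte[] arr, int startPos, int[] counts) {
    int pos = startPos;
    int tnum = 0;
    for (;;) {
      int delta = 0;
      for (;;) {
        byte b = arr[pos++];
        delta = (delta << 7) | (b & 0x7f);
        if ((b & 0x80) == 0) break;
      }
      if (delta == 0) break;
      tnum += delta - TNUM_OFFSET;
      counts[tnum]++;
    }
  }

  public static void main(String[] args) {
    int[] counts = new int[50];
    countTerms(encode(3, 5, 42), 0, counts);
    System.out.println(counts[3] + " " + counts[5] + " " + counts[42]);  // prints: 1 1 1
  }
}

In the patch itself, short per-document lists are inlined directly in index[doc] (the else branch that repeatedly shifts code >>>= 8), longer ones live in the shared tnums[] byte arrays that this sketch models, and the bigTerms bypass the stream entirely via their precomputed TermQuery set intersections.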