From d60c72f34ca9c63ac6075e00dac844c6f052d0a8 Mon Sep 17 00:00:00 2001 From: yonik Date: Tue, 23 May 2017 20:52:46 -0400 Subject: [PATCH] SOLR-10634: calc metrics in first phase if limit=-1 and no subfacets --- solr/CHANGES.txt | 8 + .../search/facet/FacetFieldProcessor.java | 81 +++++++++- .../solr/collection1/conf/solrconfig-tlog.xml | 8 + .../apache/solr/search/facet/DebugAgg.java | 146 ++++++++++++++++++ .../solr/search/facet/TestJsonFacets.java | 134 ++++++++++++++-- 5 files changed, 366 insertions(+), 11 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 928a190c0b0..5b92e3c4996 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -231,6 +231,14 @@ Bug Fixes resulting in exceptions when using a hashing faceting method and sorting by hll(numeric_field). (yonik) +Optimizations +---------------------- +* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1) + and there are no nested facets, aggregations are computed in the first collection phase + so that the second phase which would normally involve calculating the domain for the bucket + can be skipped entirely, leading to large performance improvements. (yonik) + + Other Changes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java index 143d1fce026..7b08e14ac3a 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java @@ -129,7 +129,36 @@ abstract class FacetFieldProcessor extends FacetProcessor { } sortAcc = indexOrderAcc; deferredAggs = freq.getFacetStats(); - } else { + } + + // If we are going to return all buckets and if there are no subfacets (that would need a domain), then don't defer + // any aggregation calculations to a second phase. This way we can avoid calculating domains for each bucket, which + // can be expensive. + if (freq.limit == -1 && freq.subFacets.size() == 0) { + accs = new SlotAcc[ freq.getFacetStats().size() ]; + int otherAccIdx = 0; + for (Map.Entry entry : freq.getFacetStats().entrySet()) { + AggValueSource agg = entry.getValue(); + SlotAcc acc = agg.createSlotAcc(fcontext, numDocs, numSlots); + acc.key = entry.getKey(); + accMap.put(acc.key, acc); + accs[otherAccIdx++] = acc; + } + if (accs.length == 1) { + collectAcc = accs[0]; + } else { + collectAcc = new MultiAcc(fcontext, accs); + } + + if (sortAcc == null) { + sortAcc = accMap.get(freq.sortVariable); + assert sortAcc != null; + } + + deferredAggs = null; + } + + if (sortAcc == null) { AggValueSource sortAgg = freq.getFacetStats().get(freq.sortVariable); if (sortAgg != null) { collectAcc = sortAgg.createSlotAcc(fcontext, numDocs, numSlots); @@ -140,7 +169,7 @@ abstract class FacetFieldProcessor extends FacetProcessor { deferredAggs.remove(freq.sortVariable); } - if (deferredAggs.size() == 0) { + if (deferredAggs == null || deferredAggs.size() == 0) { deferredAggs = null; } @@ -438,6 +467,54 @@ abstract class FacetFieldProcessor extends FacetProcessor { } } + static class MultiAcc extends SlotAcc { + final SlotAcc[] subAccs; + + MultiAcc(FacetContext fcontext, SlotAcc[] subAccs) { + super(fcontext); + this.subAccs = subAccs; + } + + @Override + public void collect(int doc, int slot) throws IOException { + for (SlotAcc acc : subAccs) { + acc.collect(doc, slot); + } + } + + @Override + public int compare(int slotA, int slotB) { + throw new UnsupportedOperationException(); + } + + @Override + public Object getValue(int slotNum) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void reset() throws IOException { + for (SlotAcc acc : subAccs) { + acc.reset(); + } + } + + @Override + public void resize(Resizer resizer) { + for (SlotAcc acc : subAccs) { + acc.resize(resizer); + } + } + + @Override + public void setValues(SimpleOrderedMap bucket, int slotNum) throws IOException { + for (SlotAcc acc : subAccs) { + acc.setValues(bucket, slotNum); + } + } + } + + static class SpecialSlotAcc extends SlotAcc { SlotAcc collectAcc; SlotAcc[] otherAccs; diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml index b08af838349..cf12c4a19d8 100644 --- a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml @@ -164,4 +164,12 @@ + + + 0.0 + + + + + diff --git a/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java new file mode 100644 index 00000000000..ad4fabf6f19 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.search.facet; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; +import org.apache.solr.search.DocSet; +import org.apache.solr.search.FunctionQParser; +import org.apache.solr.search.SyntaxError; +import org.apache.solr.search.ValueSourceParser; + + +public class DebugAgg extends AggValueSource { + + public static class Parser extends ValueSourceParser { + @Override + public ValueSource parse(FunctionQParser fp) throws SyntaxError { + return new DebugAgg(); + } + + @Override + public void init(NamedList args) { + } + } + + + public DebugAgg() { + super("debug"); + } + + public DebugAgg(String name) { + super(name); + } + + @Override + public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) { + return new Acc(fcontext, numDocs, numSlots); + } + + @Override + public int hashCode() { + return 0; + } + + @Override + public String description() { + return "debug()"; + } + + public static class Acc extends SlotAcc { + public static AtomicLong creates = new AtomicLong(0); + public static AtomicLong resets = new AtomicLong(0); + public static AtomicLong resizes = new AtomicLong(0); + public static Acc last; + + public CountSlotAcc sub; + public int numDocs; + public int numSlots; + + public Acc(FacetContext fcontext, int numDocs, int numSlots) { + super(fcontext); + this.last = this; + this.numDocs = numDocs; + this.numSlots = numSlots; + creates.addAndGet(1); + sub = new CountSlotArrAcc(fcontext, numSlots); + + new RuntimeException("DEBUG Acc numSlots=" + numSlots).printStackTrace(); + } + + @Override + public void collect(int doc, int slot) throws IOException { + sub.collect(doc, slot); + } + + @Override + public int compare(int slotA, int slotB) { + return sub.compare(slotA, slotB); + } + + @Override + public Object getValue(int slotNum) throws IOException { + return sub.getValue(slotNum); + } + + @Override + public void reset() throws IOException { + resets.addAndGet(1); + sub.reset(); + } + + @Override + public void resize(Resizer resizer) { + resizes.addAndGet(1); + this.numSlots = resizer.getNewSize(); + sub.resize(resizer); + } + + @Override + public void setNextReader(LeafReaderContext readerContext) throws IOException { + sub.setNextReader(readerContext); + } + + @Override + public int collect(DocSet docs, int slot) throws IOException { + return sub.collect(docs, slot); + } + + @Override + public void setValues(SimpleOrderedMap bucket, int slotNum) throws IOException { + sub.key = this.key; // TODO: Blech... this should be fixed + sub.setValues(bucket, slotNum); + } + + @Override + public void close() throws IOException { + sub.close(); + } + } + + @Override + public FacetMerger createFacetMerger(Object prototype) { + return new FacetLongMerger(); + } + +} \ No newline at end of file diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java index 35904267eb3..03cc480b1b0 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java @@ -471,7 +471,7 @@ public class TestJsonFacets extends SolrTestCaseHS { doStatsTemplated(client, params(p, "facet","true", "rows","0", "noexist","noexist_ss", "cat_s","cat_ss", "where_s","where_ss", "num_d","num_f", "num_i","num_l", "num_is","num_ls", "num_fs", "num_ds", "super_s","super_ss", "val_b","val_b", "date","date_dt", "sparse_s","sparse_ss", "multi_ss","multi_ss") ); // multi-valued strings, method=dv for terms facets - doStatsTemplated(client, params(p, "terms", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss")); + doStatsTemplated(client, params(p, "terms_method", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss")); // single valued docvalues for strings, and single valued numeric doc values for numeric fields doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sd", "cat_s","cat_sd", "where_s","where_sd", "num_d","num_dd", "num_i","num_id", "num_is","num_lds", "num_fs","num_dds", "super_s","super_sd", "val_b","val_b", "date","date_dtd", "sparse_s","sparse_sd" ,"multi_ss","multi_sds") ); @@ -491,6 +491,26 @@ public class TestJsonFacets extends SolrTestCaseHS { if (p.get("num_is") == null) p.add("num_is","num_is"); if (p.get("num_fs") == null) p.add("num_fs","num_fs"); + String terms = p.get("terms"); + if (terms == null) terms=""; + int limit=0; + switch (random().nextInt(4)) { + case 0: limit=-1; + case 1: limit=1000000; + case 2: // fallthrough + case 3: // fallthrough + } + if (limit != 0) { + terms=terms+"limit:"+limit+","; + } + String terms_method = p.get("terms_method"); + if (terms_method != null) { + terms=terms+terms_method; + } + p.set("terms", terms); + // "${terms}" should be put at the beginning of generic terms facets. + // It may specify "method=..." or "limit:-1", so should not be used if the facet explicitly specifies. + MacroExpander m = new MacroExpander( p.getMap() ); String cat_s = m.expand("${cat_s}"); @@ -862,7 +882,7 @@ public class TestJsonFacets extends SolrTestCaseHS { // test numBuckets client.testJQ(params(p, "q", "*:*", "rows", "0", "facet", "true" - , "json.facet", "{f1:{terms:{${terms} field:${cat_s}, numBuckets:true, limit:1}}}" // TODO: limit:0 produced an error + , "json.facet", "{f1:{terms:{${terms_method} field:${cat_s}, numBuckets:true, limit:1}}}" // TODO: limit:0 produced an error ) , "facets=={ 'count':6, " + "'f1':{ numBuckets:2, buckets:[{val:B, count:3}]} } " @@ -1048,10 +1068,10 @@ public class TestJsonFacets extends SolrTestCaseHS { // also test limit:0 client.testJQ(params(p, "q", "*:*" , "json.facet", "{" + - " f0:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0} " + - ",f1:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit - ",f2:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " + - ",f3:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, missing:true, facet:{x:'sum(${num_d})', y:'avg(${num_d})'}, sort:'x desc' } " + + " f0:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0} " + + ",f1:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit + ",f2:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " + + ",f3:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, missing:true, facet:{x:'sum(${num_d})', y:'avg(${num_d})'}, sort:'x desc' } " + "}" ) , "facets=={ 'count':6, " + @@ -1066,9 +1086,9 @@ public class TestJsonFacets extends SolrTestCaseHS { // also test limit:0 client.testJQ(params(p, "q", "*:*" , "json.facet", "{" + - " f0:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0} " + - ",f1:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit - ",f2:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " + + " f0:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0} " + + ",f1:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit + ",f2:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " + "}" ) , "facets=={ 'count':6, " + @@ -1398,6 +1418,102 @@ public class TestJsonFacets extends SolrTestCaseHS { } + + //////////////////////////////////////////////////////////////// + // test which phase stats are calculated in + //////////////////////////////////////////////////////////////// + if (client.local()) { + long creates, resets; + // NOTE: these test the current implementation and may need to be adjusted to match future optimizations (such as calculating N buckets in parallel in the second phase) + + creates = DebugAgg.Acc.creates.get(); + resets = DebugAgg.Acc.resets.get(); + client.testJQ(params(p, "q", "*:*" + , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:1, facet:{x:'debug()'} }}}" // x should be deferred to 2nd phase + ) + , "facets=={ 'count':6, " + + "f1:{ buckets:[{ val:batman, count:1, x:1}]} } " + ); + + assertEquals(1, DebugAgg.Acc.creates.get() - creates); + assertTrue( DebugAgg.Acc.resets.get() - resets <= 1); + assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably "1", but may be special slot for something. As long as it's not cardinality of the field + + + creates = DebugAgg.Acc.creates.get(); + resets = DebugAgg.Acc.resets.get(); + client.testJQ(params(p, "q", "*:*" + , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:1, facet:{ x:'debug()'} , sort:'x asc' }}}" // sorting by x... must be done all at once in first phase + ) + , "facets=={ 'count':6, " + + "f1:{ buckets:[{ val:batman, count:1, x:1}]}" + + " } " + ); + + assertEquals(1, DebugAgg.Acc.creates.get() - creates); + assertTrue( DebugAgg.Acc.resets.get() - resets == 0); + assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing. + + + // When limit:-1, we should do most stats in first phase (SOLR-10634) + creates = DebugAgg.Acc.creates.get(); + resets = DebugAgg.Acc.resets.get(); + client.testJQ(params(p, "q", "*:*" + , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:-1, facet:{x:'debug()'} }}}" + ) + , "facets==" + ); + + assertEquals(1, DebugAgg.Acc.creates.get() - creates); + assertTrue( DebugAgg.Acc.resets.get() - resets == 0); + assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing. + + // Now for a numeric field + // When limit:-1, we should do most stats in first phase (SOLR-10634) + creates = DebugAgg.Acc.creates.get(); + resets = DebugAgg.Acc.resets.get(); + client.testJQ(params(p, "q", "*:*" + , "json.facet", "{f1:{terms:{${terms_method} field:${num_d}, limit:-1, facet:{x:'debug()'} }}}" + ) + , "facets==" + ); + + assertEquals(1, DebugAgg.Acc.creates.get() - creates); + assertTrue( DebugAgg.Acc.resets.get() - resets == 0); + assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing. + + + // But if we need to calculate domains anyway, it probably makes sense to calculate most stats in the 2nd phase (along with sub-facets) + creates = DebugAgg.Acc.creates.get(); + resets = DebugAgg.Acc.resets.get(); + client.testJQ(params(p, "q", "*:*" + , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:-1, facet:{ x:'debug()' , y:{terms:${where_s}} } }}}" + ) + , "facets==" + ); + + assertEquals(1, DebugAgg.Acc.creates.get() - creates); + assertTrue( DebugAgg.Acc.resets.get() - resets >=4); + assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably 1, but could be higher + + // Now with a numeric field + // But if we need to calculate domains anyway, it probably makes sense to calculate most stats in the 2nd phase (along with sub-facets) + creates = DebugAgg.Acc.creates.get(); + resets = DebugAgg.Acc.resets.get(); + client.testJQ(params(p, "q", "*:*" + , "json.facet", "{f1:{terms:{${terms_method} field:${num_d}, limit:-1, facet:{ x:'debug()' , y:{terms:${where_s}} } }}}" + ) + , "facets==" + ); + + assertEquals(1, DebugAgg.Acc.creates.get() - creates); + assertTrue( DebugAgg.Acc.resets.get() - resets >=4); + assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably 1, but could be higher + } + //////////////////////////////////////////////////////////////// end phase testing + + + } @Test