SOLR-10634: calc metrics in first phase if limit=-1 and no subfacets

2017-05-23 20:52:46 -04:00 · 2017-05-23 20:52:46 -04:00 · d60c72f34c
parent ea9adc055b
commit d60c72f34c
5 changed files with 366 additions and 11 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -231,6 +231,14 @@ Bug Fixes
  resulting in exceptions when using a hashing faceting method and sorting by hll(numeric_field).
  (yonik)

+Optimizations
+----------------------
+* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)
+  and there are no nested facets, aggregations are computed in the first collection phase
+  so that the second phase which would normally involve calculating the domain for the bucket
+  can be skipped entirely, leading to large performance improvements. (yonik)
+
+
 Other Changes
 ----------------------

--- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
+++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
@ -129,7 +129,36 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
      }
      sortAcc = indexOrderAcc;
      deferredAggs = freq.getFacetStats();
-    } else {
+    }
+
+    // If we are going to return all buckets and if there are no subfacets (that would need a domain), then don't defer
+    // any aggregation calculations to a second phase.  This way we can avoid calculating domains for each bucket, which
+    // can be expensive.
+    if (freq.limit == -1 && freq.subFacets.size() == 0) {
+      accs = new SlotAcc[ freq.getFacetStats().size() ];
+      int otherAccIdx = 0;
+      for (Map.Entry<String,AggValueSource> entry : freq.getFacetStats().entrySet()) {
+        AggValueSource agg = entry.getValue();
+        SlotAcc acc = agg.createSlotAcc(fcontext, numDocs, numSlots);
+        acc.key = entry.getKey();
+        accMap.put(acc.key, acc);
+        accs[otherAccIdx++] = acc;
+      }
+      if (accs.length == 1) {
+        collectAcc = accs[0];
+      } else {
+        collectAcc = new MultiAcc(fcontext, accs);
+      }
+
+      if (sortAcc == null) {
+        sortAcc = accMap.get(freq.sortVariable);
+        assert sortAcc != null;
+      }
+
+      deferredAggs = null;
+    }
+
+    if (sortAcc == null) {
      AggValueSource sortAgg = freq.getFacetStats().get(freq.sortVariable);
      if (sortAgg != null) {
        collectAcc = sortAgg.createSlotAcc(fcontext, numDocs, numSlots);
@ -140,7 +169,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
      deferredAggs.remove(freq.sortVariable);
    }

-    if (deferredAggs.size() == 0) {
+    if (deferredAggs == null || deferredAggs.size() == 0) {
      deferredAggs = null;
    }

@ -438,6 +467,54 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
    }
  }

+  static class MultiAcc extends SlotAcc {
+    final SlotAcc[] subAccs;
+
+    MultiAcc(FacetContext fcontext, SlotAcc[] subAccs) {
+      super(fcontext);
+      this.subAccs = subAccs;
+    }
+
+    @Override
+    public void collect(int doc, int slot) throws IOException {
+      for (SlotAcc acc : subAccs) {
+        acc.collect(doc, slot);
+      }
+    }
+
+    @Override
+    public int compare(int slotA, int slotB) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Object getValue(int slotNum) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void reset() throws IOException {
+      for (SlotAcc acc : subAccs) {
+        acc.reset();
+      }
+    }
+
+    @Override
+    public void resize(Resizer resizer) {
+      for (SlotAcc acc : subAccs) {
+        acc.resize(resizer);
+      }
+    }
+
+    @Override
+    public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
+      for (SlotAcc acc : subAccs) {
+        acc.setValues(bucket, slotNum);
+      }
+    }
+  }
+
+
  static class SpecialSlotAcc extends SlotAcc {
    SlotAcc collectAcc;
    SlotAcc[] otherAccs;
--- a/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-tlog.xml
@ -164,4 +164,12 @@
    </lst>
  </initParams>

+
+  <valueSourceParser name="nvl" class="org.apache.solr.search.function.NvlValueSourceParser">
+    <float name="nvlFloatValue">0.0</float>
+  </valueSourceParser>
+  <valueSourceParser name="agg_debug" class="org.apache.solr.search.facet.DebugAgg$Parser">
+  </valueSourceParser>
+
+
 </config>
--- a/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java
@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.search.facet;
+
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.queries.function.ValueSource;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SimpleOrderedMap;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.FunctionQParser;
+import org.apache.solr.search.SyntaxError;
+import org.apache.solr.search.ValueSourceParser;
+
+
+public class DebugAgg extends AggValueSource {
+
+  public static class Parser extends ValueSourceParser {
+    @Override
+    public ValueSource parse(FunctionQParser fp) throws SyntaxError {
+      return new DebugAgg();
+    }
+
+    @Override
+    public void init(NamedList args) {
+    }
+  }
+
+
+  public DebugAgg() {
+    super("debug");
+  }
+
+  public DebugAgg(String name) {
+    super(name);
+  }
+
+  @Override
+  public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) {
+    return new Acc(fcontext, numDocs, numSlots);
+  }
+
+  @Override
+  public int hashCode() {
+    return 0;
+  }
+
+  @Override
+  public String description() {
+    return "debug()";
+  }
+
+  public static class Acc extends SlotAcc {
+    public static AtomicLong creates = new AtomicLong(0);
+    public static AtomicLong resets = new AtomicLong(0);
+    public static AtomicLong resizes = new AtomicLong(0);
+    public static Acc last;
+
+    public CountSlotAcc sub;
+    public int numDocs;
+    public int numSlots;
+
+    public Acc(FacetContext fcontext, int numDocs, int numSlots) {
+      super(fcontext);
+      this.last = this;
+      this.numDocs = numDocs;
+      this.numSlots = numSlots;
+      creates.addAndGet(1);
+      sub = new CountSlotArrAcc(fcontext, numSlots);
+
+      new RuntimeException("DEBUG Acc numSlots=" + numSlots).printStackTrace();
+    }
+
+    @Override
+    public void collect(int doc, int slot) throws IOException {
+      sub.collect(doc, slot);
+    }
+
+    @Override
+    public int compare(int slotA, int slotB) {
+      return sub.compare(slotA, slotB);
+    }
+
+    @Override
+    public Object getValue(int slotNum) throws IOException {
+      return sub.getValue(slotNum);
+    }
+
+    @Override
+    public void reset() throws IOException {
+      resets.addAndGet(1);
+      sub.reset();
+    }
+
+    @Override
+    public void resize(Resizer resizer) {
+      resizes.addAndGet(1);
+      this.numSlots = resizer.getNewSize();
+      sub.resize(resizer);
+    }
+
+    @Override
+    public void setNextReader(LeafReaderContext readerContext) throws IOException {
+      sub.setNextReader(readerContext);
+    }
+
+    @Override
+    public int collect(DocSet docs, int slot) throws IOException {
+      return sub.collect(docs, slot);
+    }
+
+    @Override
+    public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
+      sub.key = this.key;  // TODO: Blech... this should be fixed
+      sub.setValues(bucket, slotNum);
+    }
+
+    @Override
+    public void close() throws IOException {
+      sub.close();
+    }
+  }
+
+  @Override
+  public FacetMerger createFacetMerger(Object prototype) {
+    return new FacetLongMerger();
+  }
+
+}
--- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
+++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java
@ -471,7 +471,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
    doStatsTemplated(client, params(p, "facet","true",       "rows","0", "noexist","noexist_ss", "cat_s","cat_ss", "where_s","where_ss", "num_d","num_f", "num_i","num_l", "num_is","num_ls", "num_fs", "num_ds", "super_s","super_ss", "val_b","val_b", "date","date_dt", "sparse_s","sparse_ss", "multi_ss","multi_ss") );

    // multi-valued strings, method=dv for terms facets
-    doStatsTemplated(client, params(p, "terms", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss"));
+    doStatsTemplated(client, params(p, "terms_method", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss"));

    // single valued docvalues for strings, and single valued numeric doc values for numeric fields
    doStatsTemplated(client, params(p,                "rows","0", "noexist","noexist_sd",  "cat_s","cat_sd", "where_s","where_sd", "num_d","num_dd", "num_i","num_id", "num_is","num_lds", "num_fs","num_dds", "super_s","super_sd", "val_b","val_b", "date","date_dtd", "sparse_s","sparse_sd"    ,"multi_ss","multi_sds") );
@ -491,6 +491,26 @@ public class TestJsonFacets extends SolrTestCaseHS {
    if (p.get("num_is") == null) p.add("num_is","num_is");
    if (p.get("num_fs") == null) p.add("num_fs","num_fs");

+    String terms = p.get("terms");
+    if (terms == null) terms="";
+    int limit=0;
+    switch (random().nextInt(4)) {
+      case 0: limit=-1;
+      case 1: limit=1000000;
+      case 2: // fallthrough
+      case 3: // fallthrough
+    }
+    if (limit != 0) {
+      terms=terms+"limit:"+limit+",";
+    }
+    String terms_method = p.get("terms_method");
+    if (terms_method != null) {
+      terms=terms+terms_method;
+    }
+    p.set("terms", terms);
+    // "${terms}" should be put at the beginning of generic terms facets.
+    // It may specify "method=..." or "limit:-1", so should not be used if the facet explicitly specifies.
+
    MacroExpander m = new MacroExpander( p.getMap() );

    String cat_s = m.expand("${cat_s}");
@ -862,7 +882,7 @@ public class TestJsonFacets extends SolrTestCaseHS {

    // test numBuckets
    client.testJQ(params(p, "q", "*:*", "rows", "0", "facet", "true"
-            , "json.facet", "{f1:{terms:{${terms} field:${cat_s}, numBuckets:true, limit:1}}}" // TODO: limit:0 produced an error
+            , "json.facet", "{f1:{terms:{${terms_method} field:${cat_s}, numBuckets:true, limit:1}}}" // TODO: limit:0 produced an error
        )
        , "facets=={ 'count':6, " +
            "'f1':{ numBuckets:2, buckets:[{val:B, count:3}]} } "
@ -1048,10 +1068,10 @@ public class TestJsonFacets extends SolrTestCaseHS {
    // also test limit:0
    client.testJQ(params(p, "q", "*:*"
            , "json.facet", "{" +
-                " f0:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0} " +
-                ",f1:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, offset:1} " +  // offset with 0 limit
-                ",f2:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
-                ",f3:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, missing:true, facet:{x:'sum(${num_d})', y:'avg(${num_d})'}, sort:'x desc' } " +
+                " f0:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0} " +
+                ",f1:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, offset:1} " +  // offset with 0 limit
+                ",f2:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
+                ",f3:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, missing:true, facet:{x:'sum(${num_d})', y:'avg(${num_d})'}, sort:'x desc' } " +
                "}"
        )
        , "facets=={ 'count':6, " +
@ -1066,9 +1086,9 @@ public class TestJsonFacets extends SolrTestCaseHS {
    // also test limit:0
    client.testJQ(params(p, "q", "*:*"
            , "json.facet", "{" +
-                " f0:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0} " +
-                ",f1:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0, offset:1} " +  // offset with 0 limit
-                ",f2:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
+                " f0:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0} " +
+                ",f1:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0, offset:1} " +  // offset with 0 limit
+                ",f2:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
                "}"
        )
        , "facets=={ 'count':6, " +
@ -1398,6 +1418,102 @@ public class TestJsonFacets extends SolrTestCaseHS {

    }

+
+    ////////////////////////////////////////////////////////////////
+    // test which phase stats are calculated in
+    ////////////////////////////////////////////////////////////////
+    if (client.local()) {
+      long creates, resets;
+      // NOTE: these test the current implementation and may need to be adjusted to match future optimizations (such as calculating N buckets in parallel in the second phase)
+
+      creates = DebugAgg.Acc.creates.get();
+      resets = DebugAgg.Acc.resets.get();
+      client.testJQ(params(p, "q", "*:*"
+          , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:1, facet:{x:'debug()'}   }}}"  // x should be deferred to 2nd phase
+          )
+          , "facets=={ 'count':6, " +
+              "f1:{  buckets:[{ val:batman, count:1, x:1}]} } "
+      );
+
+      assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+      assertTrue( DebugAgg.Acc.resets.get() - resets <= 1);
+      assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably "1", but may be special slot for something.  As long as it's not cardinality of the field
+
+
+      creates = DebugAgg.Acc.creates.get();
+      resets = DebugAgg.Acc.resets.get();
+      client.testJQ(params(p, "q", "*:*"
+          , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:1, facet:{ x:'debug()'} , sort:'x asc'  }}}"  // sorting by x... must be done all at once in first phase
+          )
+          , "facets=={ 'count':6, " +
+              "f1:{  buckets:[{ val:batman, count:1, x:1}]}" +
+              " } "
+      );
+
+      assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+      assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
+      assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
+
+
+      // When limit:-1, we should do most stats in first phase (SOLR-10634)
+      creates = DebugAgg.Acc.creates.get();
+      resets = DebugAgg.Acc.resets.get();
+      client.testJQ(params(p, "q", "*:*"
+          , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:-1, facet:{x:'debug()'}  }}}"
+          )
+          , "facets=="
+      );
+
+      assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+      assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
+      assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
+
+      // Now for a numeric field
+      // When limit:-1, we should do most stats in first phase (SOLR-10634)
+      creates = DebugAgg.Acc.creates.get();
+      resets = DebugAgg.Acc.resets.get();
+      client.testJQ(params(p, "q", "*:*"
+          , "json.facet", "{f1:{terms:{${terms_method} field:${num_d}, limit:-1, facet:{x:'debug()'}  }}}"
+          )
+          , "facets=="
+      );
+
+      assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+      assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
+      assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
+
+
+      // But if we need to calculate domains anyway, it probably makes sense to calculate most stats in the 2nd phase (along with sub-facets)
+      creates = DebugAgg.Acc.creates.get();
+      resets = DebugAgg.Acc.resets.get();
+      client.testJQ(params(p, "q", "*:*"
+          , "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:-1, facet:{ x:'debug()' , y:{terms:${where_s}}   }  }}}"
+          )
+          , "facets=="
+      );
+
+      assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+      assertTrue( DebugAgg.Acc.resets.get() - resets >=4);
+      assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably 1, but could be higher
+
+      // Now with a numeric field
+      // But if we need to calculate domains anyway, it probably makes sense to calculate most stats in the 2nd phase (along with sub-facets)
+      creates = DebugAgg.Acc.creates.get();
+      resets = DebugAgg.Acc.resets.get();
+      client.testJQ(params(p, "q", "*:*"
+          , "json.facet", "{f1:{terms:{${terms_method} field:${num_d}, limit:-1, facet:{ x:'debug()' , y:{terms:${where_s}}   }  }}}"
+          )
+          , "facets=="
+      );
+
+      assertEquals(1, DebugAgg.Acc.creates.get() - creates);
+      assertTrue( DebugAgg.Acc.resets.get() - resets >=4);
+      assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably 1, but could be higher
+    }
+    //////////////////////////////////////////////////////////////// end phase testing
+
+
+
  }

  @Test