SOLR-10634: calc metrics in first phase if limit=-1 and no subfacets

This commit is contained in:
yonik 2017-05-23 20:52:46 -04:00
parent ea9adc055b
commit d60c72f34c
5 changed files with 366 additions and 11 deletions

View File

@ -231,6 +231,14 @@ Bug Fixes
resulting in exceptions when using a hashing faceting method and sorting by hll(numeric_field).
(yonik)
Optimizations
----------------------
* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)
and there are no nested facets, aggregations are computed in the first collection phase
so that the second phase which would normally involve calculating the domain for the bucket
can be skipped entirely, leading to large performance improvements. (yonik)
Other Changes
----------------------

View File

@ -129,7 +129,36 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
sortAcc = indexOrderAcc;
deferredAggs = freq.getFacetStats();
} else {
}
// If we are going to return all buckets and if there are no subfacets (that would need a domain), then don't defer
// any aggregation calculations to a second phase. This way we can avoid calculating domains for each bucket, which
// can be expensive.
if (freq.limit == -1 && freq.subFacets.size() == 0) {
accs = new SlotAcc[ freq.getFacetStats().size() ];
int otherAccIdx = 0;
for (Map.Entry<String,AggValueSource> entry : freq.getFacetStats().entrySet()) {
AggValueSource agg = entry.getValue();
SlotAcc acc = agg.createSlotAcc(fcontext, numDocs, numSlots);
acc.key = entry.getKey();
accMap.put(acc.key, acc);
accs[otherAccIdx++] = acc;
}
if (accs.length == 1) {
collectAcc = accs[0];
} else {
collectAcc = new MultiAcc(fcontext, accs);
}
if (sortAcc == null) {
sortAcc = accMap.get(freq.sortVariable);
assert sortAcc != null;
}
deferredAggs = null;
}
if (sortAcc == null) {
AggValueSource sortAgg = freq.getFacetStats().get(freq.sortVariable);
if (sortAgg != null) {
collectAcc = sortAgg.createSlotAcc(fcontext, numDocs, numSlots);
@ -140,7 +169,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
deferredAggs.remove(freq.sortVariable);
}
if (deferredAggs.size() == 0) {
if (deferredAggs == null || deferredAggs.size() == 0) {
deferredAggs = null;
}
@ -438,6 +467,54 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
}
static class MultiAcc extends SlotAcc {
final SlotAcc[] subAccs;
MultiAcc(FacetContext fcontext, SlotAcc[] subAccs) {
super(fcontext);
this.subAccs = subAccs;
}
@Override
public void collect(int doc, int slot) throws IOException {
for (SlotAcc acc : subAccs) {
acc.collect(doc, slot);
}
}
@Override
public int compare(int slotA, int slotB) {
throw new UnsupportedOperationException();
}
@Override
public Object getValue(int slotNum) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void reset() throws IOException {
for (SlotAcc acc : subAccs) {
acc.reset();
}
}
@Override
public void resize(Resizer resizer) {
for (SlotAcc acc : subAccs) {
acc.resize(resizer);
}
}
@Override
public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
for (SlotAcc acc : subAccs) {
acc.setValues(bucket, slotNum);
}
}
}
static class SpecialSlotAcc extends SlotAcc {
SlotAcc collectAcc;
SlotAcc[] otherAccs;

View File

@ -164,4 +164,12 @@
</lst>
</initParams>
<valueSourceParser name="nvl" class="org.apache.solr.search.function.NvlValueSourceParser">
<float name="nvlFloatValue">0.0</float>
</valueSourceParser>
<valueSourceParser name="agg_debug" class="org.apache.solr.search.facet.DebugAgg$Parser">
</valueSourceParser>
</config>

View File

@ -0,0 +1,146 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.FunctionQParser;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.search.ValueSourceParser;
public class DebugAgg extends AggValueSource {
public static class Parser extends ValueSourceParser {
@Override
public ValueSource parse(FunctionQParser fp) throws SyntaxError {
return new DebugAgg();
}
@Override
public void init(NamedList args) {
}
}
public DebugAgg() {
super("debug");
}
public DebugAgg(String name) {
super(name);
}
@Override
public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) {
return new Acc(fcontext, numDocs, numSlots);
}
@Override
public int hashCode() {
return 0;
}
@Override
public String description() {
return "debug()";
}
public static class Acc extends SlotAcc {
public static AtomicLong creates = new AtomicLong(0);
public static AtomicLong resets = new AtomicLong(0);
public static AtomicLong resizes = new AtomicLong(0);
public static Acc last;
public CountSlotAcc sub;
public int numDocs;
public int numSlots;
public Acc(FacetContext fcontext, int numDocs, int numSlots) {
super(fcontext);
this.last = this;
this.numDocs = numDocs;
this.numSlots = numSlots;
creates.addAndGet(1);
sub = new CountSlotArrAcc(fcontext, numSlots);
new RuntimeException("DEBUG Acc numSlots=" + numSlots).printStackTrace();
}
@Override
public void collect(int doc, int slot) throws IOException {
sub.collect(doc, slot);
}
@Override
public int compare(int slotA, int slotB) {
return sub.compare(slotA, slotB);
}
@Override
public Object getValue(int slotNum) throws IOException {
return sub.getValue(slotNum);
}
@Override
public void reset() throws IOException {
resets.addAndGet(1);
sub.reset();
}
@Override
public void resize(Resizer resizer) {
resizes.addAndGet(1);
this.numSlots = resizer.getNewSize();
sub.resize(resizer);
}
@Override
public void setNextReader(LeafReaderContext readerContext) throws IOException {
sub.setNextReader(readerContext);
}
@Override
public int collect(DocSet docs, int slot) throws IOException {
return sub.collect(docs, slot);
}
@Override
public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
sub.key = this.key; // TODO: Blech... this should be fixed
sub.setValues(bucket, slotNum);
}
@Override
public void close() throws IOException {
sub.close();
}
}
@Override
public FacetMerger createFacetMerger(Object prototype) {
return new FacetLongMerger();
}
}

View File

@ -471,7 +471,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
doStatsTemplated(client, params(p, "facet","true", "rows","0", "noexist","noexist_ss", "cat_s","cat_ss", "where_s","where_ss", "num_d","num_f", "num_i","num_l", "num_is","num_ls", "num_fs", "num_ds", "super_s","super_ss", "val_b","val_b", "date","date_dt", "sparse_s","sparse_ss", "multi_ss","multi_ss") );
// multi-valued strings, method=dv for terms facets
doStatsTemplated(client, params(p, "terms", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss"));
doStatsTemplated(client, params(p, "terms_method", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss"));
// single valued docvalues for strings, and single valued numeric doc values for numeric fields
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sd", "cat_s","cat_sd", "where_s","where_sd", "num_d","num_dd", "num_i","num_id", "num_is","num_lds", "num_fs","num_dds", "super_s","super_sd", "val_b","val_b", "date","date_dtd", "sparse_s","sparse_sd" ,"multi_ss","multi_sds") );
@ -491,6 +491,26 @@ public class TestJsonFacets extends SolrTestCaseHS {
if (p.get("num_is") == null) p.add("num_is","num_is");
if (p.get("num_fs") == null) p.add("num_fs","num_fs");
String terms = p.get("terms");
if (terms == null) terms="";
int limit=0;
switch (random().nextInt(4)) {
case 0: limit=-1;
case 1: limit=1000000;
case 2: // fallthrough
case 3: // fallthrough
}
if (limit != 0) {
terms=terms+"limit:"+limit+",";
}
String terms_method = p.get("terms_method");
if (terms_method != null) {
terms=terms+terms_method;
}
p.set("terms", terms);
// "${terms}" should be put at the beginning of generic terms facets.
// It may specify "method=..." or "limit:-1", so should not be used if the facet explicitly specifies.
MacroExpander m = new MacroExpander( p.getMap() );
String cat_s = m.expand("${cat_s}");
@ -862,7 +882,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
// test numBuckets
client.testJQ(params(p, "q", "*:*", "rows", "0", "facet", "true"
, "json.facet", "{f1:{terms:{${terms} field:${cat_s}, numBuckets:true, limit:1}}}" // TODO: limit:0 produced an error
, "json.facet", "{f1:{terms:{${terms_method} field:${cat_s}, numBuckets:true, limit:1}}}" // TODO: limit:0 produced an error
)
, "facets=={ 'count':6, " +
"'f1':{ numBuckets:2, buckets:[{val:B, count:3}]} } "
@ -1048,10 +1068,10 @@ public class TestJsonFacets extends SolrTestCaseHS {
// also test limit:0
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{" +
" f0:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0} " +
",f1:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit
",f2:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
",f3:{${terms} type:terms, field:${multi_ss}, allBuckets:true, limit:0, missing:true, facet:{x:'sum(${num_d})', y:'avg(${num_d})'}, sort:'x desc' } " +
" f0:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0} " +
",f1:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit
",f2:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
",f3:{${terms_method} type:terms, field:${multi_ss}, allBuckets:true, limit:0, missing:true, facet:{x:'sum(${num_d})', y:'avg(${num_d})'}, sort:'x desc' } " +
"}"
)
, "facets=={ 'count':6, " +
@ -1066,9 +1086,9 @@ public class TestJsonFacets extends SolrTestCaseHS {
// also test limit:0
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{" +
" f0:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0} " +
",f1:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit
",f2:{${terms} type:terms, field:${num_i}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
" f0:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0} " +
",f1:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0, offset:1} " + // offset with 0 limit
",f2:{${terms_method} type:terms, field:${num_i}, allBuckets:true, limit:0, facet:{x:'sum(${num_d})'}, sort:'x desc' } " +
"}"
)
, "facets=={ 'count':6, " +
@ -1398,6 +1418,102 @@ public class TestJsonFacets extends SolrTestCaseHS {
}
////////////////////////////////////////////////////////////////
// test which phase stats are calculated in
////////////////////////////////////////////////////////////////
if (client.local()) {
long creates, resets;
// NOTE: these test the current implementation and may need to be adjusted to match future optimizations (such as calculating N buckets in parallel in the second phase)
creates = DebugAgg.Acc.creates.get();
resets = DebugAgg.Acc.resets.get();
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:1, facet:{x:'debug()'} }}}" // x should be deferred to 2nd phase
)
, "facets=={ 'count':6, " +
"f1:{ buckets:[{ val:batman, count:1, x:1}]} } "
);
assertEquals(1, DebugAgg.Acc.creates.get() - creates);
assertTrue( DebugAgg.Acc.resets.get() - resets <= 1);
assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably "1", but may be special slot for something. As long as it's not cardinality of the field
creates = DebugAgg.Acc.creates.get();
resets = DebugAgg.Acc.resets.get();
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:1, facet:{ x:'debug()'} , sort:'x asc' }}}" // sorting by x... must be done all at once in first phase
)
, "facets=={ 'count':6, " +
"f1:{ buckets:[{ val:batman, count:1, x:1}]}" +
" } "
);
assertEquals(1, DebugAgg.Acc.creates.get() - creates);
assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
// When limit:-1, we should do most stats in first phase (SOLR-10634)
creates = DebugAgg.Acc.creates.get();
resets = DebugAgg.Acc.resets.get();
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:-1, facet:{x:'debug()'} }}}"
)
, "facets=="
);
assertEquals(1, DebugAgg.Acc.creates.get() - creates);
assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
// Now for a numeric field
// When limit:-1, we should do most stats in first phase (SOLR-10634)
creates = DebugAgg.Acc.creates.get();
resets = DebugAgg.Acc.resets.get();
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{f1:{terms:{${terms_method} field:${num_d}, limit:-1, facet:{x:'debug()'} }}}"
)
, "facets=="
);
assertEquals(1, DebugAgg.Acc.creates.get() - creates);
assertTrue( DebugAgg.Acc.resets.get() - resets == 0);
assertTrue( DebugAgg.Acc.last.numSlots >= 5 ); // all slots should be done in a single shot. there may be more than 5 due to special slots or hashing.
// But if we need to calculate domains anyway, it probably makes sense to calculate most stats in the 2nd phase (along with sub-facets)
creates = DebugAgg.Acc.creates.get();
resets = DebugAgg.Acc.resets.get();
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{f1:{terms:{${terms_method} field:${super_s}, limit:-1, facet:{ x:'debug()' , y:{terms:${where_s}} } }}}"
)
, "facets=="
);
assertEquals(1, DebugAgg.Acc.creates.get() - creates);
assertTrue( DebugAgg.Acc.resets.get() - resets >=4);
assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably 1, but could be higher
// Now with a numeric field
// But if we need to calculate domains anyway, it probably makes sense to calculate most stats in the 2nd phase (along with sub-facets)
creates = DebugAgg.Acc.creates.get();
resets = DebugAgg.Acc.resets.get();
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{f1:{terms:{${terms_method} field:${num_d}, limit:-1, facet:{ x:'debug()' , y:{terms:${where_s}} } }}}"
)
, "facets=="
);
assertEquals(1, DebugAgg.Acc.creates.get() - creates);
assertTrue( DebugAgg.Acc.resets.get() - resets >=4);
assertTrue( DebugAgg.Acc.last.numSlots <= 2 ); // probably 1, but could be higher
}
//////////////////////////////////////////////////////////////// end phase testing
}
@Test