SOLR-11317: min/max aggs use integral values for integral fields

This commit is contained in:
yonik 2017-09-05 13:33:08 -04:00
parent 96150badce
commit 723ca96bc0
4 changed files with 213 additions and 41 deletions

View File

@ -83,6 +83,10 @@ New Features
* SOLR-11244: Query DSL for Solr (Cao Manh Dat)
* SOLR-11317: JSON Facet API: min/max aggregations on numeric fields are now typed better so int/long
fields return an appropriate integral type rather than a double. (yonik)
Bug Fixes
----------------------

View File

@ -25,7 +25,9 @@ import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LongValues;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.StrFieldSource;
import org.apache.solr.search.function.FieldNameValueSource;
@ -48,29 +50,39 @@ public class MinMaxAgg extends SimpleAggValueSource {
String field = ((FieldNameValueSource)vs).getFieldName();
sf = fcontext.qcontext.searcher().getSchema().getField(field);
vs = sf.getType().getValueSource(sf, null); // temporary implementation to make existing code work
if (sf.multiValued() || sf.getType().multiValuedFieldCache()) {
vs = null;
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "min/max aggregations can't be used on multi-valued field " + field);
} else {
vs = sf.getType().getValueSource(sf, null);
}
}
if (vs instanceof StrFieldSource) {
if (sf.multiValued() || sf.getType().multiValuedFieldCache()) {
if (sf.hasDocValues()) {
// dv
} else {
// uif
}
} else {
return new SingleValuedOrdAcc(fcontext, sf, numSlots);
}
// Since functions don't currently have types, we rely on the type of the field
if (sf != null && sf.getType().getNumberType() != null) {
switch (sf.getType().getNumberType()) {
case FLOAT:
case DOUBLE:
return new DFuncAcc(vs, fcontext, numSlots);
case INTEGER:
case LONG:
case DATE:
return new LFuncAcc(vs, fcontext, numSlots);
}
}
// numeric functions
return new ValSlotAcc(vs, fcontext, numSlots);
return new DFuncAcc(vs, fcontext, numSlots);
}
@Override
public FacetMerger createFacetMerger(Object prototype) {
if (prototype instanceof Number)
return new NumericMerger();
if (prototype instanceof Double)
return new NumericMerger(); // still use NumericMerger to handle NaN?
else if (prototype instanceof Comparable) {
return new ComparableMerger();
} else {
@ -122,8 +134,8 @@ public class MinMaxAgg extends SimpleAggValueSource {
}
}
class ValSlotAcc extends DoubleFuncSlotAcc {
public ValSlotAcc(ValueSource values, FacetContext fcontext, int numSlots) {
class DFuncAcc extends DoubleFuncSlotAcc {
public DFuncAcc(ValueSource values, FacetContext fcontext, int numSlots) {
super(values, fcontext, numSlots, Double.NaN);
}
@ -149,6 +161,66 @@ public class MinMaxAgg extends SimpleAggValueSource {
}
}
class LFuncAcc extends LongFuncSlotAcc {
FixedBitSet exists;
public LFuncAcc(ValueSource values, FacetContext fcontext, int numSlots) {
super(values, fcontext, numSlots, 0);
exists = new FixedBitSet(numSlots);
}
@Override
public void collect(int doc, int slotNum) throws IOException {
long val = values.longVal(doc);
if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query
long currVal = result[slotNum];
if (currVal == 0 && !exists.get(slotNum)) {
exists.set(slotNum);
result[slotNum] = val;
} else if (Long.compare(val, currVal) * minmax < 0) {
result[slotNum] = val;
}
}
@Override
public Object getValue(int slot) {
long val = result[slot];
if (val == 0 && exists.get(slot)) {
return null;
} else {
return val;
}
}
@Override
public void resize(Resizer resizer) {
super.resize(resizer);
exists = resizer.resize(exists);
}
@Override
public int compare(int slotA, int slotB) {
long a = result[slotA];
long b = result[slotB];
boolean ea = a != 0 || exists.get(slotA);
boolean eb = b != 0 || exists.get(slotB);
if (ea != eb) {
if (ea) return 1; // a exists and b doesn't TODO: we need context to be able to sort missing last! SOLR-10618
if (eb) return -1; // b exists and a is missing
}
return Long.compare(a, b);
}
@Override
public void reset() {
super.reset();
exists.clear(0, exists.length());
}
}
abstract class OrdAcc extends SlotAcc {
final static int MISSING = -1;

View File

@ -16,14 +16,6 @@
*/
package org.apache.solr.search.facet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
import java.io.Closeable;
import java.io.IOException;
import java.lang.reflect.Array;
@ -32,6 +24,16 @@ import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
/**
* Accumulates statistics separated by a slot number.
* There is a separate statistic per slot. The slot is usually an ordinal into a set of values, e.g. tracking a count
@ -140,6 +142,38 @@ public abstract class SlotAcc implements Closeable {
return values;
}
public long[] resize(long[] old, long defaultValue) {
long[] values = new long[getNewSize()];
if (defaultValue != 0) {
Arrays.fill(values, 0, values.length, defaultValue);
}
for (int i = 0; i < old.length; i++) {
long val = old[i];
if (val != defaultValue) {
int newSlot = getNewSlot(i);
if (newSlot >= 0) {
values[newSlot] = val;
}
}
}
return values;
}
public FixedBitSet resize(FixedBitSet old) {
FixedBitSet values = new FixedBitSet(getNewSize());
int oldSize = old.length();
for(int oldSlot = 0;;) {
oldSlot = values.nextSetBit(oldSlot);
if (oldSlot == DocIdSetIterator.NO_MORE_DOCS) break;
int newSlot = getNewSlot(oldSlot);
values.set(newSlot);
if (++oldSlot >= oldSize) break;
}
return values;
}
public <T> T[] resize(T[] old, T defaultValue) {
T[] values = (T[]) Array.newInstance(old.getClass().getComponentType(), getNewSize());
if (defaultValue != null) {
@ -222,6 +256,40 @@ abstract class DoubleFuncSlotAcc extends FuncSlotAcc {
}
}
abstract class LongFuncSlotAcc extends FuncSlotAcc {
long[] result;
long initialValue;
public LongFuncSlotAcc(ValueSource values, FacetContext fcontext, int numSlots, long initialValue) {
super(values, fcontext, numSlots);
this.initialValue = initialValue;
result = new long[numSlots];
if (initialValue != 0) {
reset();
}
}
@Override
public int compare(int slotA, int slotB) {
return Long.compare(result[slotA], result[slotB]);
}
@Override
public Object getValue(int slot) {
return result[slot];
}
@Override
public void reset() {
Arrays.fill(result, initialValue);
}
@Override
public void resize(Resizer resizer) {
result = resizer.resize(result, initialValue);
}
}
abstract class IntSlotAcc extends SlotAcc {
int[] result; // use LongArray32
int initialValue;

View File

@ -467,28 +467,29 @@ public class TestJsonFacets extends SolrTestCaseHS {
// single valued strings
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_s", "cat_s","cat_s", "where_s","where_s", "num_d","num_d", "num_i","num_i", "super_s","super_s", "val_b","val_b", "date","date_dt", "sparse_s","sparse_s" ,"multi_ss","multi_ss") );
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_s", "cat_s","cat_s", "where_s","where_s", "num_d","num_d", "num_i","num_i", "num_l","long_l", "super_s","super_s", "val_b","val_b", "date","date_dt", "sparse_s","sparse_s" ,"multi_ss","multi_ss") );
// multi-valued strings, long/float substitute for int/double
doStatsTemplated(client, params(p, "facet","true", "rows","0", "noexist","noexist_ss", "cat_s","cat_ss", "where_s","where_ss", "num_d","num_f", "num_i","num_l", "num_is","num_ls", "num_fs", "num_ds", "super_s","super_ss", "val_b","val_b", "date","date_dt", "sparse_s","sparse_ss", "multi_ss","multi_ss") );
doStatsTemplated(client, params(p, "facet","true", "rows","0", "noexist","noexist_ss", "cat_s","cat_ss", "where_s","where_ss", "num_d","num_f", "num_i","num_l", "num_l","long_l", "num_is","num_ls", "num_fs", "num_ds", "super_s","super_ss", "val_b","val_b", "date","date_dt", "sparse_s","sparse_ss", "multi_ss","multi_ss") );
// multi-valued strings, method=dv for terms facets
doStatsTemplated(client, params(p, "terms_method", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss"));
doStatsTemplated(client, params(p, "terms_method", "method:dv,", "rows", "0", "noexist", "noexist_ss", "cat_s", "cat_ss", "where_s", "where_ss", "num_d", "num_f", "num_i", "num_l", "num_l","long_l","super_s", "super_ss", "val_b", "val_b", "date", "date_dt", "sparse_s", "sparse_ss", "multi_ss", "multi_ss"));
// single valued docvalues for strings, and single valued numeric doc values for numeric fields
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sd", "cat_s","cat_sd", "where_s","where_sd", "num_d","num_dd", "num_i","num_id", "num_is","num_lds", "num_fs","num_dds", "super_s","super_sd", "val_b","val_b", "date","date_dtd", "sparse_s","sparse_sd" ,"multi_ss","multi_sds") );
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sd", "cat_s","cat_sd", "where_s","where_sd", "num_d","num_dd", "num_i","num_id", "num_is","num_lds", "num_l","long_ld", "num_fs","num_dds", "super_s","super_sd", "val_b","val_b", "date","date_dtd", "sparse_s","sparse_sd" ,"multi_ss","multi_sds") );
// multi-valued docvalues
FacetFieldProcessorByArrayDV.unwrap_singleValued_multiDv = false; // better multi-valued coverage
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sds", "cat_s","cat_sds", "where_s","where_sds", "num_d","num_d", "num_i","num_i", "num_is","num_ids", "num_fs","num_fds", "super_s","super_sds", "val_b","val_b", "date","date_dtds", "sparse_s","sparse_sds" ,"multi_ss","multi_sds") );
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sds", "cat_s","cat_sds", "where_s","where_sds", "num_d","num_d", "num_i","num_i", "num_is","num_ids", "num_l","long_ld", "num_fs","num_fds", "super_s","super_sds", "val_b","val_b", "date","date_dtds", "sparse_s","sparse_sds" ,"multi_ss","multi_sds") );
// multi-valued docvalues
FacetFieldProcessorByArrayDV.unwrap_singleValued_multiDv = true;
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sds", "cat_s","cat_sds", "where_s","where_sds", "num_d","num_d", "num_i","num_i", "num_is","num_ids", "num_fs","num_fds", "super_s","super_sds", "val_b","val_b", "date","date_dtds", "sparse_s","sparse_sds" ,"multi_ss","multi_sds") );
doStatsTemplated(client, params(p, "rows","0", "noexist","noexist_sds", "cat_s","cat_sds", "where_s","where_sds", "num_d","num_d", "num_i","num_i", "num_is","num_ids", "num_l","long_ld", "num_fs","num_fds", "super_s","super_sds", "val_b","val_b", "date","date_dtds", "sparse_s","sparse_sds" ,"multi_ss","multi_sds") );
}
public static void doStatsTemplated(Client client, ModifiableSolrParams p) throws Exception {
p.set("Z_num_i", "Z_" + p.get("num_i") );
p.set("Z_num_l", "Z_" + p.get("num_l") );
p.set("sparse_num_d", "sparse_" + p.get("num_d") );
if (p.get("num_is") == null) p.add("num_is","num_is");
if (p.get("num_fs") == null) p.add("num_fs","num_fs");
@ -528,6 +529,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
String num_is = m.expand("${num_is}");
String num_fs = m.expand("${num_fs}");
String Z_num_i = m.expand("${Z_num_i}");
String Z_num_l = m.expand("${Z_num_l}");
String val_b = m.expand("${val_b}");
String date = m.expand("${date}");
String super_s = m.expand("${super_s}");
@ -553,13 +555,13 @@ public class TestJsonFacets extends SolrTestCaseHS {
iclient.add(doc, null);
iclient.add(doc, null);
iclient.add(doc, null); // a couple of deleted docs
iclient.add(sdoc("id", "2", cat_s, "B", where_s, "NJ", num_d, "-9", num_i, "-5", num_is,"3",num_is,"-1", num_fs,"3",num_fs,"-1.5", super_s,"superman", date,"2002-02-02T02:02:02Z", val_b, "false" , multi_ss,"a", multi_ss,"b" , Z_num_i, "0"), null);
iclient.add(sdoc("id", "2", cat_s, "B", where_s, "NJ", num_d, "-9", num_i, "-5", num_is,"3",num_is,"-1", num_fs,"3",num_fs,"-1.5", super_s,"superman", date,"2002-02-02T02:02:02Z", val_b, "false" , multi_ss,"a", multi_ss,"b" , Z_num_i, "0", Z_num_l,"0"), null);
iclient.add(sdoc("id", "3"), null);
iclient.commit();
iclient.add(sdoc("id", "4", cat_s, "A", where_s, "NJ", num_d, "2", sparse_num_d,"-4",num_i, "3", num_is,"0",num_is,"3", num_fs,"0", num_fs,"3", super_s,"spiderman", date,"2003-03-03T03:03:03Z" , multi_ss, "b", Z_num_i, ""+Integer.MIN_VALUE), null);
iclient.add(sdoc("id", "4", cat_s, "A", where_s, "NJ", num_d, "2", sparse_num_d,"-4",num_i, "3", num_is,"0",num_is,"3", num_fs,"0", num_fs,"3", super_s,"spiderman", date,"2003-03-03T03:03:03Z" , multi_ss, "b", Z_num_i, ""+Integer.MIN_VALUE, Z_num_l,Long.MIN_VALUE), null);
iclient.add(sdoc("id", "5", cat_s, "B", where_s, "NJ", num_d, "11", num_i, "7", num_is,"0", num_fs,"0", super_s,"batman" , date,"2001-02-03T01:02:03Z" ,sparse_s,"two", multi_ss, "a"), null);
iclient.commit();
iclient.add(sdoc("id", "6", cat_s, "B", where_s, "NY", num_d, "-5", num_i, "-5", num_is,"-1", num_fs,"-1.5", super_s,"hulk" , date,"2002-03-01T03:02:01Z" , multi_ss, "b", multi_ss, "a", Z_num_i, ""+Integer.MAX_VALUE), null);
iclient.add(sdoc("id", "6", cat_s, "B", where_s, "NY", num_d, "-5", num_i, "-5", num_is,"-1", num_fs,"-1.5", super_s,"hulk" , date,"2002-03-01T03:02:01Z" , multi_ss, "b", multi_ss, "a", Z_num_i, ""+Integer.MAX_VALUE, Z_num_l,Long.MAX_VALUE), null);
iclient.commit();
client.commit();
@ -685,7 +687,18 @@ public class TestJsonFacets extends SolrTestCaseHS {
", f2:{ 'buckets':[{ val:'B', count:3, n1:-2.0}, { val:'A', count:2, n1:6.0 }]} }"
);
// facet on numbers to test resize from hashing (may need to be sorting by the metric to test that)
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{" +
" f1:{${terms} type:field, field:${num_is}, facet:{a:'min(${num_i})'}, sort:'a asc' }" +
",f2:{${terms} type:field, field:${num_is}, facet:{a:'max(${num_i})'}, sort:'a desc' }" +
"}"
)
, "facets=={count:6 " +
",f1:{ buckets:[{val:-1,count:2,a:-5},{val:3,count:2,a:-5},{val:-5,count:1,a:2},{val:2,count:1,a:2},{val:0,count:2,a:3}, ] } " +
",f2:{ buckets:[{val:0,count:2,a:7},{val:3,count:2,a:3},{val:-5,count:1,a:2},{val:2,count:1,a:2},{val:-1,count:2,a:-5}, ] } " +
"}"
);
// percentiles 0,10,50,90,100
// catA: 2.0 2.2 3.0 3.8 4.0
@ -983,16 +996,20 @@ public class TestJsonFacets extends SolrTestCaseHS {
// stats at top level
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', avg2:'avg(def(${num_d},0))', min1:'min(${num_d})', max1:'max(${num_d})'" +
, "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', avg2:'avg(def(${num_d},0))', mind:'min(${num_d})', maxd:'max(${num_d})'" +
", numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})'" +
", where_hll:'hll(${where_s})', hll_num_i:'hll(${num_i})', hll_num_d:'hll(${num_d})', hll_date:'hll(${date})'" +
", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)', variance:'variance(${num_d})', stddev:'stddev(${num_d})' }"
", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)', variance:'variance(${num_d})', stddev:'stddev(${num_d})'" +
", mini:'min(${num_i})', maxi:'max(${num_i})'" +
" }"
)
, "facets=={ 'count':6, " +
"sum1:3.0, sumsq1:247.0, avg1:0.6, avg2:0.5, min1:-9.0, max1:11.0" +
"sum1:3.0, sumsq1:247.0, avg1:0.6, avg2:0.5, mind:-9.0, maxd:11.0" +
", numwhere:2, unique_num_i:4, unique_num_d:5, unique_date:5" +
", where_hll:2, hll_num_i:4, hll_num_d:5, hll_date:5" +
", med:2.0, perc:[-9.0,2.0,11.0], variance:49.04, stddev:7.002856560004639}"
", med:2.0, perc:[-9.0,2.0,11.0], variance:49.04, stddev:7.002856560004639" +
", mini:-5, maxi:7" +
"}"
);
// stats at top level, no matches
@ -1307,16 +1324,26 @@ public class TestJsonFacets extends SolrTestCaseHS {
"}"
);
// test 0, min/max int
// test 0, min/max int/long
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{" +
" u : 'unique(${Z_num_i})'" +
", u2 : 'unique(${Z_num_l})'" +
", min1 : 'min(${Z_num_i})', max1 : 'max(${Z_num_i})'" +
", min2 : 'min(${Z_num_l})', max2 : 'max(${Z_num_l})'" +
", f1:{${terms} type:field, field:${Z_num_i} }" +
", f2:{${terms} type:field, field:${Z_num_l} }" +
"}"
)
, "facets=={count:6 " +
",u:3" +
",u2:3" +
",min1:" + Integer.MIN_VALUE +
",max1:" + Integer.MAX_VALUE +
",min2:" + Long.MIN_VALUE +
",max2:" + Long.MAX_VALUE +
",f1:{ buckets:[{val:" + Integer.MIN_VALUE + ",count:1},{val:0,count:1},{val:" + Integer.MAX_VALUE+",count:1}]} " +
",f2:{ buckets:[{val:" + Long.MIN_VALUE + ",count:1},{val:0,count:1},{val:" + Long.MAX_VALUE+",count:1}]} " +
"}"
);
@ -1394,11 +1421,12 @@ public class TestJsonFacets extends SolrTestCaseHS {
// test acc reuse (i.e. reset() method). This is normally used for stats that are not calculated in the first phase,
// currently non-sorting stats.
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{f1:{type:terms, field:'${cat_s}', facet:{h:'hll(${where_s})' , u:'unique(${where_s})', mind:'min(${num_d})', maxd:'max(${num_d})', sumd:'sum(${num_d})', avgd:'avg(${num_d})', variance:'variance(${num_d})', stddev:'stddev(${num_d})' } }}"
, "json.facet", "{f1:{type:terms, field:'${cat_s}', facet:{h:'hll(${where_s})' , u:'unique(${where_s})', mind:'min(${num_d})', maxd:'max(${num_d})', mini:'min(${num_i})', maxi:'max(${num_i})'" +
", sumd:'sum(${num_d})', avgd:'avg(${num_d})', variance:'variance(${num_d})', stddev:'stddev(${num_d})' } }}"
)
, "facets=={ 'count':6, " +
"'f1':{ buckets:[{val:B, count:3, h:2, u:2, mind:-9.0, maxd:11.0, sumd:-3.0, avgd:-1.0, variance:74.66666666666667, stddev:8.640987597877148}," +
" {val:A, count:2, h:2, u:2, mind:2.0, maxd:4.0, sumd:6.0, avgd:3.0, variance:1.0, stddev:1.0}] } } "
"'f1':{ buckets:[{val:B, count:3, h:2, u:2, mind:-9.0, maxd:11.0, mini:-5, maxi:7, sumd:-3.0, avgd:-1.0, variance:74.66666666666667, stddev:8.640987597877148}," +
" {val:A, count:2, h:2, u:2, mind:2.0, maxd:4.0, mini:2, maxi:3, sumd:6.0, avgd:3.0, variance:1.0, stddev:1.0}] } } "
);