From 3145f781b3b55ebbd6374f40db7fe7b676b8d1af Mon Sep 17 00:00:00 2001 From: yonik Date: Mon, 17 Apr 2017 22:30:29 -0400 Subject: [PATCH] SOLR-10082: JSON Facet API, add stddev and variance functions --- solr/CHANGES.txt | 4 + .../apache/solr/search/ValueSourceParser.java | 23 ++- .../org/apache/solr/search/facet/SlotAcc.java | 181 ++++++++++++++---- .../apache/solr/search/facet/StddevAgg.java | 66 +++++++ .../apache/solr/search/facet/VarianceAgg.java | 65 +++++++ .../apache/solr/search/QueryEqualityTest.java | 3 +- .../solr/search/facet/TestJsonFacets.java | 19 +- 7 files changed, 311 insertions(+), 50 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/search/facet/StddevAgg.java create mode 100644 solr/core/src/java/org/apache/solr/search/facet/VarianceAgg.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index f2c1c8ad657..294cdccb646 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -167,6 +167,10 @@ New Features initializing CloudSolrClient would not work if you have collection aliases on older versions of Solr server that doesn't support LISTALIASES. (Ishan Chattopadhyaya, Noble Paul) +* SOLR-10082: Variance and Standard Deviation aggregators for the JSON Facet API. + Example: json.facet={x:"stddev(field1)", y:"variance(field2)"} + (Rustam Hashimov, yonik) + Optimizations ---------------------- diff --git a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java index c2b8a5d4118..b802c41d7b3 100644 --- a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java +++ b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java @@ -58,9 +58,11 @@ import org.apache.solr.search.facet.HLLAgg; import org.apache.solr.search.facet.MaxAgg; import org.apache.solr.search.facet.MinAgg; import org.apache.solr.search.facet.PercentileAgg; +import org.apache.solr.search.facet.StddevAgg; import org.apache.solr.search.facet.SumAgg; import org.apache.solr.search.facet.SumsqAgg; import org.apache.solr.search.facet.UniqueAgg; +import org.apache.solr.search.facet.VarianceAgg; import org.apache.solr.search.function.CollapseScoreFunction; import org.apache.solr.search.function.OrdFieldSource; import org.apache.solr.search.function.ReverseOrdFieldSource; @@ -931,14 +933,21 @@ public abstract class ValueSourceParser implements NamedListInitializedPlugin { } }); - /*** - addParser("agg_stdev", new ValueSourceParser() { - @Override - public ValueSource parse(FunctionQParser fp) throws SyntaxError { - return null; - } + addParser("agg_variance", new ValueSourceParser() { + @Override + public ValueSource parse(FunctionQParser fp) throws SyntaxError { + return new VarianceAgg(fp.parseValueSource()); + } }); - + + addParser("agg_stddev", new ValueSourceParser() { + @Override + public ValueSource parse(FunctionQParser fp) throws SyntaxError { + return new StddevAgg(fp.parseValueSource()); + } + }); + + /*** addParser("agg_multistat", new ValueSourceParser() { @Override public ValueSource parse(FunctionQParser fp) throws SyntaxError { diff --git a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java index 3da3541b7ed..1d8aecb2e5f 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java +++ b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java @@ -33,7 +33,7 @@ import java.util.Iterator; import java.util.List; /** - * Accumulates statistics separated by a slot number. + * Accumulates statistics separated by a slot number. * There is a separate statistic per slot. The slot is usually an ordinal into a set of values, e.g. tracking a count * frequency per term. * Sometimes there doesn't need to be a slot distinction, in which case there is just one nominal slot. @@ -46,8 +46,7 @@ public abstract class SlotAcc implements Closeable { this.fcontext = fcontext; } - public void setNextReader(LeafReaderContext readerContext) throws IOException { - } + public void setNextReader(LeafReaderContext readerContext) throws IOException {} public abstract void collect(int doc, int slot) throws IOException; @@ -61,7 +60,7 @@ public abstract class SlotAcc implements Closeable { int segBase = 0; int segMax; int adjustedMax = 0; - for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) { + for (DocIterator docsIt = docs.iterator(); docsIt.hasNext();) { final int doc = docsIt.nextDoc(); if (doc >= adjustedMax) { do { @@ -78,12 +77,11 @@ public abstract class SlotAcc implements Closeable { setNextReader(ctx); } count++; - collect(doc - segBase, slot); // per-seg collectors + collect(doc - segBase, slot); // per-seg collectors } return count; } - public abstract int compare(int slotA, int slotB); public abstract Object getValue(int slotNum) throws IOException; @@ -101,8 +99,7 @@ public abstract class SlotAcc implements Closeable { public abstract void resize(Resizer resizer); @Override - public void close() throws IOException { - } + public void close() throws IOException {} public static abstract class Resizer { public abstract int getNewSize(); @@ -181,15 +178,14 @@ abstract class FuncSlotAcc extends SlotAcc { } } - -// have a version that counts the number of times a Slot has been hit? (for avg... what else?) +// have a version that counts the number of times a Slot has been hit? (for avg... what else?) // TODO: make more sense to have func as the base class rather than double? // double-slot-func -> func-slot -> slot -> acc // double-slot-func -> double-slot -> slot -> acc abstract class DoubleFuncSlotAcc extends FuncSlotAcc { - double[] result; // TODO: use DoubleArray + double[] result; // TODO: use DoubleArray double initialValue; public DoubleFuncSlotAcc(ValueSource values, FacetContext fcontext, int numSlots) { @@ -210,7 +206,6 @@ abstract class DoubleFuncSlotAcc extends FuncSlotAcc { return Double.compare(result[slotA], result[slotB]); } - @Override public Object getValue(int slot) { return result[slot]; @@ -228,7 +223,7 @@ abstract class DoubleFuncSlotAcc extends FuncSlotAcc { } abstract class IntSlotAcc extends SlotAcc { - int[] result; // use LongArray32 + int[] result; // use LongArray32 int initialValue; public IntSlotAcc(FacetContext fcontext, int numSlots, int initialValue) { @@ -261,15 +256,13 @@ abstract class IntSlotAcc extends SlotAcc { } } - - class SumSlotAcc extends DoubleFuncSlotAcc { public SumSlotAcc(ValueSource values, FacetContext fcontext, int numSlots) { super(values, fcontext, numSlots); } public void collect(int doc, int slotNum) throws IOException { - double val = values.doubleVal(doc); // todo: worth trying to share this value across multiple stats that need it? + double val = values.doubleVal(doc); // todo: worth trying to share this value across multiple stats that need it? result[slotNum] += val; } } @@ -287,8 +280,6 @@ class SumsqSlotAcc extends DoubleFuncSlotAcc { } } - - class MinSlotAcc extends DoubleFuncSlotAcc { public MinSlotAcc(ValueSource values, FacetContext fcontext, int numSlots) { super(values, fcontext, numSlots, Double.NaN); @@ -297,10 +288,10 @@ class MinSlotAcc extends DoubleFuncSlotAcc { @Override public void collect(int doc, int slotNum) throws IOException { double val = values.doubleVal(doc); - if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query + if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query double currMin = result[slotNum]; - if (!(val >= currMin)) { // val>=currMin will be false for staring value: val>=NaN + if (!(val >= currMin)) { // val>=currMin will be false for staring value: val>=NaN result[slotNum] = val; } } @@ -314,17 +305,16 @@ class MaxSlotAcc extends DoubleFuncSlotAcc { @Override public void collect(int doc, int slotNum) throws IOException { double val = values.doubleVal(doc); - if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query + if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query double currMax = result[slotNum]; - if (!(val <= currMax)) { // reversed order to handle NaN + if (!(val <= currMax)) { // reversed order to handle NaN result[slotNum] = val; } } } - class AvgSlotAcc extends DoubleFuncSlotAcc { int[] counts; @@ -336,7 +326,7 @@ class AvgSlotAcc extends DoubleFuncSlotAcc { @Override public void reset() { super.reset(); - for (int i=0; i numberList = (List)facetResult; + this.count += numberList.get(0).longValue(); + this.sumSq += numberList.get(1).doubleValue(); + this.sum += numberList.get(2).doubleValue(); + } + + @Override + public Object getMergedResult() { + return this.getDouble(); + } + + @Override + protected double getDouble() { + double val = count == 0 ? 0.0d : Math.sqrt((sumSq/count)-Math.pow(sum/count, 2)); + return val; + } + }; +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/VarianceAgg.java b/solr/core/src/java/org/apache/solr/search/facet/VarianceAgg.java new file mode 100644 index 00000000000..ec6955f4663 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/VarianceAgg.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.search.facet; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.queries.function.ValueSource; + + +public class VarianceAgg extends SimpleAggValueSource { + public VarianceAgg(ValueSource vs) { + super("variance", vs); + } + + @Override + public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) throws IOException { + return new VarianceSlotAcc(getArg(), fcontext, numSlots); + } + + @Override + public FacetMerger createFacetMerger(Object prototype) { + return new Merger(); + } + + private static class Merger extends FacetDoubleMerger { + long count; + double sumSq; + double sum; + + @Override + @SuppressWarnings("unchecked") + public void merge(Object facetResult, Context mcontext1) { + List numberList = (List)facetResult; + this.count += numberList.get(0).longValue(); + this.sumSq += numberList.get(1).doubleValue(); + this.sum += numberList.get(2).doubleValue(); + } + + @Override + public Object getMergedResult() { + return this.getDouble(); + } + + @Override + protected double getDouble() { + double val = count == 0 ? 0.0d : (sumSq/count)-Math.pow(sum/count, 2); + return val; + } + }; +} diff --git a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java index a9b5c6ee198..22d758614e3 100644 --- a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java +++ b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java @@ -1099,7 +1099,8 @@ public class QueryEqualityTest extends SolrTestCaseJ4 { assertFuncEquals("agg_hll(foo_i)", "agg_hll(foo_i)"); assertFuncEquals("agg_sumsq(foo_i)", "agg_sumsq(foo_i)"); assertFuncEquals("agg_percentile(foo_i,50)", "agg_percentile(foo_i,50)"); - // assertFuncEquals("agg_stdev(foo_i)", "agg_stdev(foo_i)"); + assertFuncEquals("agg_variance(foo_i)", "agg_variance(foo_i)"); + assertFuncEquals("agg_stddev(foo_i)", "agg_stddev(foo_i)"); // assertFuncEquals("agg_multistat(foo_i)", "agg_multistat(foo_i)"); } diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java index 95c403a78eb..bad3de5485f 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java @@ -529,6 +529,7 @@ public class TestJsonFacets extends SolrTestCaseHS { " , f2:{${terms} type:terms, field:'${cat_s}', sort:'x desc', facet:{x:'max(${num_d})'} } " + " , f3:{${terms} type:terms, field:'${cat_s}', sort:'x desc', facet:{x:'unique(${where_s})'} } " + " , f4:{${terms} type:terms, field:'${cat_s}', sort:'x desc', facet:{x:'hll(${where_s})'} } " + + " , f5:{${terms} type:terms, field:'${cat_s}', sort:'x desc', facet:{x:'variance(${num_d})'} } " + "}" ) , "facets=={ 'count':6, " + @@ -536,6 +537,7 @@ public class TestJsonFacets extends SolrTestCaseHS { ", f2:{ 'buckets':[{ val:'B', count:3, x:11.0 }, { val:'A', count:2, x:4.0 }]} " + ", f3:{ 'buckets':[{ val:'A', count:2, x:2 }, { val:'B', count:3, x:2 }]} " + ", f4:{ 'buckets':[{ val:'A', count:2, x:2 }, { val:'B', count:3, x:2 }]} " + + ", f5:{ 'buckets':[{ val:'B', count:3, x:74.6666666666666 }, { val:'A', count:2, x:1.0 }]} " + "}" ); @@ -845,19 +847,18 @@ public class TestJsonFacets extends SolrTestCaseHS { ); - // stats at top level client.testJQ(params(p, "q", "*:*" , "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', avg2:'avg(def(${num_d},0))', min1:'min(${num_d})', max1:'max(${num_d})'" + ", numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})'" + ", where_hll:'hll(${where_s})', hll_num_i:'hll(${num_i})', hll_num_d:'hll(${num_d})', hll_date:'hll(${date})'" + - ", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }" + ", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)', variance:'variance(${num_d})', stddev:'stddev(${num_d})' }" ) , "facets=={ 'count':6, " + "sum1:3.0, sumsq1:247.0, avg1:0.6, avg2:0.5, min1:-9.0, max1:11.0" + ", numwhere:2, unique_num_i:4, unique_num_d:5, unique_date:5" + ", where_hll:2, hll_num_i:4, hll_num_d:5, hll_date:5" + - ", med:2.0, perc:[-9.0,2.0,11.0] }" + ", med:2.0, perc:[-9.0,2.0,11.0], variance:49.04, stddev:7.002856560004639}" ); // stats at top level, no matches @@ -865,21 +866,20 @@ public class TestJsonFacets extends SolrTestCaseHS { , "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})'" + ", numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})'" + ", where_hll:'hll(${where_s})', hll_num_i:'hll(${num_i})', hll_num_d:'hll(${num_d})', hll_date:'hll(${date})'" + - ", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }" + ", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)', variance:'variance(${num_d})', stddev:'stddev(${num_d})' }" ) , "facets=={count:0 " + - "/* ,sum1:0.0, sumsq1:0.0, avg1:0.0, min1:'NaN', max1:'NaN', numwhere:0 */" + + "\n// ,sum1:0.0, sumsq1:0.0, avg1:0.0, min1:'NaN', max1:'NaN', numwhere:0 \n" + " }" ); - // stats at top level, matching documents, but no values in the field // NOTE: this represents the current state of what is returned, not the ultimate desired state. client.testJQ(params(p, "q", "id:3" , "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})'" + ", numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})'" + ", where_hll:'hll(${where_s})', hll_num_i:'hll(${num_i})', hll_num_d:'hll(${num_d})', hll_date:'hll(${date})'" + - ", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }" + ", med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)', variance:'variance(${num_d})', stddev:'stddev(${num_d})' }" ) , "facets=={count:1 " + ",sum1:0.0," + @@ -894,11 +894,12 @@ public class TestJsonFacets extends SolrTestCaseHS { " where_hll:0," + " hll_num_i:0," + " hll_num_d:0," + - " hll_date:0" + + " hll_date:0," + + " variance:0.0," + + " stddev:0.0" + " }" ); - // // tests on a multi-valued field with actual multiple values, just to ensure that we are // using a multi-valued method for the rest of the tests when appropriate.