SOLR-11725: use corrected sample formula to calc stdDev in JSON facets

* Both stdDev and variance now use the corrected sample formula to compute
  their values. This is consistent with StatsComponent.
This commit is contained in:
Munendra S N 2020-03-19 18:57:56 +05:30
parent 126e4a61b8
commit e9d6c24fb7
9 changed files with 50 additions and 37 deletions

View File

@ -18,6 +18,9 @@ Improvements
* SOLR-14223: PKI Auth can bootstrap from existing key files instead of creating new keys on startup (Mike Drob)
* SOLR-11725: Use corrected sample formula for computing stdDev and variance in JSON aggregations
(hossman, Munendra S N, yonik)
Other Changes
----------------------
* SOLR-10288: Remove non-minified JavaScript from the webapp. (Erik Hatcher, marcussorealheis)

View File

@ -43,8 +43,8 @@ public class AggUtil {
* Computes and returns corrected standard deviation for given values
*/
public static double stdDev(double sumSq, double sum, long count) {
// todo: should we return NAN when count==0?
return count == 0 ? 0 : Math.sqrt(((count * sumSq) - (sum * sum)) / (count * (count - 1.0D)));
// todo: should we return NAN when count==0 or count==1?
return count <= 1 ? 0.0d : Math.sqrt(((count * sumSq) - (sum * sum)) / (count * (count - 1.0D)));
}
/**
@ -59,7 +59,7 @@ public class AggUtil {
* Computes and returns corrected variance for given values
*/
public static double variance(double sumSq, double sum, long count) {
// todo: should we return NAN when count==0?
return count == 0 ? 0 : ((count * sumSq) - (sum * sum)) / (count * (count - 1.0D));
// todo: should we return NAN when count==0 or count==1?
return count <= 1 ? 0.0d : ((count * sumSq) - (sum * sum)) / (count * (count - 1.0D));
}
}

View File

@ -484,7 +484,7 @@ class VarianceSlotAcc extends DoubleFuncSlotAcc {
}
private double variance(int slot) {
return AggUtil.uncorrectedVariance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
return AggUtil.variance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
}
@Override
@ -541,7 +541,7 @@ class StddevSlotAcc extends DoubleFuncSlotAcc {
}
private double stdDev(int slot) {
return AggUtil.uncorrectedStdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
return AggUtil.stdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
}
@Override

View File

@ -86,7 +86,7 @@ public class StddevAgg extends SimpleAggValueSource {
@Override
protected double getDouble() {
return AggUtil.uncorrectedStdDev(sumSq, sum, count);
return AggUtil.stdDev(sumSq, sum, count);
}
}
@ -98,7 +98,7 @@ public class StddevAgg extends SimpleAggValueSource {
@Override
protected double computeVal(int slot) {
return AggUtil.uncorrectedStdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
return AggUtil.stdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
}
}
@ -110,7 +110,7 @@ public class StddevAgg extends SimpleAggValueSource {
@Override
protected double computeVal(int slot) {
return AggUtil.uncorrectedStdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
return AggUtil.stdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
}
}
@ -122,7 +122,7 @@ public class StddevAgg extends SimpleAggValueSource {
@Override
protected double computeVal(int slot) {
return AggUtil.uncorrectedStdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
return AggUtil.stdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
}
}
}

View File

@ -85,7 +85,7 @@ public class VarianceAgg extends SimpleAggValueSource {
@Override
protected double getDouble() {
return AggUtil.uncorrectedVariance(sumSq, sum, count);
return AggUtil.variance(sumSq, sum, count);
}
}
@ -97,7 +97,7 @@ public class VarianceAgg extends SimpleAggValueSource {
@Override
protected double computeVal(int slot) {
return AggUtil.uncorrectedVariance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
return AggUtil.variance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
}
}
@ -109,7 +109,7 @@ public class VarianceAgg extends SimpleAggValueSource {
@Override
protected double computeVal(int slot) {
return AggUtil.uncorrectedVariance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
return AggUtil.variance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
}
}
@ -121,7 +121,7 @@ public class VarianceAgg extends SimpleAggValueSource {
@Override
protected double computeVal(int slot) {
return AggUtil.uncorrectedVariance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
return AggUtil.variance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
}
}
}

View File

@ -314,7 +314,7 @@ public class StatsComponentTest extends SolrTestCaseJ4 {
, json ? "//*" : "count(//arr[@name='distinctValues']/*)=8"
, "//double[@name='sumOfSquares'][.='53101.0']"
, "//double[@name='mean'][.='1.125']"
,json ? "//*" : "//double[@name='stddev'][.='87.08852228787508']" // SOLR-11725
,"//double[@name='stddev'][.='87.08852228787508']"
);
assertQ("test statistics values w/fq",
@ -329,8 +329,13 @@ public class StatsComponentTest extends SolrTestCaseJ4 {
, json ? "//*" : "count(//arr[@name='distinctValues']/*)=6"
, "//double[@name='sumOfSquares'][.='43001.0']"
, "//double[@name='mean'][.='19.833333333333332']"
, json ? "//*" : "//double[@name='stddev'][.='90.15634568163611']" // SOLR-11725
,"//double[@name='stddev'][.='90.15634568163611']"
);
assertQ("test stdDev",
req(baseParams, "q", "id:5", "rows", "0")
,"//double[@name='stddev'][.='0.0']"
);
if (!json) { // checking stats.facet makes no sense for json faceting
assertQ("test stats.facet (using boolean facet field)",

View File

@ -235,10 +235,8 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
assertEquals(1.0D, bucket.get("percentile"));
assertEquals(0.475247524752475D, (double) bucket.get("avg"), 0.1E-7);
assertEquals(54.0D, (double) bucket.get("sumsq"), 0.1E-7);
// assertEquals(0.55846323792D, (double) bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
// assertEquals(0.3118811881D, (double) bucket.get("variance"), 0.1E-7); // TODO: SOLR-11725
assertEquals(0.55569169111D, (double) bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
assertEquals(0.3087932556D, (double) bucket.get("variance"), 0.1E-7); // json.facet is using the "uncorrected variance"
assertEquals(0.55846323792D, (double) bucket.get("stddev"), 0.1E-7);
assertEquals(0.3118811881D, (double) bucket.get("variance"), 0.1E-7);
assertEquals(3L, bucket.get("unique"));
assertEquals(3L, bucket.get("hll"));
}
@ -399,10 +397,8 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
assertEquals(483.70000000000016D, (double)aaa0_Bucket.get("percentile"), 0.1E-7);
assertEquals(115.5D, (double) aaa0_Bucket.get("avg"), 0.1E-7);
assertEquals(1.674585E7D, (double) aaa0_Bucket.get("sumsq"), 0.1E-7);
// assertEquals(206.4493184076D, (double) aaa0_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
// assertEquals(42621.32107023412D, (double) aaa0_Bucket.get("variance"), 0.1E-7); // TODO: SOLR-11725
assertEquals(206.1049489944D, (double) aaa0_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
assertEquals(42479.25D, (double) aaa0_Bucket.get("variance"), 0.1E-7); // json.facet is using the "uncorrected variance"
assertEquals(206.4493184076D, (double) aaa0_Bucket.get("stddev"), 0.1E-7);
assertEquals(42621.32107023412D, (double) aaa0_Bucket.get("variance"), 0.1E-7);
assertEquals(284L, aaa0_Bucket.get("unique"));
assertEquals(284L, aaa0_Bucket.get("hll"));
@ -418,10 +414,8 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
assertEquals(1980.0D, tail_Bucket.get("sum"));
assertEquals(22.0D, (double) tail_Bucket.get("avg"), 0.1E-7);
assertEquals(58740.0D, (double) tail_Bucket.get("sumsq"), 0.1E-7);
// assertEquals(13.0599310011D, (double) tail_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
// assertEquals(170.5617977535D, (double) tail_Bucket.get("variance"), 0.1E-7); // TODO: SOLR-11725
assertEquals(12.9871731592D, (double) tail_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
assertEquals(168.666666667D, (double) tail_Bucket.get("variance"), 0.1E-7); // json.facet is using the "uncorrected variance"
assertEquals(13.0599310011D, (double) tail_Bucket.get("stddev"), 0.1E-7);
assertEquals(170.5617977535D, (double) tail_Bucket.get("variance"), 0.1E-7);
assertEquals(45L, tail_Bucket.get("unique"));
assertEquals(45L, tail_Bucket.get("hll"));
@ -439,10 +433,8 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
assertEquals(450.0D, tailB_Bucket.get("sum"));
assertEquals(37.5D, (double) tailB_Bucket.get("avg"), 0.1E-7);
assertEquals(16910.0D, (double) tailB_Bucket.get("sumsq"), 0.1E-7);
// assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
// assertEquals(3.1818181817D, (double) tailB_Bucket.get("variance"), 0.1E-7); // TODO: SOLR-11725
assertEquals(1.70782513D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
assertEquals(2.9166666747D, (double) tailB_Bucket.get("variance"), 0.1E-7); // json.facet is using the "uncorrected variance"
assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7);
assertEquals(3.1818181817D, (double) tailB_Bucket.get("variance"), 0.1E-7);
assertEquals(6L, tailB_Bucket.get("unique"));
assertEquals(6L, tailB_Bucket.get("hll"));

View File

@ -1289,13 +1289,23 @@ public class TestJsonFacets extends SolrTestCaseHS {
", f2:{ 'buckets':[{ val:'B', count:3, x:11.0 }, { val:'A', count:2, x:4.0 }]} " +
", f3:{ 'buckets':[{ val:'A', count:2, x:2 }, { val:'B', count:3, x:2 }]} " +
", f4:{ 'buckets':[{ val:'A', count:2, x:2 }, { val:'B', count:3, x:2 }]} " +
", f5:{ 'buckets':[{ val:'B', count:3, x:74.6666666666666 }, { val:'A', count:2, x:1.0 }]} " +
", f5:{ 'buckets':[{ val:'B', count:3, x:112.0 }, { val:'A', count:2, x:2.0 }]} " +
", f6:{ buckets:[{ val:-9.0, count:1, x:1 }]} " +
", f7:{ buckets:[{ val:B, count:3, x:3 },{ val:A, count:2, x:0 }]} " +
", f8:{ buckets:[{ val:A, count:2, x:2 },{ val:B, count:3, x:0 }]} " +
"}"
);
// test for stdDev and variance of size 1 and 0
client.testJQ(params(p, "q", "id:1", "json.facet", "{n1:'stddev(${num_d})', n2: 'variance(${num_d})'}")
, "facets=={ 'count':1, " +
" n1:0.0, n2:0.0 }"
);
client.testJQ(params(p, "q", "id:3", "json.facet", "{n1:'stddev(${num_d})', n2: 'variance(${num_d})'}")
, "facets=={ 'count':1, " +
" n1:0.0, n2:0.0 }"
);
// test sorting by stat with function
client.testJQ(params(p, "q", "*:*"
, "json.facet", "{f1:{terms:{${terms} field:'${cat_s}', sort:'n1 desc', facet:{n1:'avg(add(${num_d},${num_d}))'} }}" +
@ -1808,7 +1818,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
"sum1:3.0, sumsq1:247.0, avg1:0.6, avg2:0.5, mind:-9.0, maxd:11.0" +
", numwhere:2, unique_num_i:4, unique_num_d:5, unique_date:5" +
", where_hll:2, hll_num_i:4, hll_num_d:5, hll_date:5" +
", med:2.0, perc:[-9.0,2.0,11.0], variance:49.04, stddev:7.002856560004639" +
", med:2.0, perc:[-9.0,2.0,11.0], variance:61.3, stddev:7.829431652425353" +
", mini:-5, maxi:7, missing:4, vals:2" +
"}"
);
@ -1824,7 +1834,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
, "facets=={ 'count':6, " +
"sum1:0.0, sumsq1:51.5, avg1:0.0, mind:-5.0, maxd:3.0" +
", mini:-5, maxi:3, mins:'a', maxs:'b'" +
", stddev:2.537222891273055, variance:6.4375, median:0.0, perc:[-5.0,2.25,3.0]" +
", stddev:2.712405363721075, variance:7.3571428571, median:0.0, perc:[-5.0,2.25,3.0]" +
"}"
);
@ -2314,8 +2324,8 @@ public class TestJsonFacets extends SolrTestCaseHS {
", sumd:'sum(${num_d})', avgd:'avg(${num_d})', variance:'variance(${num_d})', stddev:'stddev(${num_d})', missing:'missing(${multi_ss})', vals:'countvals(${multi_ss})'} }}"
)
, "facets=={ 'count':6, " +
"'f1':{ buckets:[{val:B, count:3, h:2, u:2, mind:-9.0, maxd:11.0, mini:-5, maxi:7, sumd:-3.0, avgd:-1.0, variance:74.66666666666667, stddev:8.640987597877148, missing:0, vals:5}," +
" {val:A, count:2, h:2, u:2, mind:2.0, maxd:4.0, mini:2, maxi:3, sumd:6.0, avgd:3.0, variance:1.0, stddev:1.0, missing:1, vals:1}] } } "
"'f1':{ buckets:[{val:B, count:3, h:2, u:2, mind:-9.0, maxd:11.0, mini:-5, maxi:7, sumd:-3.0, avgd:-1.0, variance:112.0, stddev:10.583005244258363, missing:0, vals:5}," +
" {val:A, count:2, h:2, u:2, mind:2.0, maxd:4.0, mini:2, maxi:3, sumd:6.0, avgd:3.0, variance:2.0, stddev:1.4142135623730951, missing:1, vals:1}] } } "
);

View File

@ -90,6 +90,9 @@ _(raw; not yet edited)_
This embedded zookeeper should not be used in production. If you rely
upon the previous behavior, then you can change the clientPortAddress
in solr/server/solr/zoo.cfg (Robert Muir)
* SOLR-11725: JSON aggregations now use the corrected sample formula to compute standard deviation and variance.
The computation of stdDev and variance in JSON aggregations is now the same as in StatsComponent. (hossman, Munendra S N, yonik)
=== Upgrade Prerequisites in Solr 9