mirror of https://github.com/apache/lucene.git
SOLR-11725: use corrected sample formula to calc stdDev in JSON facets
* Both stdDev and variance now use the corrected sample formula to compute their values. This matches the behavior of StatsComponent.
This commit is contained in:
parent
126e4a61b8
commit
e9d6c24fb7
|
@ -18,6 +18,9 @@ Improvements
|
|||
|
||||
* SOLR-14223: PKI Auth can bootstrap from existing key files instead of creating new keys on startup (Mike Drob)
|
||||
|
||||
* SOLR-11725: Use corrected sample formula for computing stdDev and variance in JSON aggregations
|
||||
(hossman, Munendra S N, yonik)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
* SOLR-10288: Remove non-minified JavaScript from the webapp. (Erik Hatcher, marcussorealheis)
|
||||
|
|
|
@ -43,8 +43,8 @@ public class AggUtil {
|
|||
* Computes and returns corrected standard deviation for given values
|
||||
*/
|
||||
public static double stdDev(double sumSq, double sum, long count) {
|
||||
// todo: should we return NAN when count==0?
|
||||
return count == 0 ? 0 : Math.sqrt(((count * sumSq) - (sum * sum)) / (count * (count - 1.0D)));
|
||||
// todo: should we return NAN when count==0 or count==1?
|
||||
return count <= 1 ? 0.0d : Math.sqrt(((count * sumSq) - (sum * sum)) / (count * (count - 1.0D)));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -59,7 +59,7 @@ public class AggUtil {
|
|||
* Computes and returns corrected variance for given values
|
||||
*/
|
||||
public static double variance(double sumSq, double sum, long count) {
|
||||
// todo: should we return NAN when count==0?
|
||||
return count == 0 ? 0 : ((count * sumSq) - (sum * sum)) / (count * (count - 1.0D));
|
||||
// todo: should we return NAN when count==0 or count==1?
|
||||
return count <= 1 ? 0.0d : ((count * sumSq) - (sum * sum)) / (count * (count - 1.0D));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -484,7 +484,7 @@ class VarianceSlotAcc extends DoubleFuncSlotAcc {
|
|||
}
|
||||
|
||||
private double variance(int slot) {
|
||||
return AggUtil.uncorrectedVariance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
return AggUtil.variance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -541,7 +541,7 @@ class StddevSlotAcc extends DoubleFuncSlotAcc {
|
|||
}
|
||||
|
||||
private double stdDev(int slot) {
|
||||
return AggUtil.uncorrectedStdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
return AggUtil.stdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -86,7 +86,7 @@ public class StddevAgg extends SimpleAggValueSource {
|
|||
|
||||
@Override
|
||||
protected double getDouble() {
|
||||
return AggUtil.uncorrectedStdDev(sumSq, sum, count);
|
||||
return AggUtil.stdDev(sumSq, sum, count);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -98,7 +98,7 @@ public class StddevAgg extends SimpleAggValueSource {
|
|||
|
||||
@Override
|
||||
protected double computeVal(int slot) {
|
||||
return AggUtil.uncorrectedStdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
return AggUtil.stdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -110,7 +110,7 @@ public class StddevAgg extends SimpleAggValueSource {
|
|||
|
||||
@Override
|
||||
protected double computeVal(int slot) {
|
||||
return AggUtil.uncorrectedStdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
return AggUtil.stdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -122,7 +122,7 @@ public class StddevAgg extends SimpleAggValueSource {
|
|||
|
||||
@Override
|
||||
protected double computeVal(int slot) {
|
||||
return AggUtil.uncorrectedStdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
return AggUtil.stdDev(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -85,7 +85,7 @@ public class VarianceAgg extends SimpleAggValueSource {
|
|||
|
||||
@Override
|
||||
protected double getDouble() {
|
||||
return AggUtil.uncorrectedVariance(sumSq, sum, count);
|
||||
return AggUtil.variance(sumSq, sum, count);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -97,7 +97,7 @@ public class VarianceAgg extends SimpleAggValueSource {
|
|||
|
||||
@Override
|
||||
protected double computeVal(int slot) {
|
||||
return AggUtil.uncorrectedVariance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
return AggUtil.variance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -109,7 +109,7 @@ public class VarianceAgg extends SimpleAggValueSource {
|
|||
|
||||
@Override
|
||||
protected double computeVal(int slot) {
|
||||
return AggUtil.uncorrectedVariance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
return AggUtil.variance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -121,7 +121,7 @@ public class VarianceAgg extends SimpleAggValueSource {
|
|||
|
||||
@Override
|
||||
protected double computeVal(int slot) {
|
||||
return AggUtil.uncorrectedVariance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
return AggUtil.variance(result[slot], sum[slot], counts[slot]); // calc once and cache in result?
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -314,7 +314,7 @@ public class StatsComponentTest extends SolrTestCaseJ4 {
|
|||
, json ? "//*" : "count(//arr[@name='distinctValues']/*)=8"
|
||||
, "//double[@name='sumOfSquares'][.='53101.0']"
|
||||
, "//double[@name='mean'][.='1.125']"
|
||||
,json ? "//*" : "//double[@name='stddev'][.='87.08852228787508']" // SOLR-11725
|
||||
,"//double[@name='stddev'][.='87.08852228787508']"
|
||||
);
|
||||
|
||||
assertQ("test statistics values w/fq",
|
||||
|
@ -329,8 +329,13 @@ public class StatsComponentTest extends SolrTestCaseJ4 {
|
|||
, json ? "//*" : "count(//arr[@name='distinctValues']/*)=6"
|
||||
, "//double[@name='sumOfSquares'][.='43001.0']"
|
||||
, "//double[@name='mean'][.='19.833333333333332']"
|
||||
, json ? "//*" : "//double[@name='stddev'][.='90.15634568163611']" // SOLR-11725
|
||||
,"//double[@name='stddev'][.='90.15634568163611']"
|
||||
);
|
||||
|
||||
assertQ("test stdDev",
|
||||
req(baseParams, "q", "id:5", "rows", "0")
|
||||
,"//double[@name='stddev'][.='0.0']"
|
||||
);
|
||||
|
||||
if (!json) { // checking stats.facet makes no sense for json faceting
|
||||
assertQ("test stats.facet (using boolean facet field)",
|
||||
|
|
|
@ -235,10 +235,8 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
|
|||
assertEquals(1.0D, bucket.get("percentile"));
|
||||
assertEquals(0.475247524752475D, (double) bucket.get("avg"), 0.1E-7);
|
||||
assertEquals(54.0D, (double) bucket.get("sumsq"), 0.1E-7);
|
||||
// assertEquals(0.55846323792D, (double) bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
|
||||
// assertEquals(0.3118811881D, (double) bucket.get("variance"), 0.1E-7); // TODO: SOLR-11725
|
||||
assertEquals(0.55569169111D, (double) bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
|
||||
assertEquals(0.3087932556D, (double) bucket.get("variance"), 0.1E-7); // json.facet is using the "uncorrected variance"
|
||||
assertEquals(0.55846323792D, (double) bucket.get("stddev"), 0.1E-7);
|
||||
assertEquals(0.3118811881D, (double) bucket.get("variance"), 0.1E-7);
|
||||
assertEquals(3L, bucket.get("unique"));
|
||||
assertEquals(3L, bucket.get("hll"));
|
||||
}
|
||||
|
@ -399,10 +397,8 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
|
|||
assertEquals(483.70000000000016D, (double)aaa0_Bucket.get("percentile"), 0.1E-7);
|
||||
assertEquals(115.5D, (double) aaa0_Bucket.get("avg"), 0.1E-7);
|
||||
assertEquals(1.674585E7D, (double) aaa0_Bucket.get("sumsq"), 0.1E-7);
|
||||
// assertEquals(206.4493184076D, (double) aaa0_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
|
||||
// assertEquals(42621.32107023412D, (double) aaa0_Bucket.get("variance"), 0.1E-7); // TODO: SOLR-11725
|
||||
assertEquals(206.1049489944D, (double) aaa0_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
|
||||
assertEquals(42479.25D, (double) aaa0_Bucket.get("variance"), 0.1E-7); // json.facet is using the "uncorrected variance"
|
||||
assertEquals(206.4493184076D, (double) aaa0_Bucket.get("stddev"), 0.1E-7);
|
||||
assertEquals(42621.32107023412D, (double) aaa0_Bucket.get("variance"), 0.1E-7);
|
||||
assertEquals(284L, aaa0_Bucket.get("unique"));
|
||||
assertEquals(284L, aaa0_Bucket.get("hll"));
|
||||
|
||||
|
@ -418,10 +414,8 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
|
|||
assertEquals(1980.0D, tail_Bucket.get("sum"));
|
||||
assertEquals(22.0D, (double) tail_Bucket.get("avg"), 0.1E-7);
|
||||
assertEquals(58740.0D, (double) tail_Bucket.get("sumsq"), 0.1E-7);
|
||||
// assertEquals(13.0599310011D, (double) tail_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
|
||||
// assertEquals(170.5617977535D, (double) tail_Bucket.get("variance"), 0.1E-7); // TODO: SOLR-11725
|
||||
assertEquals(12.9871731592D, (double) tail_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
|
||||
assertEquals(168.666666667D, (double) tail_Bucket.get("variance"), 0.1E-7); // json.facet is using the "uncorrected variance"
|
||||
assertEquals(13.0599310011D, (double) tail_Bucket.get("stddev"), 0.1E-7);
|
||||
assertEquals(170.5617977535D, (double) tail_Bucket.get("variance"), 0.1E-7);
|
||||
assertEquals(45L, tail_Bucket.get("unique"));
|
||||
assertEquals(45L, tail_Bucket.get("hll"));
|
||||
|
||||
|
@ -439,10 +433,8 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
|
|||
assertEquals(450.0D, tailB_Bucket.get("sum"));
|
||||
assertEquals(37.5D, (double) tailB_Bucket.get("avg"), 0.1E-7);
|
||||
assertEquals(16910.0D, (double) tailB_Bucket.get("sumsq"), 0.1E-7);
|
||||
// assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
|
||||
// assertEquals(3.1818181817D, (double) tailB_Bucket.get("variance"), 0.1E-7); // TODO: SOLR-11725
|
||||
assertEquals(1.70782513D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
|
||||
assertEquals(2.9166666747D, (double) tailB_Bucket.get("variance"), 0.1E-7); // json.facet is using the "uncorrected variance"
|
||||
assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7);
|
||||
assertEquals(3.1818181817D, (double) tailB_Bucket.get("variance"), 0.1E-7);
|
||||
assertEquals(6L, tailB_Bucket.get("unique"));
|
||||
assertEquals(6L, tailB_Bucket.get("hll"));
|
||||
|
||||
|
|
|
@ -1289,13 +1289,23 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
|||
", f2:{ 'buckets':[{ val:'B', count:3, x:11.0 }, { val:'A', count:2, x:4.0 }]} " +
|
||||
", f3:{ 'buckets':[{ val:'A', count:2, x:2 }, { val:'B', count:3, x:2 }]} " +
|
||||
", f4:{ 'buckets':[{ val:'A', count:2, x:2 }, { val:'B', count:3, x:2 }]} " +
|
||||
", f5:{ 'buckets':[{ val:'B', count:3, x:74.6666666666666 }, { val:'A', count:2, x:1.0 }]} " +
|
||||
", f5:{ 'buckets':[{ val:'B', count:3, x:112.0 }, { val:'A', count:2, x:2.0 }]} " +
|
||||
", f6:{ buckets:[{ val:-9.0, count:1, x:1 }]} " +
|
||||
", f7:{ buckets:[{ val:B, count:3, x:3 },{ val:A, count:2, x:0 }]} " +
|
||||
", f8:{ buckets:[{ val:A, count:2, x:2 },{ val:B, count:3, x:0 }]} " +
|
||||
"}"
|
||||
);
|
||||
|
||||
// test for stdDev and variance of size 1 and 0
|
||||
client.testJQ(params(p, "q", "id:1", "json.facet", "{n1:'stddev(${num_d})', n2: 'variance(${num_d})'}")
|
||||
, "facets=={ 'count':1, " +
|
||||
" n1:0.0, n2:0.0 }"
|
||||
);
|
||||
client.testJQ(params(p, "q", "id:3", "json.facet", "{n1:'stddev(${num_d})', n2: 'variance(${num_d})'}")
|
||||
, "facets=={ 'count':1, " +
|
||||
" n1:0.0, n2:0.0 }"
|
||||
);
|
||||
|
||||
// test sorting by stat with function
|
||||
client.testJQ(params(p, "q", "*:*"
|
||||
, "json.facet", "{f1:{terms:{${terms} field:'${cat_s}', sort:'n1 desc', facet:{n1:'avg(add(${num_d},${num_d}))'} }}" +
|
||||
|
@ -1808,7 +1818,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
|||
"sum1:3.0, sumsq1:247.0, avg1:0.6, avg2:0.5, mind:-9.0, maxd:11.0" +
|
||||
", numwhere:2, unique_num_i:4, unique_num_d:5, unique_date:5" +
|
||||
", where_hll:2, hll_num_i:4, hll_num_d:5, hll_date:5" +
|
||||
", med:2.0, perc:[-9.0,2.0,11.0], variance:49.04, stddev:7.002856560004639" +
|
||||
", med:2.0, perc:[-9.0,2.0,11.0], variance:61.3, stddev:7.829431652425353" +
|
||||
", mini:-5, maxi:7, missing:4, vals:2" +
|
||||
"}"
|
||||
);
|
||||
|
@ -1824,7 +1834,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
|||
, "facets=={ 'count':6, " +
|
||||
"sum1:0.0, sumsq1:51.5, avg1:0.0, mind:-5.0, maxd:3.0" +
|
||||
", mini:-5, maxi:3, mins:'a', maxs:'b'" +
|
||||
", stddev:2.537222891273055, variance:6.4375, median:0.0, perc:[-5.0,2.25,3.0]" +
|
||||
", stddev:2.712405363721075, variance:7.3571428571, median:0.0, perc:[-5.0,2.25,3.0]" +
|
||||
"}"
|
||||
);
|
||||
|
||||
|
@ -2314,8 +2324,8 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
|||
", sumd:'sum(${num_d})', avgd:'avg(${num_d})', variance:'variance(${num_d})', stddev:'stddev(${num_d})', missing:'missing(${multi_ss})', vals:'countvals(${multi_ss})'} }}"
|
||||
)
|
||||
, "facets=={ 'count':6, " +
|
||||
"'f1':{ buckets:[{val:B, count:3, h:2, u:2, mind:-9.0, maxd:11.0, mini:-5, maxi:7, sumd:-3.0, avgd:-1.0, variance:74.66666666666667, stddev:8.640987597877148, missing:0, vals:5}," +
|
||||
" {val:A, count:2, h:2, u:2, mind:2.0, maxd:4.0, mini:2, maxi:3, sumd:6.0, avgd:3.0, variance:1.0, stddev:1.0, missing:1, vals:1}] } } "
|
||||
"'f1':{ buckets:[{val:B, count:3, h:2, u:2, mind:-9.0, maxd:11.0, mini:-5, maxi:7, sumd:-3.0, avgd:-1.0, variance:112.0, stddev:10.583005244258363, missing:0, vals:5}," +
|
||||
" {val:A, count:2, h:2, u:2, mind:2.0, maxd:4.0, mini:2, maxi:3, sumd:6.0, avgd:3.0, variance:2.0, stddev:1.4142135623730951, missing:1, vals:1}] } } "
|
||||
|
||||
);
|
||||
|
||||
|
|
|
@ -90,6 +90,9 @@ _(raw; not yet edited)_
|
|||
This embedded zookeeper should not be used in production. If you rely
|
||||
upon the previous behavior, then you can change the clientPortAddress
|
||||
in solr/server/solr/zoo.cfg (Robert Muir)
|
||||
|
||||
* SOLR-11725: JSON aggregations now use the corrected sample formula to compute standard deviation and variance.
|
||||
The computation of stdDev and variance in JSON aggregations is now the same as in StatsComponent. (hossman, Munendra S N, yonik)
|
||||
|
||||
=== Upgrade Prerequisites in Solr 9
|
||||
|
||||
|
|
Loading…
Reference in New Issue