diff --git a/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc b/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc index fcb866cff26..b8ee0508618 100644 --- a/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc @@ -69,11 +69,70 @@ about `c * 8` bytes. The following chart shows how the error varies before and after the threshold: +//// +To generate this chart, use this gnuplot script: +------- +#!/usr/bin/gnuplot +reset +set terminal png size 1000,400 + +set xlabel "Actual cardinality" +set logscale x + +set ylabel "Relative error (%)" +set yrange [0:8] + +set title "Cardinality error" +set grid + +set style data lines + +plot "test.dat" using 1:2 title "threshold=100", \ +"" using 1:3 title "threshold=1000", \ +"" using 1:4 title "threshold=10000" +# +------- + +and generate the data in a 'test.dat' file using the Java code below: + +------- +private static double error(HyperLogLogPlusPlus h, long expected) { + double actual = h.cardinality(0); + return Math.abs(expected - actual) / expected; +} + +public static void main(String[] args) { + HyperLogLogPlusPlus h100 = new HyperLogLogPlusPlus(precisionFromThreshold(100), BigArrays.NON_RECYCLING_INSTANCE, 1); + HyperLogLogPlusPlus h1000 = new HyperLogLogPlusPlus(precisionFromThreshold(1000), BigArrays.NON_RECYCLING_INSTANCE, 1); + HyperLogLogPlusPlus h10000 = new HyperLogLogPlusPlus(precisionFromThreshold(10000), BigArrays.NON_RECYCLING_INSTANCE, 1); + + int next = 100; + int step = 10; + + for (int i = 1; i <= 10000000; ++i) { + long h = BitMixer.mix64(i); + h100.collect(0, h); + h1000.collect(0, h); + h10000.collect(0, h); + + if (i == next) { + System.out.println(i + " " + error(h100, i)*100 + " " + error(h1000, i)*100 + " " + error(h10000, i)*100); + next += step; + if (next >= 100 * step) { + step *= 10; + } + } + } +} +------- + +//// + image:images/cardinality_error.png[] For all 3 
thresholds, counts have been accurate up to the configured threshold (although not guaranteed, this is likely to be the case). Please also note that -even with a threshold as low as 100, the error remains under 5%, even when +even with a threshold as low as 100, the error remains very low, even when counting millions of items. ==== Pre-computed hashes diff --git a/docs/reference/images/cardinality_error.png b/docs/reference/images/cardinality_error.png index 1f871c2c674..cf405be69ab 100644 Binary files a/docs/reference/images/cardinality_error.png and b/docs/reference/images/cardinality_error.png differ