Aggregations: Add 'offset' option to histogram aggregation

Histogram aggregation supports an 'offset' option to shift bucket boundaries.
In a histogram with buckets of size X, the boundaries normally fall at
0, X, 2X, 3X, ...; an offset of Y moves them to Y, X+Y, 2X+Y, 3X+Y, ....
The previous 'pre_offset' and 'post_offset' options are removed in favour of
the simplified 'offset' option.
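For illustration, the option can be set both in the request JSON and through the Java API's histogram builder. The following is a minimal sketch using the builder; the index name "prices", the field name "price", and the surrounding example class are made up for this illustration:

[source,java]
--------------------------------------------------
import static org.elasticsearch.search.aggregations.AggregationBuilders.histogram;

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;

public class HistogramOffsetExample {

    // 'client' is assumed to be an already initialised Client instance.
    public static SearchResponse pricesInBucketsOfFive(Client client) {
        // With interval 5 and offset 2, bucket keys fall on 2, 7, 12, ...
        // instead of the default 0, 5, 10, ...
        return client.prepareSearch("prices")              // hypothetical index
                .addAggregation(histogram("price_histo")
                        .field("price")                     // hypothetical field
                        .interval(5)
                        .offset(2))
                .execute().actionGet();
    }
}
--------------------------------------------------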

Closes #9417
Closes #9505
Christoph Büscher 2015-02-02 17:46:08 +01:00
parent 6c27f1242a
commit 44193e7ba5
5 changed files with 129 additions and 61 deletions


@@ -6,7 +6,7 @@ It dynamically builds fixed size (a.k.a. interval) buckets over the values. For
that holds a price (numeric), we can configure this aggregation to dynamically build buckets with interval `5`
(in case of price it may represent $5). When the aggregation executes, the price field of every document will be
evaluated and will be rounded down to its closest bucket - for example, if the price is `32` and the bucket size is `5`
then the rounding will yield `30` and thus the document will "fall" into the bucket that is associated withe the key `30`.
then the rounding will yield `30` and thus the document will "fall" into the bucket that is associated with the key `30`.
To make this more formal, here is the rounding function that is used:
[source,java]
@@ -326,6 +326,15 @@ Here is an example of what the response could look like:
}
--------------------------------------------------
==== Offset
By default the bucket keys start with 0 and then continue in evenly spaced steps of `interval`, e.g. if the interval is 10 the first buckets
(assuming there is data inside them) will be [0-9], [10-19], [20-29]. The bucket boundaries can be shifted by using the `offset` option.
This is best illustrated with an example. If there are 10 documents with values ranging from 5 to 14, using interval `10` will result in
two buckets with 5 documents each. If an additional offset of `5` is used, there will be a single bucket [5-14] containing all 10
documents.
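A small standalone sketch of the bucket-key arithmetic this implies (assuming flooring division; the class and method names are made up for illustration) reproduces the example above: interval `10` alone splits the values 5..14 across the keys 0 and 10, while interval `10` with offset `5` puts them all into the single bucket keyed `5`.

[source,java]
--------------------------------------------------
public class OffsetBucketKeyExample {

    // Bucket key for a value: shift down by the offset, round down to a
    // multiple of the interval, then shift back up. Math.floorDiv (Java 8+)
    // keeps the rounding correct for values below the offset.
    static long bucketKey(long value, long interval, long offset) {
        return Math.floorDiv(value - offset, interval) * interval + offset;
    }

    public static void main(String[] args) {
        for (long value = 5; value <= 14; value++) {
            // offset 0: values 5..9 -> key 0, values 10..14 -> key 10
            // offset 5: values 5..14 -> key 5
            System.out.println(value + " -> offset 0: " + bucketKey(value, 10, 0)
                    + ", offset 5: " + bucketKey(value, 10, 5));
        }
    }
}
--------------------------------------------------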
==== Response Format
By default, the buckets are returned as an ordered array. It is also possible to request the response as a hash


@@ -35,8 +35,7 @@ public class HistogramBuilder extends ValuesSourceAggregationBuilder<HistogramBu
private Long minDocCount;
private Long extendedBoundsMin;
private Long extendedBoundsMax;
private Long preOffset;
private Long postOffset;
private Long offset;
/**
* Constructs a new histogram aggregation builder.
@@ -92,18 +91,10 @@ public class HistogramBuilder extends ValuesSourceAggregationBuilder<HistogramBu
}
/**
* Set the offset to apply prior to computing buckets.
* Set the offset to apply to shift bucket boundaries.
*/
public HistogramBuilder preOffset(long preOffset) {
this.preOffset = preOffset;
return this;
}
/**
* Set the offset to apply after having computed buckets.
*/
public HistogramBuilder postOffset(long postOffset) {
this.postOffset = postOffset;
public HistogramBuilder offset(long offset) {
this.offset = offset;
return this;
}
@@ -119,15 +110,10 @@ public class HistogramBuilder extends ValuesSourceAggregationBuilder<HistogramBu
order.toXContent(builder, params);
}
if (preOffset != null) {
builder.field("pre_offset", preOffset);
if (offset != null) {
builder.field("offset", offset);
}
if (postOffset != null) {
builder.field("post_offset", postOffset);
}
if (minDocCount != null) {
builder.field("min_doc_count", minDocCount);
}


@@ -56,8 +56,7 @@ public class HistogramParser implements Aggregator.Parser {
InternalOrder order = (InternalOrder) InternalOrder.KEY_ASC;
long interval = -1;
ExtendedBounds extendedBounds = null;
long preOffset = 0;
long postOffset = 0;
long offset = 0;
XContentParser.Token token;
String currentFieldName = null;
@@ -73,10 +72,8 @@ public class HistogramParser implements Aggregator.Parser {
minDocCount = parser.longValue();
} else if ("keyed".equals(currentFieldName)) {
keyed = parser.booleanValue();
} else if ("pre_offset".equals(currentFieldName) || "preOffset".equals(currentFieldName)) {
preOffset = parser.longValue();
} else if ("post_offset".equals(currentFieldName) || "postOffset".equals(currentFieldName)) {
postOffset = parser.longValue();
} else if ("offset".equals(currentFieldName)) {
offset = parser.longValue();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in aggregation [" + aggregationName + "]: [" + currentFieldName + "].");
}
@@ -121,10 +118,10 @@
if (interval < 0) {
throw new SearchParseException(context, "Missing required field [interval] for histogram aggregation [" + aggregationName + "]");
}
Rounding rounding = new Rounding.Interval(interval);
if (preOffset != 0 || postOffset != 0) {
rounding = new Rounding.PrePostRounding((Rounding.Interval) rounding, preOffset, postOffset);
if (offset != 0) {
rounding = new Rounding.PrePostRounding((Rounding.Interval) rounding, -offset, offset);
}
if (extendedBounds != null) {


@@ -20,25 +20,82 @@
package org.elasticsearch.common.rounding;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
public class RoundingTests extends ElasticsearchTestCase {
/**
* simple testcase to illustrate how Rounding.Interval works on readable input
*/
@Test
public void testInterval() {
int interval = 10;
Rounding.Interval rounding = new Rounding.Interval(interval);
int value = 24;
final long key = rounding.roundKey(24);
final long r = rounding.round(24);
String message = "round(" + value + ", interval=" + interval + ") = " + r;
assertEquals(value/interval, key);
assertEquals(value/interval * interval, r);
assertEquals(message, 0, r % interval);
}
@Test
public void testIntervalRandom() {
final long interval = randomIntBetween(1, 100);
Rounding.Interval rounding = new Rounding.Interval(interval);
for (int i = 0; i < 1000; ++i) {
long l = Math.max(randomLong(), Long.MIN_VALUE + interval);
final long key = rounding.roundKey(l);
final long r = rounding.round(l);
String message = "round(" + l + ", interval=" + interval + ") = " + r;
assertEquals(message, 0, r % interval);
assertThat(message, r, lessThanOrEqualTo(l));
assertThat(message, r + interval, greaterThan(l));
assertEquals(message, r, key*interval);
}
}
/**
* Simple testcase to illustrate how Rounding.PrePostRounding works on readable input.
* preOffset shifts the input value before rounding (so here 24 -> 31)
* postOffset shifts the rounded value after rounding (here 30 -> 35)
*/
@Test
public void testPrePostRounding() {
int interval = 10;
int value = 24;
int preOffset = 7;
int postOffset = 5;
Rounding.PrePostRounding rounding = new Rounding.PrePostRounding(new Rounding.Interval(interval), preOffset, postOffset);
final long key = rounding.roundKey(24);
final long roundedValue = rounding.round(24);
String message = "round(" + value + ", interval=" + interval + ") = " + roundedValue;
assertEquals(3, key);
assertEquals(35, roundedValue);
assertEquals(message, postOffset, roundedValue % interval);
}
@Test
public void testPrePostRoundingRandom() {
final long interval = randomIntBetween(1, 100);
Rounding.Interval internalRounding = new Rounding.Interval(interval);
final long preRounding = randomIntBetween(-100, 100);
final long postRounding = randomIntBetween(-100, 100);
Rounding.PrePostRounding prePost = new Rounding.PrePostRounding(new Rounding.Interval(interval), preRounding, postRounding);
long safetyMargin = Math.abs(interval) + Math.abs(preRounding) + Math.abs(postRounding); // to prevent range overflow / underflow
for (int i = 0; i < 1000; ++i) {
long l = Math.max(randomLong() - safetyMargin, Long.MIN_VALUE + safetyMargin);
final long key = prePost.roundKey(l);
final long r = prePost.round(l);
String message = "round(" + l + ", interval=" + interval + ") = "+ r;
assertEquals(message, internalRounding.round(l+preRounding), r - postRounding);
assertThat(message, r - postRounding, lessThanOrEqualTo(l + preRounding));
assertThat(message, r + interval - postRounding, greaterThan(l + preRounding));
assertEquals(message, r, key*interval + postRounding);
}
}
}


@@ -125,7 +125,6 @@ public class HistogramTests extends ElasticsearchIntegrationTest {
assertSearchResponse(response);
Histogram histo = response.getAggregations().get("histo");
assertThat(histo, notNullValue());
assertThat(histo.getName(), equalTo("histo"));
@@ -140,51 +139,71 @@ public class HistogramTests extends ElasticsearchIntegrationTest {
}
}
@Test
public void singleValuedField_withPreOffset() throws Exception {
long preOffsetMultiplier = randomIntBetween(2, 10);
SearchResponse response = client().prepareSearch("idx")
.addAggregation(histogram("histo").field(SINGLE_VALUED_FIELD_NAME).interval(interval).preOffset(preOffsetMultiplier * interval))
public void singleValuedField_withOffset() throws Exception {
int interval1 = 10;
int offset = 5;
SearchResponse response = client()
.prepareSearch("idx")
.addAggregation(histogram("histo").field(SINGLE_VALUED_FIELD_NAME).interval(interval1).offset(offset))
.execute().actionGet();
assertSearchResponse(response);
// from setup we have between 6 and 20 documents, one document per value from 1 to numDocs in the test field
int expectedNumberOfBuckets = (offset >= (numDocs % interval + 1)) ? numValueBuckets : numValueBuckets + 1;
Histogram histo = response.getAggregations().get("histo");
assertThat(histo, notNullValue());
assertThat(histo.getName(), equalTo("histo"));
List<? extends Bucket> buckets = histo.getBuckets();
assertThat(buckets.size(), equalTo(numValueBuckets));
assertThat(histo.getBuckets().size(), equalTo(expectedNumberOfBuckets));
for (int i = 0; i < numValueBuckets; ++i) {
Histogram.Bucket bucket = buckets.get(i);
assertThat(bucket, notNullValue());
assertThat(((Number) bucket.getKey()).longValue(), equalTo((long) (i + preOffsetMultiplier) * interval));
assertThat(bucket.getDocCount(), equalTo(valueCounts[i]));
}
// first bucket should start at -5, contain 4 documents
Histogram.Bucket bucket = histo.getBuckets().get(0);
assertThat(bucket, notNullValue());
assertThat(((Number) bucket.getKey()).longValue(), equalTo(-5L));
assertThat(bucket.getDocCount(), equalTo(4L));
// last bucket should have (numDocs % interval + 1) docs
bucket = histo.getBuckets().get(expectedNumberOfBuckets - 1);
assertThat(bucket, notNullValue());
assertThat(((Number) bucket.getKey()).longValue(), equalTo(numDocs%interval1 + 5L));
assertThat(bucket.getDocCount(), equalTo((numDocs % interval) + 1L));
}
/**
* Shift buckets by a random offset between [2..interval]. From setup we have 1 doc per value from 1..numDocs.
* Special care needs to be taken for expectations on counts in the first and last bucket.
*/
@Test
public void singleValuedField_withPostOffset() throws Exception {
long postOffsetMultiplier = randomIntBetween(2, 10);
SearchResponse response = client().prepareSearch("idx")
.addAggregation(histogram("histo").field(SINGLE_VALUED_FIELD_NAME).interval(interval).postOffset(postOffsetMultiplier * interval))
public void singleValuedField_withRandomOffset() throws Exception {
int offset = randomIntBetween(2, interval);
SearchResponse response = client()
.prepareSearch("idx")
.addAggregation(histogram("histo").field(SINGLE_VALUED_FIELD_NAME).interval(interval).offset(offset))
.execute().actionGet();
assertSearchResponse(response);
// shifting by offset >= 2 creates an extra first bucket covering [offset - interval, offset - 1]
// if offset is >= number of values in original last bucket, that effect is canceled
int expectedNumberOfBuckets = (offset >= (numDocs % interval + 1)) ? numValueBuckets : numValueBuckets + 1;
Histogram histo = response.getAggregations().get("histo");
assertThat(histo, notNullValue());
assertThat(histo.getName(), equalTo("histo"));
List<? extends Bucket> buckets = histo.getBuckets();
assertThat(buckets.size(), equalTo(numValueBuckets));
assertThat(histo.getBuckets().size(), equalTo(expectedNumberOfBuckets));
for (int i = 0; i < numValueBuckets; ++i) {
Histogram.Bucket bucket = buckets.get(i);
int docsCounted = 0;
for (int i = 0; i < expectedNumberOfBuckets; ++i) {
Histogram.Bucket bucket = histo.getBuckets().get(i);
assertThat(bucket, notNullValue());
assertThat(((Number) bucket.getKey()).longValue(), equalTo((long) (i + postOffsetMultiplier) * interval));
assertThat(bucket.getDocCount(), equalTo(valueCounts[i]));
assertThat(((Number) bucket.getKey()).longValue(), equalTo((long) ((i-1) * interval + offset)));
if (i==0) {
// first bucket
long expectedFirstBucketCount = offset-1;
assertThat(bucket.getDocCount(), equalTo(expectedFirstBucketCount));
docsCounted += expectedFirstBucketCount;
} else if(i<expectedNumberOfBuckets-1) {
assertThat(bucket.getDocCount(), equalTo((long) interval));
docsCounted += interval;
} else {
assertThat(bucket.getDocCount(), equalTo((long) numDocs - docsCounted));
}
}
}