Aggregations: Add 'offset' option to histogram aggregation

Histogram aggregation supports an 'offset' option to shift bucket boundaries.
In a histogram with buckets of size X, the boundaries normally fall at
0, X, 2X, 3X, ...; an offset of Y moves them to Y, X+Y, 2X+Y, 3X+Y, ....
The previous 'pre_offset' and 'post_offset' options are removed in favour of
the simplified 'offset' option.
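For illustration, the option can be set both in the request JSON and through the Java API's histogram builder. The following is a minimal sketch using the builder; the index name "prices", the field name "price", and the surrounding example class are made up for this illustration:

[source,java]
--------------------------------------------------
import static org.elasticsearch.search.aggregations.AggregationBuilders.histogram;

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;

public class HistogramOffsetExample {

    // 'client' is assumed to be an already initialised Client instance.
    public static SearchResponse pricesInBucketsOfFive(Client client) {
        // With interval 5 and offset 2, bucket keys fall on 2, 7, 12, ...
        // instead of the default 0, 5, 10, ...
        return client.prepareSearch("prices")              // hypothetical index
                .addAggregation(histogram("price_histo")
                        .field("price")                     // hypothetical field
                        .interval(5)
                        .offset(2))
                .execute().actionGet();
    }
}
--------------------------------------------------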

Closes #9417
Closes #9505
Christoph Büscher 2015-02-02 17:46:08 +01:00
parent 6c27f1242a
commit 44193e7ba5
5 changed files with 129 additions and 61 deletions


@@ -6,7 +6,7 @@ It dynamically builds fixed size (a.k.a. interval) buckets over the values. For
that holds a price (numeric), we can configure this aggregation to dynamically build buckets with interval `5`
(in case of price it may represent $5). When the aggregation executes, the price field of every document will be
evaluated and will be rounded down to its closest bucket - for example, if the price is `32` and the bucket size is `5`
then the rounding will yield `30` and thus the document will "fall" into the bucket that is associated withe the key `30`.
then the rounding will yield `30` and thus the document will "fall" into the bucket that is associated with the key `30`.
To make this more formal, here is the rounding function that is used:
[source,java]
@@ -326,6 +326,15 @@ Here is an example of what the response could look like:
}
--------------------------------------------------
==== Offset
By default the bucket keys start with 0 and then continue in evenly spaced steps of `interval`, e.g. if the interval is 10 the first buckets
(assuming there is data inside them) will be [0-9], [10-19], [20-29]. The bucket boundaries can be shifted by using the `offset` option.
This is best illustrated with an example. If there are 10 documents with values ranging from 5 to 14, using interval `10` will result in
two buckets with 5 documents each. If an additional offset of `5` is used, there will be a single bucket [5-14] containing all 10
documents.
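A small standalone sketch of the bucket-key arithmetic this implies (assuming flooring division; the class and method names are made up for illustration) reproduces the example above: interval `10` alone splits the values 5..14 across the keys 0 and 10, while interval `10` with offset `5` puts them all into the single bucket keyed `5`.

[source,java]
--------------------------------------------------
public class OffsetBucketKeyExample {

    // Bucket key for a value: shift down by the offset, round down to a
    // multiple of the interval, then shift back up. Math.floorDiv (Java 8+)
    // keeps the rounding correct for values below the offset.
    static long bucketKey(long value, long interval, long offset) {
        return Math.floorDiv(value - offset, interval) * interval + offset;
    }

    public static void main(String[] args) {
        for (long value = 5; value <= 14; value++) {
            // offset 0: values 5..9 -> key 0, values 10..14 -> key 10
            // offset 5: values 5..14 -> key 5
            System.out.println(value + " -> offset 0: " + bucketKey(value, 10, 0)
                    + ", offset 5: " + bucketKey(value, 10, 5));
        }
    }
}
--------------------------------------------------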
==== Response Format
By default, the buckets are returned as an ordered array. It is also possible to request the response as a hash


@@ -35,8 +35,7 @@ public class HistogramBuilder extends ValuesSourceAggregationBuilder<HistogramBu
private Long minDocCount;
private Long extendedBoundsMin;
private Long extendedBoundsMax;
private Long preOffset;
private Long postOffset;
private Long offset;
/**
* Constructs a new histogram aggregation builder.
@@ -92,18 +91,10 @@ public class HistogramBuilder extends ValuesSourceAggregationBuilder<HistogramBu
}
/**
* Set the offset to apply prior to computing buckets.
* Set the offset to apply to shift bucket boundaries.
*/
public HistogramBuilder preOffset(long preOffset) {
this.preOffset = preOffset;
return this;
}
/**
* Set the offset to apply after having computed buckets.
*/
public HistogramBuilder postOffset(long postOffset) {
this.postOffset = postOffset;
public HistogramBuilder offset(long offset) {
this.offset = offset;
return this;
}
@@ -119,15 +110,10 @@ public class HistogramBuilder extends ValuesSourceAggregationBuilder<HistogramBu
order.toXContent(builder, params);
}
if (preOffset != null) {
builder.field("pre_offset", preOffset);
if (offset != null) {
builder.field("offset", offset);
}
if (postOffset != null) {
builder.field("post_offset", postOffset);
}
if (minDocCount != null) {
builder.field("min_doc_count", minDocCount);
}


@@ -56,8 +56,7 @@ public class HistogramParser implements Aggregator.Parser {
InternalOrder order = (InternalOrder) InternalOrder.KEY_ASC;
long interval = -1;
ExtendedBounds extendedBounds = null;
long preOffset = 0;
long postOffset = 0;
long offset = 0;
XContentParser.Token token;
String currentFieldName = null;
@@ -73,10 +72,8 @@ public class HistogramParser implements Aggregator.Parser {
minDocCount = parser.longValue();
} else if ("keyed".equals(currentFieldName)) {
keyed = parser.booleanValue();
} else if ("pre_offset".equals(currentFieldName) || "preOffset".equals(currentFieldName)) {
preOffset = parser.longValue();
} else if ("post_offset".equals(currentFieldName) || "postOffset".equals(currentFieldName)) {
postOffset = parser.longValue();
} else if ("offset".equals(currentFieldName)) {
offset = parser.longValue();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in aggregation [" + aggregationName + "]: [" + currentFieldName + "].");
}
@@ -121,10 +118,10 @@
if (interval < 0) {
throw new SearchParseException(context, "Missing required field [interval] for histogram aggregation [" + aggregationName + "]");
}
Rounding rounding = new Rounding.Interval(interval);
if (preOffset != 0 || postOffset != 0) {
rounding = new Rounding.PrePostRounding((Rounding.Interval) rounding, preOffset, postOffset);
if (offset != 0) {
rounding = new Rounding.PrePostRounding((Rounding.Interval) rounding, -offset, offset);
}
if (extendedBounds != null) {


@@ -20,25 +20,82 @@
package org.elasticsearch.common.rounding;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
public class RoundingTests extends ElasticsearchTestCase {
/**
* simple testcase to illustrate how Rounding.Interval works on readable input
*/
@Test
public void testInterval() {
int interval = 10;
Rounding.Interval rounding = new Rounding.Interval(interval);
int value = 24;
final long key = rounding.roundKey(24);
final long r = rounding.round(24);
String message = "round(" + value + ", interval=" + interval + ") = " + r;
assertEquals(value/interval, key);
assertEquals(value/interval * interval, r);
assertEquals(message, 0, r % interval);
}
@Test
public void testIntervalRandom() {
final long interval = randomIntBetween(1, 100);
Rounding.Interval rounding = new Rounding.Interval(interval);
for (int i = 0; i < 1000; ++i) {
long l = Math.max(randomLong(), Long.MIN_VALUE + interval);
final long key = rounding.roundKey(l);
final long r = rounding.round(l);
String message = "round(" + l + ", interval=" + interval + ") = " + r;
assertEquals(message, 0, r % interval);
assertThat(message, r, lessThanOrEqualTo(l));
assertThat(message, r + interval, greaterThan(l));
assertEquals(message, r, key*interval);
}
}
/**
* Simple testcase to illustrate how Rounding.PrePostRounding works on readable input.
* preOffset shifts the input value before rounding (so here 24 -> 31)
* postOffset shifts the rounded value after rounding (here 30 -> 35)
*/
@Test
public void testPrePostRounding() {
int interval = 10;
int value = 24;
int preOffset = 7;
int postOffset = 5;
Rounding.PrePostRounding rounding = new Rounding.PrePostRounding(new Rounding.Interval(interval), preOffset, postOffset);
final long key = rounding.roundKey(24);
final long roundedValue = rounding.round(24);
String message = "round(" + value + ", interval=" + interval + ") = " + roundedValue;
assertEquals(3, key);
assertEquals(35, roundedValue);
assertEquals(message, postOffset, roundedValue % interval);
}
@Test
public void testPrePostRoundingRandom() {
final long interval = randomIntBetween(1, 100);
Rounding.Interval internalRounding = new Rounding.Interval(interval);
final long preRounding = randomIntBetween(-100, 100);
final long postRounding = randomIntBetween(-100, 100);
Rounding.PrePostRounding prePost = new Rounding.PrePostRounding(new Rounding.Interval(interval), preRounding, postRounding);
long safetyMargin = Math.abs(interval) + Math.abs(preRounding) + Math.abs(postRounding); // to prevent range overflow / underflow
for (int i = 0; i < 1000; ++i) {
long l = Math.max(randomLong() - safetyMargin, Long.MIN_VALUE + safetyMargin);
final long key = prePost.roundKey(l);
final long r = prePost.round(l);
String message = "round(" + l + ", interval=" + interval + ") = "+ r;
assertEquals(message, internalRounding.round(l+preRounding), r - postRounding);
assertThat(message, r - postRounding, lessThanOrEqualTo(l + preRounding));
assertThat(message, r + interval - postRounding, greaterThan(l + preRounding));
assertEquals(message, r, key*interval + postRounding);
}
}
}


@@ -125,7 +125,6 @@ public class HistogramTests extends ElasticsearchIntegrationTest {
assertSearchResponse(response);
Histogram histo = response.getAggregations().get("histo");
assertThat(histo, notNullValue());
assertThat(histo.getName(), equalTo("histo"));
@@ -140,51 +139,71 @@ public class HistogramTests extends ElasticsearchIntegrationTest {
}
}
@Test
public void singleValuedField_withPreOffset() throws Exception {
long preOffsetMultiplier = randomIntBetween(2, 10);
SearchResponse response = client().prepareSearch("idx")
.addAggregation(histogram("histo").field(SINGLE_VALUED_FIELD_NAME).interval(interval).preOffset(preOffsetMultiplier * interval))
public void singleValuedField_withOffset() throws Exception {
int interval1 = 10;
int offset = 5;
SearchResponse response = client()
.prepareSearch("idx")
.addAggregation(histogram("histo").field(SINGLE_VALUED_FIELD_NAME).interval(interval1).offset(offset))
.execute().actionGet();
assertSearchResponse(response);
// from setup we have between 6 and 20 documents, one document per value from 1 to numDocs in the test field
int expectedNumberOfBuckets = (offset >= (numDocs % interval + 1)) ? numValueBuckets : numValueBuckets + 1;
Histogram histo = response.getAggregations().get("histo");
assertThat(histo, notNullValue());
assertThat(histo.getName(), equalTo("histo"));
List<? extends Bucket> buckets = histo.getBuckets();
assertThat(buckets.size(), equalTo(numValueBuckets));
assertThat(histo.getBuckets().size(), equalTo(expectedNumberOfBuckets));
for (int i = 0; i < numValueBuckets; ++i) {
Histogram.Bucket bucket = buckets.get(i);
assertThat(bucket, notNullValue());
assertThat(((Number) bucket.getKey()).longValue(), equalTo((long) (i + preOffsetMultiplier) * interval));
assertThat(bucket.getDocCount(), equalTo(valueCounts[i]));
}
// first bucket should start at -5, contain 4 documents
Histogram.Bucket bucket = histo.getBuckets().get(0);
assertThat(bucket, notNullValue());
assertThat(((Number) bucket.getKey()).longValue(), equalTo(-5L));
assertThat(bucket.getDocCount(), equalTo(4L));
// last bucket should have (numDocs % interval + 1) docs
bucket = histo.getBuckets().get(expectedNumberOfBuckets - 1);
assertThat(bucket, notNullValue());
assertThat(((Number) bucket.getKey()).longValue(), equalTo(numDocs%interval1 + 5L));
assertThat(bucket.getDocCount(), equalTo((numDocs % interval) + 1L));
}
/**
* Shift buckets by a random offset between [2..interval]. From setup we have 1 doc per value from 1..numDocs.
* Special care needs to be taken for expectations on counts in the first and last bucket.
*/
@Test
public void singleValuedField_withPostOffset() throws Exception {
long postOffsetMultiplier = randomIntBetween(2, 10);
SearchResponse response = client().prepareSearch("idx")
.addAggregation(histogram("histo").field(SINGLE_VALUED_FIELD_NAME).interval(interval).postOffset(postOffsetMultiplier * interval))
public void singleValuedField_withRandomOffset() throws Exception {
int offset = randomIntBetween(2, interval);
SearchResponse response = client()
.prepareSearch("idx")
.addAggregation(histogram("histo").field(SINGLE_VALUED_FIELD_NAME).interval(interval).offset(offset))
.execute().actionGet();
assertSearchResponse(response);
// shifting by offset >= 2 creates an extra first bucket covering [offset - interval, offset - 1]
// if offset is >= number of values in original last bucket, that effect is canceled
int expectedNumberOfBuckets = (offset >= (numDocs % interval + 1)) ? numValueBuckets : numValueBuckets + 1;
Histogram histo = response.getAggregations().get("histo");
assertThat(histo, notNullValue());
assertThat(histo.getName(), equalTo("histo"));
List<? extends Bucket> buckets = histo.getBuckets();
assertThat(buckets.size(), equalTo(numValueBuckets));
assertThat(histo.getBuckets().size(), equalTo(expectedNumberOfBuckets));
for (int i = 0; i < numValueBuckets; ++i) {
Histogram.Bucket bucket = buckets.get(i);
int docsCounted = 0;
for (int i = 0; i < expectedNumberOfBuckets; ++i) {
Histogram.Bucket bucket = histo.getBuckets().get(i);
assertThat(bucket, notNullValue());
assertThat(((Number) bucket.getKey()).longValue(), equalTo((long) (i + postOffsetMultiplier) * interval));
assertThat(bucket.getDocCount(), equalTo(valueCounts[i]));
assertThat(((Number) bucket.getKey()).longValue(), equalTo((long) ((i-1) * interval + offset)));
if (i==0) {
// first bucket
long expectedFirstBucketCount = offset-1;
assertThat(bucket.getDocCount(), equalTo(expectedFirstBucketCount));
docsCounted += expectedFirstBucketCount;
} else if(i<expectedNumberOfBuckets-1) {
assertThat(bucket.getDocCount(), equalTo((long) interval));
docsCounted += interval;
} else {
assertThat(bucket.getDocCount(), equalTo((long) numDocs - docsCounted));
}
}
}