Aggregations: reduce histogram buckets on the fly using a priority queue.

This commit makes histogram reduction a bit cleaner by expecting buckets
returned from shards to be sorted by key and merging them on-the-fly on the
coordinating node using a priority queue.

Close #8797
Adrien Grand 2014-12-11 10:27:47 +01:00
parent 5059d6fe1c
commit 5694626f79
2 changed files with 117 additions and 65 deletions
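
The heart of the change is a classic k-way merge: each shard already returns its buckets sorted by key, so the coordinating node keeps one cursor per shard in a min-heap ordered on the cursor's current bucket key, pulls buckets in global key order, and reduces runs of equal keys together. Below is a minimal standalone sketch of that idea using java.util.PriorityQueue in place of the Lucene PriorityQueue the patch itself uses; Bucket, merge, and reduceSameKey are illustrative stand-ins, not the commit's classes (the commit's own IteratorAndCurrent appears in the second diff).

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;

public class KWayMergeSketch {

    // Illustrative stand-in for InternalHistogram.Bucket
    static class Bucket {
        final long key;
        final long docCount;
        Bucket(long key, long docCount) {
            this.key = key;
            this.docCount = docCount;
        }
    }

    // Per-shard cursor: an iterator over that shard's sorted buckets plus
    // the bucket it currently points at (mirrors the commit's IteratorAndCurrent)
    static class IteratorAndCurrent {
        final Iterator<Bucket> iterator;
        Bucket current;
        IteratorAndCurrent(Iterator<Bucket> iterator) {
            this.iterator = iterator;
            this.current = iterator.next(); // only non-empty lists are enqueued
        }
    }

    static List<Bucket> merge(List<List<Bucket>> shardBuckets) {
        // Min-heap ordered by each cursor's current key
        PriorityQueue<IteratorAndCurrent> pq = new PriorityQueue<>(
                Comparator.comparingLong((IteratorAndCurrent c) -> c.current.key));
        for (List<Bucket> buckets : shardBuckets) {
            if (!buckets.isEmpty()) {
                pq.add(new IteratorAndCurrent(buckets.iterator()));
            }
        }

        List<Bucket> reduced = new ArrayList<>();
        if (!pq.isEmpty()) {
            List<Bucket> currentBuckets = new ArrayList<>(); // same-key buffer
            long key = pq.peek().current.key;
            do {
                IteratorAndCurrent top = pq.poll();
                if (top.current.key != key) {
                    // the key changed: reduce what we buffered so far
                    reduced.add(reduceSameKey(currentBuckets));
                    currentBuckets.clear();
                    key = top.current.key;
                }
                currentBuckets.add(top.current);
                if (top.iterator.hasNext()) {
                    top.current = top.iterator.next();
                    pq.add(top); // re-insert; java.util has no updateTop()
                }
            } while (!pq.isEmpty());
            reduced.add(reduceSameKey(currentBuckets));
        }
        return reduced;
    }

    // Stand-in for Bucket#reduce: sum doc counts of buckets sharing a key
    static Bucket reduceSameKey(List<Bucket> buckets) {
        long total = 0;
        for (Bucket b : buckets) {
            total += b.docCount;
        }
        return new Bucket(buckets.get(0).key, total);
    }
}

Given shard lists with keys [1,3,5], [1,2,3], [2,6], the merge yields keys 1,2,3,5,6 with summed doc counts, which is exactly the key-ascending result the coordinating node needs before applying the requested order. The real implementation mutates the heap's head and calls updateTop() instead of poll-and-re-add; both are O(log n) per step, but Lucene's variant avoids the extra removal.
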

src/main/java/org/elasticsearch/search/aggregations/bucket/histogram/HistogramAggregator.java

@@ -118,7 +118,8 @@ public class HistogramAggregator extends BucketsAggregator {
             buckets.add(histogramFactory.createBucket(rounding.valueForKey(bucketOrds.get(i)), bucketDocCount(i), bucketAggregations(i), keyed, formatter));
         }
-        CollectionUtil.introSort(buckets, order.comparator());
+        // the contract of the histogram aggregation is that shards must return buckets ordered by key in ascending order
+        CollectionUtil.introSort(buckets, InternalOrder.KEY_ASC.comparator());
 
         // value source will be null for unmapped fields
         InternalHistogram.EmptyBucketInfo emptyBucketInfo = minDocCount == 0 ? new InternalHistogram.EmptyBucketInfo(rounding, buildEmptySubAggregations(), extendedBounds) : null;

src/main/java/org/elasticsearch/search/aggregations/bucket/histogram/InternalHistogram.java

@@ -20,7 +20,9 @@ package org.elasticsearch.search.aggregations.bucket.histogram;
 import com.carrotsearch.hppc.LongObjectOpenHashMap;
 import com.google.common.collect.Lists;
 import org.apache.lucene.util.CollectionUtil;
+import org.apache.lucene.util.PriorityQueue;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -38,6 +40,7 @@ import org.elasticsearch.search.aggregations.support.format.ValueFormatterStream
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 import java.util.ListIterator;
 import java.util.Map;
@@ -289,37 +292,71 @@ public class InternalHistogram<B extends InternalHistogram.Bucket> extends Inter
         return FACTORY;
     }
 
-    @Override
-    public InternalAggregation reduce(ReduceContext reduceContext) {
+    private static class IteratorAndCurrent<B> {
+
+        private final Iterator<B> iterator;
+        private B current;
+
+        IteratorAndCurrent(Iterator<B> iterator) {
+            this.iterator = iterator;
+            current = iterator.next();
+        }
+
+    }
+
+    private List<B> reduceBuckets(ReduceContext reduceContext) {
         List<InternalAggregation> aggregations = reduceContext.aggregations();
 
-        LongObjectPagedHashMap<List<B>> bucketsByKey = new LongObjectPagedHashMap<>(reduceContext.bigArrays());
+        final PriorityQueue<IteratorAndCurrent<B>> pq = new PriorityQueue<IteratorAndCurrent<B>>(aggregations.size()) {
+            @Override
+            protected boolean lessThan(IteratorAndCurrent<B> a, IteratorAndCurrent<B> b) {
+                return a.current.key < b.current.key;
+            }
+        };
         for (InternalAggregation aggregation : aggregations) {
             InternalHistogram<B> histogram = (InternalHistogram) aggregation;
-            for (B bucket : histogram.buckets) {
-                List<B> bucketList = bucketsByKey.get(bucket.key);
-                if (bucketList == null) {
-                    bucketList = new ArrayList<>(aggregations.size());
-                    bucketsByKey.put(bucket.key, bucketList);
-                }
-                bucketList.add(bucket);
+            if (histogram.buckets.isEmpty() == false) {
+                pq.add(new IteratorAndCurrent<>(histogram.buckets.iterator()));
             }
         }
 
-        List<B> reducedBuckets = new ArrayList<>((int) bucketsByKey.size());
-        for (LongObjectPagedHashMap.Cursor<List<B>> cursor : bucketsByKey) {
-            List<B> sameTermBuckets = cursor.value;
-            B bucket = sameTermBuckets.get(0).reduce(sameTermBuckets, reduceContext);
-            if (bucket.getDocCount() >= minDocCount) {
-                reducedBuckets.add(bucket);
-            }
-        }
-        bucketsByKey.close();
-
-        // adding empty buckets in needed
-        if (minDocCount == 0) {
-            CollectionUtil.introSort(reducedBuckets, order.asc ? InternalOrder.KEY_ASC.comparator() : InternalOrder.KEY_DESC.comparator());
-            List<B> list = order.asc ? reducedBuckets : Lists.reverse(reducedBuckets);
+        List<B> reducedBuckets = new ArrayList<>();
+        if (pq.size() > 0) {
+            // list of buckets coming from different shards that have the same key
+            List<B> currentBuckets = new ArrayList<>();
+            long key = pq.top().current.key;
+
+            do {
+                final IteratorAndCurrent<B> top = pq.top();
+
+                if (top.current.key != key) {
+                    // the key changes, reduce what we already buffered and reset the buffer for current buckets
+                    reducedBuckets.add(currentBuckets.get(0).reduce(currentBuckets, reduceContext));
+                    currentBuckets.clear();
+                    key = top.current.key;
+                }
+
+                currentBuckets.add(top.current);
+
+                if (top.iterator.hasNext()) {
+                    final B next = top.iterator.next();
+                    assert next.key > top.current.key : "shards must return data sorted by key";
+                    top.current = next;
+                    pq.updateTop();
+                } else {
+                    pq.pop();
+                }
+            } while (pq.size() > 0);
+
+            if (currentBuckets.isEmpty() == false) {
+                reducedBuckets.add(currentBuckets.get(0).reduce(currentBuckets, reduceContext));
+            }
+        }
+
+        return reducedBuckets;
+    }
+
+    private void addEmptyBuckets(List<B> list) {
         B lastBucket = null;
         ExtendedBounds bounds = emptyBucketInfo.bounds;
         ListIterator<B> iter = list.listIterator();
@@ -373,12 +410,26 @@ public class InternalHistogram<B extends InternalHistogram.Bucket> extends Inter
                 key = emptyBucketInfo.rounding.nextRoundingValue(key);
             }
         }
+    }
 
-        if (order != InternalOrder.KEY_ASC && order != InternalOrder.KEY_DESC) {
-            CollectionUtil.introSort(reducedBuckets, order.comparator());
-        }
+    @Override
+    public InternalAggregation reduce(ReduceContext reduceContext) {
+        List<B> reducedBuckets = reduceBuckets(reduceContext);
+
+        // adding empty buckets if needed
+        if (minDocCount == 0) {
+            addEmptyBuckets(reducedBuckets);
+        }
+
+        if (order == InternalOrder.KEY_ASC) {
+            // nothing to do, data are already sorted since shards return
+            // sorted buckets and the merge-sort performed by reduceBuckets
+            // maintains order
+        } else if (order == InternalOrder.KEY_DESC) {
+            // we just need to reverse here...
+            reducedBuckets = Lists.reverse(reducedBuckets);
         } else {
+            // sorted by sub-aggregation, need to fall back to a costly n*log(n) sort
             CollectionUtil.introSort(reducedBuckets, order.comparator());
         }
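
A note on the queue type used above: org.apache.lucene.util.PriorityQueue is not java.util.PriorityQueue. It is an abstract, fixed-capacity heap ordered by a lessThan method the subclass supplies, and its updateTop() re-heapifies in place after the head element is mutated, which is what lets reduceBuckets advance the winning iterator without a pop-and-re-add. A minimal sketch of that usage pattern (assuming Lucene 4.x on the classpath; everything except the PriorityQueue API itself is illustrative):

import org.apache.lucene.util.PriorityQueue;

public class LucenePqSketch {

    // Illustrative cursor holding a mutable sort key
    static final class Cursor {
        long currentKey;
        Cursor(long key) {
            this.currentKey = key;
        }
    }

    public static void main(String[] args) {
        // Capacity is fixed at construction; in the patch it is the number
        // of shard responses. lessThan() makes this a min-heap on currentKey.
        PriorityQueue<Cursor> pq = new PriorityQueue<Cursor>(3) {
            @Override
            protected boolean lessThan(Cursor a, Cursor b) {
                return a.currentKey < b.currentKey;
            }
        };

        pq.add(new Cursor(42));
        pq.add(new Cursor(10));

        pq.top().currentKey = 99; // mutate the head in place...
        pq.updateTop();           // ...then restore heap order in O(log n)

        Cursor smallest = pq.pop(); // removes and returns the min element
        System.out.println(smallest.currentKey); // prints 42
    }
}
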