Add the ability to use the `breadth_first` collection mode with nested aggregations (such as `top_hits`) that require access to score information.
The score is recomputed lazily for each document belonging to a top bucket. Relates to #9825
This commit is contained in:
parent 247b5c8430
commit 052191f2a2
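
As a rough illustration of the lazy recomputation described in the commit message, here is a self-contained sketch at the Lucene level. It is not the Elasticsearch implementation: the class name and the `sortedDocs` parameter are invented for the example, but the `createNormalizedWeight` / `Scorer#iterator` calls mirror the ones introduced in the diff below.

```java
import java.io.IOException;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;

class LazyScoreReplay {
    /**
     * Recompute scores for buffered doc IDs (ascending order) of one segment.
     * Rather than buffering a float per collected document, a fresh Scorer is
     * created on the second pass and advanced only to the surviving documents.
     */
    static float[] replayScores(IndexSearcher searcher, Query query,
                                LeafReaderContext segment, int[] sortedDocs) throws IOException {
        // needsScores = true so the returned Weight can produce scores
        Weight weight = searcher.createNormalizedWeight(query, true);
        Scorer scorer = weight.scorer(segment);
        DocIdSetIterator it = scorer.iterator();
        float[] scores = new float[sortedDocs.length];
        for (int i = 0; i < sortedDocs.length; i++) {
            if (it.docID() < sortedDocs[i]) {
                it.advance(sortedDocs[i]); // iterators only move forward
            }
            assert it.docID() == sortedDocs[i]; // buffered docs matched the query
            scores[i] = scorer.score(); // score computed only now, on demand
        }
        return scores;
    }
}
```

The point of the pattern is that no per-document float buffer is ever allocated; scores are produced only for the documents that survive bucket pruning.
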
@@ -638,31 +638,6 @@ public class Lucene {
         }
     }
 
-    /**
-     * Return a Scorer that throws an ElasticsearchIllegalStateException
-     * on all operations with the given message.
-     */
-    public static Scorer illegalScorer(final String message) {
-        return new Scorer(null) {
-            @Override
-            public float score() throws IOException {
-                throw new IllegalStateException(message);
-            }
-            @Override
-            public int freq() throws IOException {
-                throw new IllegalStateException(message);
-            }
-            @Override
-            public int docID() {
-                throw new IllegalStateException(message);
-            }
-            @Override
-            public DocIdSetIterator iterator() {
-                throw new IllegalStateException(message);
-            }
-        };
-    }
-
     private static final class CommitPoint extends IndexCommit {
 
         private String segmentsFileName;
         private final Collection<String> files;
@@ -165,7 +165,7 @@ public abstract class AggregatorBase extends Aggregator {
     public DeferringBucketCollector getDeferringCollector() {
         // Default impl is a collector that selects the best buckets
         // but an alternative defer policy may be based on best docs.
-        return new BestBucketsDeferringCollector();
+        return new BestBucketsDeferringCollector(context());
     }
 
     /**
@@ -20,6 +20,9 @@
 package org.elasticsearch.search.aggregations.bucket;
 
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
 import org.elasticsearch.common.lucene.Lucene;
@@ -30,6 +33,7 @@ import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
 import org.elasticsearch.search.aggregations.BucketCollector;
 import org.elasticsearch.search.aggregations.InternalAggregation;
 import org.elasticsearch.search.aggregations.LeafBucketCollector;
+import org.elasticsearch.search.aggregations.support.AggregationContext;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -56,6 +60,7 @@ public class BestBucketsDeferringCollector extends DeferringBucketCollector {
 
     final List<Entry> entries = new ArrayList<>();
     BucketCollector collector;
+    final AggregationContext aggContext;
     LeafReaderContext context;
     PackedLongValues.Builder docDeltas;
     PackedLongValues.Builder buckets;
@@ -64,7 +69,8 @@ public class BestBucketsDeferringCollector extends DeferringBucketCollector {
     LongHash selectedBuckets;
 
     /** Sole constructor. */
-    public BestBucketsDeferringCollector() {
+    public BestBucketsDeferringCollector(AggregationContext context) {
+        this.aggContext = context;
     }
 
     @Override
@@ -139,19 +145,34 @@ public class BestBucketsDeferringCollector extends DeferringBucketCollector {
         this.selectedBuckets = hash;
 
         collector.preCollection();
-        if (collector.needsScores()) {
-            throw new IllegalStateException("Cannot defer if scores are needed");
+        boolean needsScores = collector.needsScores();
+        Weight weight = null;
+        if (needsScores) {
+            weight = aggContext.searchContext().searcher()
+                    .createNormalizedWeight(aggContext.searchContext().query(), true);
         }
 
         for (Entry entry : entries) {
             final LeafBucketCollector leafCollector = collector.getLeafCollector(entry.context);
-            leafCollector.setScorer(Lucene.illegalScorer("A limitation of the " + SubAggCollectionMode.BREADTH_FIRST
-                    + " collection mode is that scores cannot be buffered along with document IDs"));
+            DocIdSetIterator docIt = null;
+            if (needsScores && entry.docDeltas.size() > 0) {
+                Scorer scorer = weight.scorer(entry.context);
+                // We don't need to check if the scorer is null
+                // since we are sure that there are documents to replay (entry.docDeltas is not empty).
+                docIt = scorer.iterator();
+                leafCollector.setScorer(scorer);
+            }
             final PackedLongValues.Iterator docDeltaIterator = entry.docDeltas.iterator();
             final PackedLongValues.Iterator buckets = entry.buckets.iterator();
             int doc = 0;
             for (long i = 0, end = entry.docDeltas.size(); i < end; ++i) {
                 doc += docDeltaIterator.next();
+                if (needsScores) {
+                    if (docIt.docID() < doc) {
+                        docIt.advance(doc);
+                    }
+                    // aggregations should only be replayed on matching documents
+                    assert docIt.docID() == doc;
+                }
                 final long bucket = buckets.next();
                 final long rebasedBucket = hash.find(bucket);
                 if (rebasedBucket != -1) {
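
For context on the `docDeltas` values that this replay loop consumes, the sketch below shows the delta-encoding round trip with Lucene's packed-ints API. The class name and sample doc IDs are invented for illustration; the replay loop intentionally mirrors the one in the hunk above.

```java
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;

class DocDeltaRoundTrip {
    public static void main(String[] args) {
        // Collection pass: buffer gaps between consecutive doc IDs; gaps are
        // small non-negative longs, so they pack into very few bits.
        PackedLongValues.Builder docDeltas = PackedLongValues.packedBuilder(PackedInts.DEFAULT);
        int prev = 0;
        for (int doc : new int[] {3, 7, 8, 42}) {
            docDeltas.add(doc - prev);
            prev = doc;
        }

        // Replay pass: rebuild absolute doc IDs by accumulating the deltas,
        // exactly as the loop in the hunk above does.
        PackedLongValues deltas = docDeltas.build();
        PackedLongValues.Iterator it = deltas.iterator();
        long doc = 0;
        for (long i = 0, end = deltas.size(); i < end; ++i) {
            doc += it.next();
            System.out.println("replay doc " + doc); // prints 3, 7, 8, 42
        }
    }
}
```
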
@@ -199,7 +199,6 @@ public abstract class TermsAggregator extends BucketsAggregator {
     @Override
     protected boolean shouldDefer(Aggregator aggregator) {
         return collectMode == SubAggCollectionMode.BREADTH_FIRST
-                && aggregator.needsScores() == false
                 && !aggsUsedForSorting.contains(aggregator);
     }
 
@@ -351,8 +351,7 @@ public class TopHitsIT extends ESIntegTestCase {
     }
 
 
-    public void testBreadthFirst() throws Exception {
-        // breadth_first will be ignored since we need scores
+    public void testBreadthFirstWithScoreNeeded() throws Exception {
         SearchResponse response = client().prepareSearch("idx").setTypes("type")
                 .addAggregation(terms("terms")
                         .executionHint(randomExecutionHint())
@@ -382,6 +381,38 @@ public class TopHitsIT extends ESIntegTestCase {
         }
     }
 
+    public void testBreadthFirstWithAggOrderAndScoreNeeded() throws Exception {
+        SearchResponse response = client().prepareSearch("idx").setTypes("type")
+                .addAggregation(terms("terms")
+                        .executionHint(randomExecutionHint())
+                        .collectMode(SubAggCollectionMode.BREADTH_FIRST)
+                        .field(TERMS_AGGS_FIELD)
+                        .order(Terms.Order.aggregation("max", false))
+                        .subAggregation(max("max").field(SORT_FIELD))
+                        .subAggregation(topHits("hits").size(3))
+                ).get();
+
+        assertSearchResponse(response);
+
+        Terms terms = response.getAggregations().get("terms");
+        assertThat(terms, notNullValue());
+        assertThat(terms.getName(), equalTo("terms"));
+        assertThat(terms.getBuckets().size(), equalTo(5));
+        int id = 4;
+        for (Terms.Bucket bucket : terms.getBuckets()) {
+            assertThat(bucket, notNullValue());
+            assertThat(key(bucket), equalTo("val" + id));
+            assertThat(bucket.getDocCount(), equalTo(10L));
+            TopHits topHits = bucket.getAggregations().get("hits");
+            SearchHits hits = topHits.getHits();
+            assertThat(hits.totalHits(), equalTo(10L));
+            assertThat(hits.getHits().length, equalTo(3));
+
+            assertThat(hits.getAt(0).sourceAsMap().size(), equalTo(4));
+            id--;
+        }
+    }
+
 public void testBasicsGetProperty() throws Exception {
         SearchResponse searchResponse = client().prepareSearch("idx").setQuery(matchAllQuery())
                 .addAggregation(global("global").subAggregation(topHits("hits"))).execute().actionGet();
@@ -164,7 +164,7 @@ public class ProfileTests extends ESTestCase {
         final LeafCollector leafCollector = profileCollector.getLeafCollector(reader.leaves().get(0));
         assertThat(profileCollector.getTime(), greaterThan(0L));
         long time = profileCollector.getTime();
-        leafCollector.setScorer(Lucene.illegalScorer("dummy scorer"));
+        leafCollector.setScorer(null);
         assertThat(profileCollector.getTime(), greaterThan(time));
         time = profileCollector.getTime();
         leafCollector.collect(0);
@@ -635,9 +635,8 @@ elasticsearch will always use the `depth_first` collect_mode unless explicitly i
 Note that the `order` parameter can still be used to refer to data from a child aggregation when using the `breadth_first` setting - the parent
 aggregation understands that this child aggregation will need to be called first before any of the other child aggregations.
 
-WARNING: It is not possible to nest aggregations such as `top_hits` which require access to match score information under an aggregation that uses
-the `breadth_first` collection mode. This is because this would require a RAM buffer to hold the float score value for every document and
-this would typically be too costly in terms of RAM.
+WARNING: Nested aggregations such as `top_hits` which require access to score information under an aggregation that uses the `breadth_first`
+collection mode need to replay the query on the second pass but only for the documents belonging to the top buckets.
 
 [[search-aggregations-bucket-terms-aggregation-execution-hint]]
 ==== Execution hint
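
With this change, combining `breadth_first` with a score-dependent sub-aggregation becomes a supported request. Below is a minimal sketch patterned on the new integration test, using the same Java client builders; the index name `idx` and field name `tag` are placeholders.

```java
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;

import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
import static org.elasticsearch.search.aggregations.AggregationBuilders.topHits;

class BreadthFirstTopHitsExample {
    static SearchResponse search(Client client) {
        // top_hits needs scores; before this commit the needsScores() check in
        // TermsAggregator.shouldDefer excluded it from breadth_first deferral.
        return client.prepareSearch("idx")
                .addAggregation(terms("terms")
                        .collectMode(SubAggCollectionMode.BREADTH_FIRST)
                        .field("tag")
                        .subAggregation(topHits("hits").size(3)))
                .get();
    }
}
```

An aggregation-based `order` can still be combined with this, as `testBreadthFirstWithAggOrderAndScoreNeeded` above demonstrates.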