mirror of https://github.com/apache/lucene.git
SOLR-10548: SOLR-10552: numBuckets should use hll and ignore mincount>1 filtering
This commit is contained in:
parent
114a65b331
commit
71ce0d31a6
|
@ -139,6 +139,10 @@ Upgrade Notes
|
||||||
|
|
||||||
* Solr contribs map-reduce, morphlines-core and morphlines-cell have been removed.
|
* Solr contribs map-reduce, morphlines-core and morphlines-cell have been removed.
|
||||||
|
|
||||||
|
* JSON Facet API now uses hyper-log-log for numBuckets cardinality calculation and
|
||||||
|
calculates cardinality before filtering buckets by any mincount greater than 1.
|
||||||
|
|
||||||
|
|
||||||
Detailed Change List
|
Detailed Change List
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
@ -192,6 +196,9 @@ Optimizations
|
||||||
* SOLR-9217: Reduced heap consumption for filter({!join ... score=...})
|
* SOLR-9217: Reduced heap consumption for filter({!join ... score=...})
|
||||||
(Andrey Kudryavtsev, Gopikannan Venugopalsamy via Mikhail Khludnev)
|
(Andrey Kudryavtsev, Gopikannan Venugopalsamy via Mikhail Khludnev)
|
||||||
|
|
||||||
|
* SOLR-10548: JSON Facet API now uses hyper-log-log++ for determining the number of buckets
|
||||||
|
when merging requests from a multi-shard distributed request. (yonik)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
----------------------
|
----------------------
|
||||||
* SOLR-10281: ADMIN_PATHS is duplicated in two places and inconsistent. This can cause automatic
|
* SOLR-10281: ADMIN_PATHS is duplicated in two places and inconsistent. This can cause automatic
|
||||||
|
@ -230,6 +237,11 @@ Bug Fixes
|
||||||
|
|
||||||
* SOLR-10493: Investigate SolrCloudExampleTest failures. (Erick Erickson)
|
* SOLR-10493: Investigate SolrCloudExampleTest failures. (Erick Erickson)
|
||||||
|
|
||||||
|
* SOLR-10552: JSON Facet API numBuckets was not consistent between distributed and non-distributed requests
|
||||||
|
when there was a mincount > 1. This has been corrected by changing numBuckets cardinality processing to
|
||||||
|
ignore mincount > 1 for non-distributed requests. (yonik)
|
||||||
|
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -82,7 +82,7 @@ public class FacetFieldMerger extends FacetRequestSortedMerger<FacetField> {
|
||||||
Object nb = facetResult.get("numBuckets");
|
Object nb = facetResult.get("numBuckets");
|
||||||
if (nb != null) {
|
if (nb != null) {
|
||||||
if (numBuckets == null) {
|
if (numBuckets == null) {
|
||||||
numBuckets = new FacetNumBucketsMerger();
|
numBuckets = new HLLAgg("hll_merger").createFacetMerger(nb);
|
||||||
}
|
}
|
||||||
numBuckets.merge(nb , mcontext);
|
numBuckets.merge(nb , mcontext);
|
||||||
}
|
}
|
||||||
|
@ -98,17 +98,7 @@ public class FacetFieldMerger extends FacetRequestSortedMerger<FacetField> {
|
||||||
SimpleOrderedMap result = new SimpleOrderedMap();
|
SimpleOrderedMap result = new SimpleOrderedMap();
|
||||||
|
|
||||||
if (numBuckets != null) {
|
if (numBuckets != null) {
|
||||||
int removed = 0;
|
result.add("numBuckets", ((Number)numBuckets.getMergedResult()).longValue());
|
||||||
if (freq.mincount > 1) {
|
|
||||||
for (FacetBucket bucket : buckets.values()) {
|
|
||||||
if (bucket.count < freq.mincount) removed++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
result.add("numBuckets", ((Number)numBuckets.getMergedResult()).longValue() - removed);
|
|
||||||
|
|
||||||
// TODO: we can further increase this estimate.
|
|
||||||
// If not sorting by count, use a simple ratio to scale
|
|
||||||
// If sorting by count desc, then add up the highest_possible_missing_count from each shard
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sortBuckets();
|
sortBuckets();
|
||||||
|
|
|
@ -210,10 +210,6 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
||||||
IntFunction<Comparable> bucketValFromSlotNumFunc,
|
IntFunction<Comparable> bucketValFromSlotNumFunc,
|
||||||
Function<Comparable, String> fieldQueryValFunc) throws IOException {
|
Function<Comparable, String> fieldQueryValFunc) throws IOException {
|
||||||
int numBuckets = 0;
|
int numBuckets = 0;
|
||||||
List<Object> bucketVals = null;
|
|
||||||
if (freq.numBuckets && fcontext.isShard()) {
|
|
||||||
bucketVals = new ArrayList<>(100);
|
|
||||||
}
|
|
||||||
|
|
||||||
final int off = fcontext.isShard() ? 0 : (int) freq.offset;
|
final int off = fcontext.isShard() ? 0 : (int) freq.offset;
|
||||||
|
|
||||||
|
@ -257,16 +253,18 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
||||||
Slot bottom = null;
|
Slot bottom = null;
|
||||||
Slot scratchSlot = new Slot();
|
Slot scratchSlot = new Slot();
|
||||||
for (int slotNum = 0; slotNum < numSlots; slotNum++) {
|
for (int slotNum = 0; slotNum < numSlots; slotNum++) {
|
||||||
// screen out buckets not matching mincount immediately (i.e. don't even increment numBuckets)
|
|
||||||
if (effectiveMincount > 0 && countAcc.getCount(slotNum) < effectiveMincount) {
|
// screen out buckets not matching mincount
|
||||||
continue;
|
if (effectiveMincount > 0) {
|
||||||
|
int count = countAcc.getCount(slotNum);
|
||||||
|
if (count < effectiveMincount) {
|
||||||
|
if (count > 0)
|
||||||
|
numBuckets++; // Still increment numBuckets as long as we have some count. This is for consistency between distrib and non-distrib mode.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
numBuckets++;
|
numBuckets++;
|
||||||
if (bucketVals != null && bucketVals.size()<100) {
|
|
||||||
Object val = bucketValFromSlotNumFunc.apply(slotNum);
|
|
||||||
bucketVals.add(val);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (bottom != null) {
|
if (bottom != null) {
|
||||||
scratchSlot.slot = slotNum; // scratchSlot is only used to hold this slotNum for the following line
|
scratchSlot.slot = slotNum; // scratchSlot is only used to hold this slotNum for the following line
|
||||||
|
@ -292,10 +290,17 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
||||||
if (!fcontext.isShard()) {
|
if (!fcontext.isShard()) {
|
||||||
res.add("numBuckets", numBuckets);
|
res.add("numBuckets", numBuckets);
|
||||||
} else {
|
} else {
|
||||||
SimpleOrderedMap<Object> map = new SimpleOrderedMap<>(2);
|
DocSet domain = fcontext.base;
|
||||||
map.add("numBuckets", numBuckets);
|
if (freq.prefix != null) {
|
||||||
map.add("vals", bucketVals);
|
Query prefixFilter = sf.getType().getPrefixQuery(null, sf, freq.prefix);
|
||||||
res.add("numBuckets", map);
|
domain = fcontext.searcher.getDocSet(prefixFilter, domain);
|
||||||
|
}
|
||||||
|
|
||||||
|
HLLAgg agg = new HLLAgg(freq.field);
|
||||||
|
SlotAcc acc = agg.createSlotAcc(fcontext, domain.size(), 1);
|
||||||
|
acc.collect(domain, 0);
|
||||||
|
acc.key = "numBuckets";
|
||||||
|
acc.setValues(res, 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -522,7 +527,6 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
||||||
"all",
|
"all",
|
||||||
{"cat3":{"_l":["A"]}}]]},
|
{"cat3":{"_l":["A"]}}]]},
|
||||||
"cat1":{"_l":["A"]}}}
|
"cat1":{"_l":["A"]}}}
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static <T> List<T> asList(Object list) {
|
static <T> List<T> asList(Object list) {
|
||||||
|
|
|
@ -770,12 +770,12 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
||||||
"'f1':{ numBuckets:1, buckets:[{val:B, count:3}]} } "
|
"'f1':{ numBuckets:1, buckets:[{val:B, count:3}]} } "
|
||||||
);
|
);
|
||||||
|
|
||||||
// mincount should lower numBuckets
|
// mincount should not lower numBuckets (since SOLR-10552)
|
||||||
client.testJQ(params(p, "q", "*:*", "rows", "0", "facet", "true"
|
client.testJQ(params(p, "q", "*:*", "rows", "0", "facet", "true"
|
||||||
, "json.facet", "{f1:{terms:{${terms} field:${cat_s}, numBuckets:true, mincount:3}}}"
|
, "json.facet", "{f1:{terms:{${terms} field:${cat_s}, numBuckets:true, mincount:3}}}"
|
||||||
)
|
)
|
||||||
, "facets=={ 'count':6, " +
|
, "facets=={ 'count':6, " +
|
||||||
"'f1':{ numBuckets:1, buckets:[{val:B, count:3}]} } "
|
"'f1':{ numBuckets:2, buckets:[{val:B, count:3}]} } "
|
||||||
);
|
);
|
||||||
|
|
||||||
// basic range facet
|
// basic range facet
|
||||||
|
@ -1136,7 +1136,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
||||||
",f3:{${terms} type:field, field:${num_i}, sort:'index asc' }" +
|
",f3:{${terms} type:field, field:${num_i}, sort:'index asc' }" +
|
||||||
",f4:{${terms} type:field, field:${num_i}, sort:'index desc' }" +
|
",f4:{${terms} type:field, field:${num_i}, sort:'index desc' }" +
|
||||||
",f5:{${terms} type:field, field:${num_i}, sort:'index desc', limit:1, missing:true, allBuckets:true, numBuckets:true }" +
|
",f5:{${terms} type:field, field:${num_i}, sort:'index desc', limit:1, missing:true, allBuckets:true, numBuckets:true }" +
|
||||||
",f6:{${terms} type:field, field:${num_i}, sort:'index desc', mincount:2, numBuckets:true }" + // mincount should lower numbuckets
|
",f6:{${terms} type:field, field:${num_i}, sort:'index desc', mincount:2, numBuckets:true }" + // mincount should not lower numbuckets (since SOLR-10552)
|
||||||
",f7:{${terms} type:field, field:${num_i}, sort:'index desc', offset:2, numBuckets:true }" + // test offset
|
",f7:{${terms} type:field, field:${num_i}, sort:'index desc', offset:2, numBuckets:true }" + // test offset
|
||||||
",f8:{${terms} type:field, field:${num_i}, sort:'index desc', offset:100, numBuckets:true }" + // test high offset
|
",f8:{${terms} type:field, field:${num_i}, sort:'index desc', offset:100, numBuckets:true }" + // test high offset
|
||||||
",f9:{${terms} type:field, field:${num_i}, sort:'x desc', facet:{x:'avg(${num_d})'}, missing:true, allBuckets:true, numBuckets:true }" + // test stats
|
",f9:{${terms} type:field, field:${num_i}, sort:'x desc', facet:{x:'avg(${num_d})'}, missing:true, allBuckets:true, numBuckets:true }" + // test stats
|
||||||
|
@ -1150,7 +1150,7 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
||||||
",f3:{ buckets:[{val:-5,count:2},{val:2,count:1},{val:3,count:1},{val:7,count:1} ] } " +
|
",f3:{ buckets:[{val:-5,count:2},{val:2,count:1},{val:3,count:1},{val:7,count:1} ] } " +
|
||||||
",f4:{ buckets:[{val:7,count:1},{val:3,count:1},{val:2,count:1},{val:-5,count:2} ] } " +
|
",f4:{ buckets:[{val:7,count:1},{val:3,count:1},{val:2,count:1},{val:-5,count:2} ] } " +
|
||||||
",f5:{ buckets:[{val:7,count:1}] , numBuckets:4, allBuckets:{count:5}, missing:{count:1} } " +
|
",f5:{ buckets:[{val:7,count:1}] , numBuckets:4, allBuckets:{count:5}, missing:{count:1} } " +
|
||||||
",f6:{ buckets:[{val:-5,count:2}] , numBuckets:1 } " +
|
",f6:{ buckets:[{val:-5,count:2}] , numBuckets:4 } " +
|
||||||
",f7:{ buckets:[{val:2,count:1},{val:-5,count:2}] , numBuckets:4 } " +
|
",f7:{ buckets:[{val:2,count:1},{val:-5,count:2}] , numBuckets:4 } " +
|
||||||
",f8:{ buckets:[] , numBuckets:4 } " +
|
",f8:{ buckets:[] , numBuckets:4 } " +
|
||||||
",f9:{ buckets:[{val:7,count:1,x:11.0},{val:2,count:1,x:4.0},{val:3,count:1,x:2.0},{val:-5,count:2,x:-7.0} ], numBuckets:4, allBuckets:{count:5,x:0.6},missing:{count:1,x:0.0} } " + // TODO: should missing exclude "x" because no values were collected?
|
",f9:{ buckets:[{val:7,count:1,x:11.0},{val:2,count:1,x:4.0},{val:3,count:1,x:2.0},{val:-5,count:2,x:-7.0} ], numBuckets:4, allBuckets:{count:5,x:0.6},missing:{count:1,x:0.0} } " + // TODO: should missing exclude "x" because no values were collected?
|
||||||
|
|
Loading…
Reference in New Issue