diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index b036fe1b587..7c493239c0c 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -233,6 +233,10 @@ Bug Fixes * SOLR-7478: UpdateLog#close shuts down it's executor with interrupts before running it's close logic, possibly preventing a clean close. (Mark Miller) +* SOLR-7494: Facet Module - unique() facet function was wildly inaccurate for high cardinality + fields. (Andy Crossen, yonik) + + Optimizations ---------------------- diff --git a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java index f8816ea308f..69006610dde 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java +++ b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java @@ -386,17 +386,19 @@ abstract class UniqueSlotAcc extends SlotAcc { int maxExplicit=100; // TODO: make configurable // TODO: share values across buckets - if (unique <= maxExplicit) { + if (unique > 0) { + List lst = new ArrayList( Math.min(unique, maxExplicit) ); - if (ords != null) { - for (int ord=-1;;) { - if (++ord >= unique) break; + long maxOrd = ords.length(); + if (ords != null && ords.length() > 0) { + for (int ord=0; lst.size() < maxExplicit;) { ord = ords.nextSetBit(ord); if (ord == DocIdSetIterator.NO_MORE_DOCS) break; BytesRef val = lookupOrd(ord); Object o = field.getType().toObject(field, val); lst.add(o); + if (++ord >= maxOrd) break; } } @@ -555,43 +557,6 @@ class UniqueMultivaluedSlotAcc extends UniqueSlotAcc implements UnInvertedField. nTerms = uif.numTerms(); } - @Override - public Object getShardValue(int slot) throws IOException { - FixedBitSet ords = arr[slot]; - int unique; - if (counts != null) { - unique = counts[slot]; - } else { - unique = ords == null ? 0 : ords.cardinality(); - } - - SimpleOrderedMap map = new SimpleOrderedMap(); - map.add("unique", unique); - map.add("nTerms", nTerms); - - int maxExplicit=100; - // TODO: make configurable - // TODO: share values across buckets - if (unique <= maxExplicit) { - List lst = new ArrayList( Math.min(unique, maxExplicit) ); - - if (ords != null) { - for (int ord=-1;;) { - if (++ord >= unique) break; - ord = ords.nextSetBit(ord); - if (ord == DocIdSetIterator.NO_MORE_DOCS) break; - BytesRef val = docToTerm.lookupOrd(ord); - Object o = field.getType().toObject(field, val); - lst.add(o); - } - } - - map.add("vals", lst); - } - - return map; - } - @Override protected BytesRef lookupOrd(int ord) throws IOException { return docToTerm.lookupOrd(ord); diff --git a/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java b/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java index 131db3d53fe..efe4af723f5 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java +++ b/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java @@ -67,9 +67,10 @@ public class UniqueAgg extends StrAggValueSource { } private static class Merger extends FacetSortableMerger { + long answer = -1; long sumUnique; Set values; - int shardsMissing; + long sumAdded; long shardsMissingSum; long shardsMissingMax; @@ -79,24 +80,35 @@ public class UniqueAgg extends StrAggValueSource { long unique = ((Number)map.get("unique")).longValue(); sumUnique += unique; - List vals = (List)map.get("vals"); + int valsListed = 0; + List vals = (List) map.get("vals"); if (vals != null) { if (values == null) { values = new HashSet<>(vals.size()*4); } values.addAll(vals); - } else { - shardsMissing++; - shardsMissingSum += unique; - shardsMissingMax = Math.max(shardsMissingMax, unique); + valsListed = vals.size(); + sumAdded += valsListed; } + shardsMissingSum += unique - valsListed; + shardsMissingMax = Math.max(shardsMissingMax, unique - valsListed); // TODO: somehow get & use the count in the bucket? } private long getLong() { - long exactCount = values == null ? 0 : values.size(); - return exactCount + shardsMissingSum; + if (answer >= 0) return answer; + answer = values == null ? 0 : values.size(); + if (answer == 0) { + // either a real "0", or no values returned from shards + answer = shardsMissingSum; + return answer; + } + + double factor = ((double)values.size()) / sumAdded; // what fraction of listed values were unique + long estimate = (long)(shardsMissingSum * factor); + answer = values.size() + estimate; + return answer; } @Override diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java index bfb8a255fa2..2ae4d63598e 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java @@ -886,16 +886,76 @@ public class TestJsonFacets extends SolrTestCaseHS { } - @Test public void testDistrib() throws Exception { initServers(); - Client client = servers.getClient( random().nextInt() ); + Client client = servers.getClient(random().nextInt()); client.queryDefaults().set( "shards", servers.getShards() ); doStats( client, params() ); } + @Test + public void testBigger() throws Exception { + ModifiableSolrParams p = params("rows", "0", "cat_s", "cat_ss", "where_s", "where_ss"); + // doBigger(Client.localClient, p); + + initServers(); + Client client = servers.getClient(random().nextInt()); + client.queryDefaults().set( "shards", servers.getShards() ); + doBigger( client, p ); + } + + public void doBigger(Client client, ModifiableSolrParams p) throws Exception { + MacroExpander m = new MacroExpander(p.getMap()); + + String cat_s = m.expand("${cat_s}"); + String where_s = m.expand("${where_s}"); + + client.deleteByQuery("*:*", null); + + Random r = new Random(0); // make deterministic + int numCat = 1; + int numWhere = 2000000000; + int commitPercent = 10; + int ndocs=1000; + + Map>> model = new HashMap(); // cat->where->list + for (int i=0; i> sub = model.get(cat); + if (sub == null) { + sub = new HashMap<>(); + model.put(cat, sub); + } + List ids = sub.get(where); + if (ids == null) { + ids = new ArrayList<>(); + sub.put(where, ids); + } + ids.add(i); + + if (r.nextInt(100) < commitPercent) { + client.commit(); + } + } + + client.commit(); + + int sz = model.get(0).size(); + + client.testJQ(params(p, "q", "*:*" + , "json.facet", "{f1:{type:terms, field:${cat_s}, limit:2, facet:{x:'unique($where_s)'} }}" + ) + , "facets=={ 'count':" + ndocs + "," + + "'f1':{ 'buckets':[{ 'val':'0', 'count':" + ndocs + ", x:" + sz + " }]} } " + ); + } + + + public void XtestPercentiles() { AVLTreeDigest catA = new AVLTreeDigest(100); catA.add(4);