mirror of https://github.com/apache/lucene.git

SOLR-7494: fix unique() function for high cardinality fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1677091 13f79535-47bb-0310-9956-ffa450edef68

parent f829d6be3e
commit cc371b4cdf
@@ -233,6 +233,10 @@ Bug Fixes
 * SOLR-7478: UpdateLog#close shuts down its executor with interrupts before running its close logic,
   possibly preventing a clean close. (Mark Miller)
 
+* SOLR-7494: Facet Module - unique() facet function was wildly inaccurate for high cardinality
+  fields. (Andy Crossen, yonik)
+
+
 Optimizations
 ----------------------
 
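For context, the function under fix is the unique() aggregation of the JSON Facet API. A request sketch (field names are hypothetical, mirroring the test added at the bottom of this commit): each bucket of a terms facet on cat_s gets a distinct-value count for where_s, and after this fix each shard also returns up to maxExplicit of the actual values so the merger can de-duplicate and estimate:

    json.facet={ f1 : { type:terms, field:cat_s, facet:{ x : "unique(where_s)" } } }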
@@ -386,17 +386,19 @@ abstract class UniqueSlotAcc extends SlotAcc {
     int maxExplicit=100;
     // TODO: make configurable
     // TODO: share values across buckets
-    if (unique <= maxExplicit) {
+    if (unique > 0) {
+
       List lst = new ArrayList( Math.min(unique, maxExplicit) );
 
-      if (ords != null) {
-        for (int ord=-1;;) {
-          if (++ord >= unique) break;
+      long maxOrd = ords.length();
+      if (ords != null && ords.length() > 0) {
+        for (int ord=0; lst.size() < maxExplicit;) {
           ord = ords.nextSetBit(ord);
           if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
           BytesRef val = lookupOrd(ord);
           Object o = field.getType().toObject(field, val);
           lst.add(o);
+          if (++ord >= maxOrd) break;
         }
       }
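The core bug is visible in the removed loop: `if (++ord >= unique) break;` compares a bit position against the count of set bits. Ordinals are positions in the field's ordinal space, not a running tally, so any value whose ordinal was >= the bucket's unique count was silently skipped — which is almost everything on a high cardinality field. The gate also changed from unique <= maxExplicit to unique > 0, so buckets with more than maxExplicit unique values now return a truncated sample instead of nothing; the merge-side estimate further down depends on that sample. A minimal standalone sketch of the corrected iteration pattern (assumes lucene-core on the classpath; the bit positions are made up):

    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.FixedBitSet;

    public class SetBitIteration {
      public static void main(String[] args) {
        FixedBitSet ords = new FixedBitSet(1000);
        ords.set(5); ords.set(250); ords.set(999);  // few values, large ordinals

        int maxExplicit = 100;
        int collected = 0;
        long maxOrd = ords.length();
        for (int ord = 0; collected < maxExplicit; ) {
          ord = ords.nextSetBit(ord);
          if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
          System.out.println("ord=" + ord);  // the old loop would have stopped before 250
          collected++;
          if (++ord >= maxOrd) break;  // don't ask nextSetBit for a bit past the end
        }
      }
    }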
@@ -555,43 +557,6 @@ class UniqueMultivaluedSlotAcc extends UniqueSlotAcc implements UnInvertedField.
     nTerms = uif.numTerms();
   }
 
-  @Override
-  public Object getShardValue(int slot) throws IOException {
-    FixedBitSet ords = arr[slot];
-    int unique;
-    if (counts != null) {
-      unique = counts[slot];
-    } else {
-      unique = ords == null ? 0 : ords.cardinality();
-    }
-
-    SimpleOrderedMap map = new SimpleOrderedMap();
-    map.add("unique", unique);
-    map.add("nTerms", nTerms);
-
-    int maxExplicit=100;
-    // TODO: make configurable
-    // TODO: share values across buckets
-    if (unique <= maxExplicit) {
-      List lst = new ArrayList( Math.min(unique, maxExplicit) );
-
-      if (ords != null) {
-        for (int ord=-1;;) {
-          if (++ord >= unique) break;
-          ord = ords.nextSetBit(ord);
-          if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
-          BytesRef val = docToTerm.lookupOrd(ord);
-          Object o = field.getType().toObject(field, val);
-          lst.add(o);
-        }
-      }
-
-      map.add("vals", lst);
-    }
-
-    return map;
-  }
-
   @Override
   protected BytesRef lookupOrd(int ord) throws IOException {
     return docToTerm.lookupOrd(ord);
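This hunk is the payoff of the refactoring: UniqueMultivaluedSlotAcc's duplicated copy of getShardValue() — with the same broken loop — is deleted, and only the term lookup stays behind as an override, so the corrected iteration above lives in exactly one place. A minimal sketch of the shape (names simplified, not the actual class hierarchy):

    // Base class owns the shared getShardValue() logic; subclasses supply
    // only the ordinal-to-term lookup.
    abstract class Acc {
      Object getShardValue(int ord) throws java.io.IOException {
        return lookupOrd(ord);  // shared iteration/formatting would go here
      }
      protected abstract Object lookupOrd(int ord) throws java.io.IOException;
    }

    class MultivaluedAcc extends Acc {
      @Override
      protected Object lookupOrd(int ord) {
        return "term-" + ord;  // stand-in for docToTerm.lookupOrd(ord)
      }
    }

    public class TemplateSketch {
      public static void main(String[] args) throws java.io.IOException {
        System.out.println(new MultivaluedAcc().getShardValue(42));  // term-42
      }
    }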
@@ -67,9 +67,10 @@ public class UniqueAgg extends StrAggValueSource {
   }
 
   private static class Merger extends FacetSortableMerger {
+    long answer = -1;
     long sumUnique;
     Set<Object> values;
-    int shardsMissing;
+    long sumAdded;
     long shardsMissingSum;
     long shardsMissingMax;
 
@@ -79,24 +80,35 @@ public class UniqueAgg extends StrAggValueSource {
       long unique = ((Number)map.get("unique")).longValue();
       sumUnique += unique;
 
-      List vals = (List)map.get("vals");
+      int valsListed = 0;
+      List vals = (List) map.get("vals");
       if (vals != null) {
         if (values == null) {
           values = new HashSet<>(vals.size()*4);
         }
         values.addAll(vals);
-      } else {
-        shardsMissing++;
-        shardsMissingSum += unique;
-        shardsMissingMax = Math.max(shardsMissingMax, unique);
+        valsListed = vals.size();
+        sumAdded += valsListed;
       }
+
+      shardsMissingSum += unique - valsListed;
+      shardsMissingMax = Math.max(shardsMissingMax, unique - valsListed);
       // TODO: somehow get & use the count in the bucket?
     }
 
     private long getLong() {
-      long exactCount = values == null ? 0 : values.size();
-      return exactCount + shardsMissingSum;
+      if (answer >= 0) return answer;
+      answer = values == null ? 0 : values.size();
+      if (answer == 0) {
+        // either a real "0", or no values returned from shards
+        answer = shardsMissingSum;
+        return answer;
+      }
+
+      double factor = ((double)values.size()) / sumAdded;  // what fraction of listed values were unique
+      long estimate = (long)(shardsMissingSum * factor);
+      answer = values.size() + estimate;
+      return answer;
     }
 
     @Override
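The merge step now estimates instead of blindly summing: values listed by the shards are de-duplicated exactly in the HashSet, and the unlisted remainder (shardsMissingSum) is scaled by the uniqueness rate observed among the listed values, rather than being counted in full (which double counts values that appear on several shards). A worked sketch of the arithmetic in getLong(), with hypothetical shard numbers:

    public class UniqueMergeEstimate {
      public static void main(String[] args) {
        // Hypothetical merge: two shards each report unique=150 but list only
        // maxExplicit=100 values; 180 of the 200 listed values are distinct.
        long distinct = 180;          // size of the merged values HashSet
        long sumAdded = 200;          // total values listed across all shards
        long shardsMissingSum = 100;  // unlisted values: (150 - 100) per shard, x2

        double factor = ((double) distinct) / sumAdded;      // 0.9 of listed values were unique
        long estimate = (long) (shardsMissingSum * factor);  // assume unlisted values behave alike: 90
        long answer = distinct + estimate;                   // 270
        System.out.println(answer);
      }
    }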
@@ -886,16 +886,76 @@ public class TestJsonFacets extends SolrTestCaseHS {
 
   }
 
 
   @Test
   public void testDistrib() throws Exception {
     initServers();
-    Client client = servers.getClient( random().nextInt() );
+    Client client = servers.getClient(random().nextInt());
     client.queryDefaults().set( "shards", servers.getShards() );
     doStats( client, params() );
   }
 
+
+  @Test
+  public void testBigger() throws Exception {
+    ModifiableSolrParams p = params("rows", "0", "cat_s", "cat_ss", "where_s", "where_ss");
+    // doBigger(Client.localClient, p);
+
+    initServers();
+    Client client = servers.getClient(random().nextInt());
+    client.queryDefaults().set( "shards", servers.getShards() );
+    doBigger( client, p );
+  }
+
+  public void doBigger(Client client, ModifiableSolrParams p) throws Exception {
+    MacroExpander m = new MacroExpander(p.getMap());
+
+    String cat_s = m.expand("${cat_s}");
+    String where_s = m.expand("${where_s}");
+
+    client.deleteByQuery("*:*", null);
+
+    Random r = new Random(0);  // make deterministic
+    int numCat = 1;
+    int numWhere = 2000000000;
+    int commitPercent = 10;
+    int ndocs=1000;
+
+    Map<Integer, Map<Integer, List<Integer>>> model = new HashMap();  // cat->where->list<ids>
+    for (int i=0; i<ndocs; i++) {
+      Integer cat = r.nextInt(numCat);
+      Integer where = r.nextInt(numWhere);
+      client.add( sdoc("id", i, cat_s,cat, where_s, where) , null );
+      Map<Integer,List<Integer>> sub = model.get(cat);
+      if (sub == null) {
+        sub = new HashMap<>();
+        model.put(cat, sub);
+      }
+      List<Integer> ids = sub.get(where);
+      if (ids == null) {
+        ids = new ArrayList<>();
+        sub.put(where, ids);
+      }
+      ids.add(i);
+
+      if (r.nextInt(100) < commitPercent) {
+        client.commit();
+      }
+    }
+
+    client.commit();
+
+    int sz = model.get(0).size();
+
+    client.testJQ(params(p, "q", "*:*"
+        , "json.facet", "{f1:{type:terms, field:${cat_s}, limit:2, facet:{x:'unique($where_s)'} }}"
+        )
+        , "facets=={ 'count':" + ndocs + "," +
+            "'f1':{ 'buckets':[{ 'val':'0', 'count':" + ndocs + ", x:" + sz + " }]} } "
+    );
+  }
+
+
+
   public void XtestPercentiles() {
     AVLTreeDigest catA = new AVLTreeDigest(100);
     catA.add(4);
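Why this test catches the bug: with numWhere = 2,000,000,000 possible values and only 1000 docs, essentially every where_ss value is distinct, so the single bucket's unique count (~1000) far exceeds maxExplicit = 100 and every shard must return a truncated sample. When the samples show no duplication, the merge factor is 1.0 and the estimate reduces to the exact sum of per-shard unique counts, which is why asserting x equals the model size sz is safe here. A back-of-envelope check (standalone, hypothetical helper name):

    public class ExpectedDistinct {
      public static void main(String[] args) {
        double N = 2_000_000_000d;  // numWhere
        double n = 1000d;           // ndocs
        // Expected distinct values after n uniform draws from N:
        double expected = N * (1 - Math.pow(1 - 1 / N, n));
        System.out.println(expected);  // ~999.99975 -> collisions are negligible
      }
    }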