SOLR-7494: fix unique() function for high cardinality fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1677091 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2015-05-01 02:15:02 +00:00
parent f829d6be3e
commit cc371b4cdf
4 changed files with 92 additions and 51 deletions


@@ -233,6 +233,10 @@ Bug Fixes
 * SOLR-7478: UpdateLog#close shuts down it's executor with interrupts before running it's close logic,
   possibly preventing a clean close. (Mark Miller)
 
+* SOLR-7494: Facet Module - unique() facet function was wildly inaccurate for high cardinality
+  fields. (Andy Crossen, yonik)
+
 Optimizations
 ----------------------
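The inaccuracy came from the distributed merge: any shard whose unique count exceeded the 100-value cap returned no explicit values, so the old merger simply added that shard's whole unique count to the total, counting values shared across shards more than once. A standalone sketch of the overcount (shard contents are made up for illustration):

```java
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class OvercountDemo {
    public static void main(String[] args) {
        // Two hypothetical shards whose value sets overlap in "b" and "c".
        List<String> shard1 = List.of("a", "b", "c");
        List<String> shard2 = List.of("b", "c", "d");

        // Old fallback for shards that listed no values: add up the
        // per-shard unique counts. "b" and "c" are counted twice.
        System.out.println(shard1.size() + shard2.size()); // 6

        // The true distinct count across both shards.
        Set<String> all = new HashSet<>(shard1);
        all.addAll(shard2);
        System.out.println(all.size()); // 4
    }
}
```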


@@ -386,17 +386,19 @@ abstract class UniqueSlotAcc extends SlotAcc {
     int maxExplicit=100;
     // TODO: make configurable
     // TODO: share values across buckets
-    if (unique <= maxExplicit) {
+    if (unique > 0) {
       List lst = new ArrayList( Math.min(unique, maxExplicit) );
-      if (ords != null) {
-        for (int ord=-1;;) {
-          if (++ord >= unique) break;
+      long maxOrd = ords.length();
+      if (ords != null && ords.length() > 0) {
+        for (int ord=0; lst.size() < maxExplicit;) {
           ord = ords.nextSetBit(ord);
           if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
           BytesRef val = lookupOrd(ord);
           Object o = field.getType().toObject(field, val);
           lst.add(o);
+          if (++ord >= maxOrd) break;
         }
       }
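The corrected loop keeps collecting until maxExplicit values are gathered or the bit set is exhausted, and only then re-checks the ordinal bound. The old loop broke as soon as the bit index reached `unique`, which is a count, not an index, so a sparse bit set with high ordinals, exactly what a high cardinality field produces, was cut off early. A minimal standalone rendition of the fixed iteration pattern (the method name sampleOrds is mine, and java.util.BitSet stands in for Lucene's FixedBitSet; BitSet.nextSetBit returns -1 rather than NO_MORE_DOCS when exhausted):

```java
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

public class SetBitSample {
    // Collect up to maxExplicit set-bit indices: the loop is bounded by how
    // many values have been listed, not by the total cardinality.
    static List<Integer> sampleOrds(BitSet ords, int maxExplicit) {
        List<Integer> lst = new ArrayList<>();
        for (int ord = 0; lst.size() < maxExplicit; ) {
            ord = ords.nextSetBit(ord);
            if (ord < 0) break; // no more set bits
            lst.add(ord);
            ord++;              // resume the scan after the current bit
        }
        return lst;
    }

    public static void main(String[] args) {
        BitSet ords = new BitSet();
        ords.set(5); ords.set(1000); ords.set(50_000); // sparse, high ordinals
        // The old bound "ord >= unique" (unique == 3 here) would have
        // stopped before ever reaching ordinal 1000.
        System.out.println(sampleOrds(ords, 100)); // [5, 1000, 50000]
    }
}
```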
@@ -555,43 +557,6 @@ class UniqueMultivaluedSlotAcc extends UniqueSlotAcc implements UnInvertedField.
     nTerms = uif.numTerms();
   }
 
-  @Override
-  public Object getShardValue(int slot) throws IOException {
-    FixedBitSet ords = arr[slot];
-    int unique;
-    if (counts != null) {
-      unique = counts[slot];
-    } else {
-      unique = ords == null ? 0 : ords.cardinality();
-    }
-
-    SimpleOrderedMap map = new SimpleOrderedMap();
-    map.add("unique", unique);
-    map.add("nTerms", nTerms);
-
-    int maxExplicit=100;
-    // TODO: make configurable
-    // TODO: share values across buckets
-    if (unique <= maxExplicit) {
-      List lst = new ArrayList( Math.min(unique, maxExplicit) );
-      if (ords != null) {
-        for (int ord=-1;;) {
-          if (++ord >= unique) break;
-          ord = ords.nextSetBit(ord);
-          if (ord == DocIdSetIterator.NO_MORE_DOCS) break;
-          BytesRef val = docToTerm.lookupOrd(ord);
-          Object o = field.getType().toObject(field, val);
-          lst.add(o);
-        }
-      }
-      map.add("vals", lst);
-    }
-
-    return map;
-  }
-
+  @Override
+  protected BytesRef lookupOrd(int ord) throws IOException {
+    return docToTerm.lookupOrd(ord);
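With getShardValue hoisted into the abstract UniqueSlotAcc in the previous hunk, the multivalued subclass shrinks to this single hook: it supplies the ordinal-to-term lookup via docToTerm and inherits everything else. A minimal sketch of the resulting shape (class names and bodies are stand-ins, not the real Solr code):

```java
// Sketch only: the base class owns the shared response-building logic...
abstract class UniqueAccSketch {
    Object getShardValue(int slot) {
        return "val:" + lookupOrd(slot);
    }

    // ...and subclasses override just the ordinal-to-term hook.
    protected abstract String lookupOrd(int ord);
}

class MultivaluedAccSketch extends UniqueAccSketch {
    @Override
    protected String lookupOrd(int ord) {
        return "term" + ord; // the real code delegates to docToTerm.lookupOrd(ord)
    }
}
```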


@@ -67,9 +67,10 @@ public class UniqueAgg extends StrAggValueSource {
   }
 
   private static class Merger extends FacetSortableMerger {
+    long answer = -1;
     long sumUnique;
     Set<Object> values;
-    int shardsMissing;
+    long sumAdded;
     long shardsMissingSum;
     long shardsMissingMax;
@@ -79,24 +80,35 @@ public class UniqueAgg extends StrAggValueSource {
       long unique = ((Number)map.get("unique")).longValue();
       sumUnique += unique;
 
-      List vals = (List)map.get("vals");
+      int valsListed = 0;
+      List vals = (List) map.get("vals");
       if (vals != null) {
         if (values == null) {
           values = new HashSet<>(vals.size()*4);
         }
         values.addAll(vals);
-      } else {
-        shardsMissing++;
-        shardsMissingSum += unique;
-        shardsMissingMax = Math.max(shardsMissingMax, unique);
+        valsListed = vals.size();
+        sumAdded += valsListed;
       }
+      shardsMissingSum += unique - valsListed;
+      shardsMissingMax = Math.max(shardsMissingMax, unique - valsListed);
       // TODO: somehow get & use the count in the bucket?
     }
 
     private long getLong() {
-      long exactCount = values == null ? 0 : values.size();
-      return exactCount + shardsMissingSum;
+      if (answer >= 0) return answer;
+      answer = values == null ? 0 : values.size();
+      if (answer == 0) {
+        // either a real "0", or no values returned from shards
+        answer = shardsMissingSum;
+        return answer;
+      }
+
+      double factor = ((double)values.size()) / sumAdded;  // what fraction of listed values were unique
+      long estimate = (long)(shardsMissingSum * factor);
+
+      answer = values.size() + estimate;
+      return answer;
     }
 
     @Override
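The new getLong() deduplicates whatever values the shards did list, measures how much those listed values overlapped, and assumes the values they did not list overlap at the same rate. A standalone rendition of the arithmetic (variable names mirror the patch, but the two shards and their numbers are made up):

```java
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class UniqueMergeDemo {
    public static void main(String[] args) {
        Set<Object> values = new HashSet<>();
        long sumAdded = 0, shardsMissingSum = 0;

        // Two hypothetical shards: each holds 6 unique values but lists only 4.
        List<List<String>> shards = List.of(
            List.of("a", "b", "c", "d"),
            List.of("c", "d", "e", "f"));
        for (List<String> vals : shards) {
            long unique = 6;               // the shard-reported unique count
            values.addAll(vals);           // exact dedup of listed values
            int valsListed = vals.size();  // 4 per shard
            sumAdded += valsListed;
            shardsMissingSum += unique - valsListed; // 2 unlisted per shard
        }

        // 6 distinct out of 8 listed -> 75% of listed values were unique.
        double factor = ((double) values.size()) / sumAdded;
        // Assume unlisted values are unique at the same rate: 4 * 0.75 = 3.
        long estimate = (long) (shardsMissingSum * factor);
        System.out.println(values.size() + estimate); // 6 + 3 = 9
    }
}
```

When shards share no values, the factor stays at 1.0 and the answer degrades to the old sum of per-shard uniques; the more the listed samples overlap, the more the estimate for the unlisted remainder shrinks.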

@@ -886,16 +886,76 @@ public class TestJsonFacets extends SolrTestCaseHS {
   }
 
   @Test
   public void testDistrib() throws Exception {
     initServers();
-    Client client = servers.getClient( random().nextInt() );
+    Client client = servers.getClient(random().nextInt());
     client.queryDefaults().set( "shards", servers.getShards() );
 
     doStats( client, params() );
   }
 
+  @Test
+  public void testBigger() throws Exception {
+    ModifiableSolrParams p = params("rows", "0", "cat_s", "cat_ss", "where_s", "where_ss");
+    // doBigger(Client.localClient, p);
+
+    initServers();
+    Client client = servers.getClient(random().nextInt());
+    client.queryDefaults().set( "shards", servers.getShards() );
+
+    doBigger( client, p );
+  }
+
+  public void doBigger(Client client, ModifiableSolrParams p) throws Exception {
+    MacroExpander m = new MacroExpander(p.getMap());
+
+    String cat_s = m.expand("${cat_s}");
+    String where_s = m.expand("${where_s}");
+
+    client.deleteByQuery("*:*", null);
+
+    Random r = new Random(0);  // make deterministic
+    int numCat = 1;
+    int numWhere = 2000000000;
+    int commitPercent = 10;
+    int ndocs=1000;
+
+    Map<Integer, Map<Integer, List<Integer>>> model = new HashMap();  // cat->where->list<ids>
+
+    for (int i=0; i<ndocs; i++) {
+      Integer cat = r.nextInt(numCat);
+      Integer where = r.nextInt(numWhere);
+      client.add( sdoc("id", i, cat_s,cat, where_s, where) , null );
+      Map<Integer,List<Integer>> sub = model.get(cat);
+      if (sub == null) {
+        sub = new HashMap<>();
+        model.put(cat, sub);
+      }
+      List<Integer> ids = sub.get(where);
+      if (ids == null) {
+        ids = new ArrayList<>();
+        sub.put(where, ids);
+      }
+      ids.add(i);
+
+      if (r.nextInt(100) < commitPercent) {
+        client.commit();
+      }
+    }
+
+    client.commit();
+
+    int sz = model.get(0).size();
+
+    client.testJQ(params(p, "q", "*:*"
+        , "json.facet", "{f1:{type:terms, field:${cat_s}, limit:2, facet:{x:'unique($where_s)'} }}"
+        )
+        , "facets=={ 'count':" + ndocs + "," +
+            "'f1':{ 'buckets':[{ 'val':'0', 'count':" + ndocs + ", x:" + sz + " }]} } "
+    );
+  }
+
   public void XtestPercentiles() {
     AVLTreeDigest catA = new AVLTreeDigest(100);
     catA.add(4);