SOLR-7417: implement unique() for numeric fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675427 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2015-04-22 17:22:27 +00:00
parent 965105b8fe
commit a02b9a08a1
3 changed files with 178 additions and 4 deletions

View File

@ -111,6 +111,9 @@ New Features
* SOLR-7176: zkcli script can perfrom the CLUSTERPROP command without a running Solr cluster * SOLR-7176: zkcli script can perfrom the CLUSTERPROP command without a running Solr cluster
(Hrishikesh Gadre, Per Steffensen, Noble Paul) (Hrishikesh Gadre, Per Steffensen, Noble Paul)
* SOLR-7417: JSON Facet API - unique() is now implemented for numeric and date fields.
(yonik)
Bug Fixes Bug Fixes
---------------------- ----------------------

View File

@ -18,10 +18,18 @@ package org.apache.solr.search.facet;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField; import org.apache.solr.schema.SchemaField;
@ -45,7 +53,11 @@ public class UniqueAgg extends StrAggValueSource {
return new UniqueMultivaluedSlotAcc(fcontext, getArg(), numSlots); return new UniqueMultivaluedSlotAcc(fcontext, getArg(), numSlots);
} }
} else { } else {
return new UniqueSinglevaluedSlotAcc(fcontext, getArg(), numSlots); if (sf.getType().getNumericType() != null) {
return new NumericAcc(fcontext, getArg(), numSlots);
} else {
return new UniqueSinglevaluedSlotAcc(fcontext, getArg(), numSlots);
}
} }
} }
@ -86,4 +98,163 @@ public class UniqueAgg extends StrAggValueSource {
} }
}; };
} }
static class LongSet {
static final float LOAD_FACTOR = 0.7f;
long[] vals;
int cardinality;
int mask;
int threshold;
int zeroCount; // 1 if a 0 was collected
/** sz must be a power of two */
LongSet(int sz) {
vals = new long[sz];
mask = sz - 1;
threshold = (int) (sz * LOAD_FACTOR);
}
void add(long val) {
if (val == 0) {
zeroCount = 1;
return;
}
if (cardinality >= threshold) {
rehash();
}
// For floats: exponent bits start at bit 23 for single precision,
// and bit 52 for double precision.
// Many values will only have significant bits just to the right of that,
// and the leftmost bits will all be zero.
// For now, lets just settle to get first 8 significant mantissa bits of double or float in the lowest bits of our hash
// The upper bits of our hash will be irrelevant.
int h = (int) (val + (val >>> 44) + (val >>> 15));
for (int slot = h & mask; ;slot = (slot + 1) & mask) {
long v = vals[slot];
if (v == 0) {
vals[slot] = val;
cardinality++;
break;
} else if (v == val) {
// val is already in the set
break;
}
}
}
private void rehash() {
long[] oldVals = vals;
int newCapacity = vals.length << 1;
vals = new long[newCapacity];
mask = newCapacity - 1;
threshold = (int) (newCapacity * LOAD_FACTOR);
cardinality = 0;
for (long val : oldVals) {
if (val != 0) {
add(val);
}
}
}
int cardinality() {
return cardinality + zeroCount;
}
}
class NumericAcc extends SlotAcc {
SchemaField sf;
LongSet[] sets;
NumericDocValues values;
Bits exists;
public NumericAcc(FacetContext fcontext, String field, int numSlots) throws IOException {
super(fcontext);
sf = fcontext.searcher.getSchema().getField(field);
sets = new LongSet[numSlots];
}
@Override
public void reset() {
sets = new LongSet[sets.length];
}
@Override
public void setNextReader(LeafReaderContext readerContext) throws IOException {
values = DocValues.getNumeric(readerContext.reader(), sf.getName());
exists = DocValues.getDocsWithField(readerContext.reader(), sf.getName());
}
@Override
public void collect(int doc, int slot) throws IOException {
long val = values.get(doc);
if (val == 0 && !exists.get(doc)) {
return;
}
LongSet set = sets[slot];
if (set == null) {
set = sets[slot] = new LongSet(16);
}
// TODO: could handle 0s at this level too
set.add(val);
}
@Override
public Object getValue(int slot) throws IOException {
if (fcontext.isShard()) {
return getShardValue(slot);
}
return getCardinality(slot);
}
private int getCardinality(int slot) {
LongSet set = sets[slot];
return set==null ? 0 : set.cardinality();
}
public Object getShardValue(int slot) throws IOException {
LongSet set = sets[slot];
int unique = getCardinality(slot);
SimpleOrderedMap map = new SimpleOrderedMap();
map.add("unique", unique);
int maxExplicit=100;
// TODO: make configurable
// TODO: share values across buckets
if (unique <= maxExplicit) {
List lst = new ArrayList( Math.min(unique, maxExplicit) );
if (set != null) {
if (set.zeroCount > 0) {
lst.add(0);
}
for (long val : set.vals) {
if (val != 0) {
lst.add(val);
}
}
}
map.add("vals", lst);
}
return map;
}
@Override
public int compare(int slotA, int slotB) {
return getCardinality(slotA) - getCardinality(slotB);
}
}
} }

View File

@ -611,15 +611,15 @@ public class TestJsonFacets extends SolrTestCaseHS {
// stats at top level // stats at top level
client.testJQ(params(p, "q", "*:*" client.testJQ(params(p, "q", "*:*"
, "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})', numwhere:'unique(${where_s})', med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }" , "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})', numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})', med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }"
) )
, "facets=={ 'count':6, " + , "facets=={ 'count':6, " +
"sum1:3.0, sumsq1:247.0, avg1:0.5, min1:-9.0, max1:11.0, numwhere:2, med:2.0, perc:[-9.0,2.0,11.0] }" "sum1:3.0, sumsq1:247.0, avg1:0.5, min1:-9.0, max1:11.0, numwhere:2, unique_num_i:4, unique_num_d:5, unique_date:5, med:2.0, perc:[-9.0,2.0,11.0] }"
); );
// stats at top level, no matches // stats at top level, no matches
client.testJQ(params(p, "q", "id:DOESNOTEXIST" client.testJQ(params(p, "q", "id:DOESNOTEXIST"
, "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})', numwhere:'unique(${where_s})', med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }" , "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})', numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})', med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }"
) )
, "facets=={count:0 " + , "facets=={count:0 " +
"/* ,sum1:0.0, sumsq1:0.0, avg1:0.0, min1:'NaN', max1:'NaN', numwhere:0 */ }" "/* ,sum1:0.0, sumsq1:0.0, avg1:0.0, min1:'NaN', max1:'NaN', numwhere:0 */ }"