mirror of
https://github.com/apache/lucene.git
synced 2025-02-24 11:16:35 +00:00
SOLR-7417: implement unique() for numeric fields
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1675427 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
965105b8fe
commit
a02b9a08a1
@ -111,6 +111,9 @@ New Features
|
||||
* SOLR-7176: zkcli script can perfrom the CLUSTERPROP command without a running Solr cluster
|
||||
(Hrishikesh Gadre, Per Steffensen, Noble Paul)
|
||||
|
||||
* SOLR-7417: JSON Facet API - unique() is now implemented for numeric and date fields.
|
||||
(yonik)
|
||||
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
@ -18,10 +18,18 @@ package org.apache.solr.search.facet;
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
|
||||
@ -45,7 +53,11 @@ public class UniqueAgg extends StrAggValueSource {
|
||||
return new UniqueMultivaluedSlotAcc(fcontext, getArg(), numSlots);
|
||||
}
|
||||
} else {
|
||||
return new UniqueSinglevaluedSlotAcc(fcontext, getArg(), numSlots);
|
||||
if (sf.getType().getNumericType() != null) {
|
||||
return new NumericAcc(fcontext, getArg(), numSlots);
|
||||
} else {
|
||||
return new UniqueSinglevaluedSlotAcc(fcontext, getArg(), numSlots);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -86,4 +98,163 @@ public class UniqueAgg extends StrAggValueSource {
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
static class LongSet {
|
||||
|
||||
static final float LOAD_FACTOR = 0.7f;
|
||||
|
||||
long[] vals;
|
||||
int cardinality;
|
||||
int mask;
|
||||
int threshold;
|
||||
int zeroCount; // 1 if a 0 was collected
|
||||
|
||||
/** sz must be a power of two */
|
||||
LongSet(int sz) {
|
||||
vals = new long[sz];
|
||||
mask = sz - 1;
|
||||
threshold = (int) (sz * LOAD_FACTOR);
|
||||
}
|
||||
|
||||
void add(long val) {
|
||||
if (val == 0) {
|
||||
zeroCount = 1;
|
||||
return;
|
||||
}
|
||||
if (cardinality >= threshold) {
|
||||
rehash();
|
||||
}
|
||||
|
||||
// For floats: exponent bits start at bit 23 for single precision,
|
||||
// and bit 52 for double precision.
|
||||
// Many values will only have significant bits just to the right of that,
|
||||
// and the leftmost bits will all be zero.
|
||||
|
||||
// For now, lets just settle to get first 8 significant mantissa bits of double or float in the lowest bits of our hash
|
||||
// The upper bits of our hash will be irrelevant.
|
||||
int h = (int) (val + (val >>> 44) + (val >>> 15));
|
||||
for (int slot = h & mask; ;slot = (slot + 1) & mask) {
|
||||
long v = vals[slot];
|
||||
if (v == 0) {
|
||||
vals[slot] = val;
|
||||
cardinality++;
|
||||
break;
|
||||
} else if (v == val) {
|
||||
// val is already in the set
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void rehash() {
|
||||
long[] oldVals = vals;
|
||||
int newCapacity = vals.length << 1;
|
||||
vals = new long[newCapacity];
|
||||
mask = newCapacity - 1;
|
||||
threshold = (int) (newCapacity * LOAD_FACTOR);
|
||||
cardinality = 0;
|
||||
|
||||
for (long val : oldVals) {
|
||||
if (val != 0) {
|
||||
add(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int cardinality() {
|
||||
return cardinality + zeroCount;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class NumericAcc extends SlotAcc {
|
||||
SchemaField sf;
|
||||
LongSet[] sets;
|
||||
NumericDocValues values;
|
||||
Bits exists;
|
||||
|
||||
public NumericAcc(FacetContext fcontext, String field, int numSlots) throws IOException {
|
||||
super(fcontext);
|
||||
sf = fcontext.searcher.getSchema().getField(field);
|
||||
sets = new LongSet[numSlots];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
sets = new LongSet[sets.length];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(LeafReaderContext readerContext) throws IOException {
|
||||
values = DocValues.getNumeric(readerContext.reader(), sf.getName());
|
||||
exists = DocValues.getDocsWithField(readerContext.reader(), sf.getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc, int slot) throws IOException {
|
||||
long val = values.get(doc);
|
||||
if (val == 0 && !exists.get(doc)) {
|
||||
return;
|
||||
}
|
||||
|
||||
LongSet set = sets[slot];
|
||||
if (set == null) {
|
||||
set = sets[slot] = new LongSet(16);
|
||||
}
|
||||
// TODO: could handle 0s at this level too
|
||||
set.add(val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getValue(int slot) throws IOException {
|
||||
if (fcontext.isShard()) {
|
||||
return getShardValue(slot);
|
||||
}
|
||||
return getCardinality(slot);
|
||||
}
|
||||
|
||||
private int getCardinality(int slot) {
|
||||
LongSet set = sets[slot];
|
||||
return set==null ? 0 : set.cardinality();
|
||||
}
|
||||
|
||||
public Object getShardValue(int slot) throws IOException {
|
||||
LongSet set = sets[slot];
|
||||
int unique = getCardinality(slot);
|
||||
|
||||
SimpleOrderedMap map = new SimpleOrderedMap();
|
||||
map.add("unique", unique);
|
||||
|
||||
int maxExplicit=100;
|
||||
// TODO: make configurable
|
||||
// TODO: share values across buckets
|
||||
if (unique <= maxExplicit) {
|
||||
List lst = new ArrayList( Math.min(unique, maxExplicit) );
|
||||
if (set != null) {
|
||||
if (set.zeroCount > 0) {
|
||||
lst.add(0);
|
||||
}
|
||||
for (long val : set.vals) {
|
||||
if (val != 0) {
|
||||
lst.add(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
map.add("vals", lst);
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compare(int slotA, int slotB) {
|
||||
return getCardinality(slotA) - getCardinality(slotB);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -611,15 +611,15 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
||||
|
||||
// stats at top level
|
||||
client.testJQ(params(p, "q", "*:*"
|
||||
, "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})', numwhere:'unique(${where_s})', med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }"
|
||||
, "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})', numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})', med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }"
|
||||
)
|
||||
, "facets=={ 'count':6, " +
|
||||
"sum1:3.0, sumsq1:247.0, avg1:0.5, min1:-9.0, max1:11.0, numwhere:2, med:2.0, perc:[-9.0,2.0,11.0] }"
|
||||
"sum1:3.0, sumsq1:247.0, avg1:0.5, min1:-9.0, max1:11.0, numwhere:2, unique_num_i:4, unique_num_d:5, unique_date:5, med:2.0, perc:[-9.0,2.0,11.0] }"
|
||||
);
|
||||
|
||||
// stats at top level, no matches
|
||||
client.testJQ(params(p, "q", "id:DOESNOTEXIST"
|
||||
, "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})', numwhere:'unique(${where_s})', med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }"
|
||||
, "json.facet", "{ sum1:'sum(${num_d})', sumsq1:'sumsq(${num_d})', avg1:'avg(${num_d})', min1:'min(${num_d})', max1:'max(${num_d})', numwhere:'unique(${where_s})', unique_num_i:'unique(${num_i})', unique_num_d:'unique(${num_d})', unique_date:'unique(${date})', med:'percentile(${num_d},50)', perc:'percentile(${num_d},0,50.0,100)' }"
|
||||
)
|
||||
, "facets=={count:0 " +
|
||||
"/* ,sum1:0.0, sumsq1:0.0, avg1:0.0, min1:'NaN', max1:'NaN', numwhere:0 */ }"
|
||||
|
Loading…
x
Reference in New Issue
Block a user