mirror of https://github.com/apache/lucene.git

SOLR-9480: A new 'relatedness()' aggregate function for JSON Faceting to enable building Semantic Knowledge Graphs

parent 0c0fce3e98
commit 669b9e7a53

@@ -116,6 +116,9 @@ New Features

* SOLR-11277: Add auto hard-commit settings based on tlog size (Rupa Shankar, Anshum Gupta)

* SOLR-9480: A new 'relatedness()' aggregate function for JSON Faceting to enable building Semantic
  Knowledge Graphs. (Trey Grainger, hossman)

Bug Fixes
----------------------
@@ -63,6 +63,7 @@ import org.apache.solr.search.facet.PercentileAgg;
import org.apache.solr.search.facet.StddevAgg;
import org.apache.solr.search.facet.SumAgg;
import org.apache.solr.search.facet.SumsqAgg;
import org.apache.solr.search.facet.RelatednessAgg;
import org.apache.solr.search.facet.UniqueAgg;
import org.apache.solr.search.facet.UniqueBlockAgg;
import org.apache.solr.search.facet.VarianceAgg;

@@ -1039,6 +1040,17 @@ public abstract class ValueSourceParser implements NamedListInitializedPlugin {
addParser("agg_percentile", new PercentileAgg.Parser());

addParser("agg_" + RelatednessAgg.NAME, new ValueSourceParser() {
@Override
public ValueSource parse(FunctionQParser fp) throws SyntaxError {
// TODO: (fore & back)-ground should be optional -- use hasMoreArguments
// if only one arg, assume it's the foreground
// (background is the one that will most commonly just be "*:*")
// see notes in RelatednessAgg constructor about why we don't do this yet
return new RelatednessAgg(fp.parseNestedQuery(), fp.parseNestedQuery());
}
});

addParser("childfield", new ChildFieldValueSourceParser());
}
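The parser registration above shows that relatedness() takes two nested queries: the foreground query first, then the background query. As a rough sketch only (the field name "genre_s", the example query, and the "fore"/"back" parameter names are illustrative assumptions, not taken from this commit), the new aggregate could be nested under a terms facet in a JSON Facet request like:

{
  "query": "text:cancer",
  "facet": {
    "my_buckets": {
      "type": "terms",
      "field": "genre_s",
      "facet": {
        "my_skg": "relatedness($fore,$back)"
      }
    }
  },
  "params": { "fore": "text:cancer", "back": "*:*" }
}

Each bucket of "my_buckets" would then carry a "my_skg" value comparing how strongly that bucket relates to the foreground set versus the background set.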
@@ -42,7 +42,7 @@ abstract class FacetRequestSorted extends FacetRequest {
@Override
public boolean returnsPartial() {
return limit > 0;
return super.returnsPartial() || (limit > 0);
}

}
@@ -36,6 +36,7 @@ import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.facet.SlotAcc.SlotContext;

import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;

@@ -226,23 +227,23 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
}

int collectFirstPhase(DocSet docs, int slot) throws IOException {
int collectFirstPhase(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
int num = -1;
if (collectAcc != null) {
num = collectAcc.collect(docs, slot);
num = collectAcc.collect(docs, slot, slotContext);
}
if (allBucketsAcc != null) {
num = allBucketsAcc.collect(docs, slot);
num = allBucketsAcc.collect(docs, slot, slotContext);
}
return num >= 0 ? num : docs.size();
}

void collectFirstPhase(int segDoc, int slot) throws IOException {
void collectFirstPhase(int segDoc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
if (collectAcc != null) {
collectAcc.collect(segDoc, slot);
collectAcc.collect(segDoc, slot, slotContext);
}
if (allBucketsAcc != null) {
allBucketsAcc.collect(segDoc, slot);
allBucketsAcc.collect(segDoc, slot, slotContext);
}
}

@@ -373,7 +374,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
Comparable val = bucketValFromSlotNumFunc.apply(slotNum);
bucket.add("val", val);

Query filter = needFilter ? sf.getType().getFieldQuery(null, sf, fieldQueryValFunc.apply(val)) : null;
Query filter = needFilter ? makeBucketQuery(fieldQueryValFunc.apply(val)) : null;

fillBucket(bucket, countAcc.getCount(slotNum), slotNum, null, filter);

@@ -388,6 +389,15 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
return res;
}

/**
* Trivial helper method for building up a bucket query given the (Stringified) bucket value
*/
protected Query makeBucketQuery(final String bucketValue) {
// TODO: this isn't viable for things like text fields w/ analyzers that are non-idempotent (ie: stemmers)
// TODO: but changing it to just use TermQuery isn't safe for things like numerics, dates, etc...
return sf.getType().getFieldQuery(null, sf, bucketValue);
}

private void calculateNumBuckets(SimpleOrderedMap<Object> target) throws IOException {
DocSet domain = fcontext.base;
if (freq.prefix != null) {

@@ -397,7 +407,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
HLLAgg agg = new HLLAgg(freq.field);
SlotAcc acc = agg.createSlotAcc(fcontext, domain.size(), 1);
acc.collect(domain, 0);
acc.collect(domain, 0, null); // we know HLL doesn't care about the bucket query
acc.key = "numBuckets";
acc.setValues(target, 0);
}

@@ -433,7 +443,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
// do acc at a time (traversing domain each time) or do all accs for each doc?
for (SlotAcc acc : otherAccs) {
acc.reset(); // TODO: only needed if we previously used for allBuckets or missing
acc.collect(subDomain, 0);
acc.collect(subDomain, 0, slot -> { return new SlotContext(filter); });
acc.setValues(target, 0);
}
}

@@ -442,13 +452,14 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}

@Override
protected void processStats(SimpleOrderedMap<Object> bucket, DocSet docs, int docCount) throws IOException {
protected void processStats(SimpleOrderedMap<Object> bucket, Query bucketQ, DocSet docs, int docCount) throws IOException {
if (docCount == 0 && !freq.processEmpty || freq.getFacetStats().size() == 0) {
bucket.add("count", docCount);
return;
}
createAccs(docCount, 1);
int collected = collect(docs, 0);
assert null != bucketQ;
int collected = collect(docs, 0, slotNum -> { return new SlotContext(bucketQ); });

// countAcc.incrementCount(0, collected); // should we set the count on the acc instead of just passing it?

@@ -499,9 +510,9 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}

@Override
public void collect(int doc, int slot) throws IOException {
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
for (SlotAcc acc : subAccs) {
acc.collect(doc, slot);
acc.collect(doc, slot, slotContext);
}
}

@@ -561,15 +572,15 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}

@Override
public void collect(int doc, int slot) throws IOException {
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
assert slot != collectAccSlot || slot < 0;
count++;
if (collectAcc != null) {
collectAcc.collect(doc, collectAccSlot);
collectAcc.collect(doc, collectAccSlot, slotContext);
}
if (otherAccs != null) {
for (SlotAcc otherAcc : otherAccs) {
otherAcc.collect(doc, otherAccsSlot);
otherAcc.collect(doc, otherAccsSlot, slotContext);
}
}
}
@@ -19,11 +19,14 @@ package org.apache.solr.search.facet;

import java.io.IOException;
import java.util.Date;
import java.util.function.IntFunction;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.search.Query;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.facet.SlotAcc.SlotContext;

import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;

@@ -116,8 +119,28 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
throw new RuntimeException(e);
}
},
obj -> obj instanceof Date ? ((Date)obj).toInstant().toString() : obj.toString()
obj -> valueObjToString(obj)
);
}

private static String valueObjToString(Object obj) {
return (obj instanceof Date) ? ((Date)obj).toInstant().toString() : obj.toString();
}

/**
* SlotContext to use during all {@link SlotAcc} collection.
*
* @see #lookupOrd
*/
public IntFunction<SlotContext> slotContext = (slotNum) -> {
try {
Object value = sf.getType().toObject(sf, lookupOrd(slotNum));
Query q = makeBucketQuery(valueObjToString(value));
assert null != q : "null query for: '" + value + "'";
return new SlotContext(q);
} catch (IOException e) {
throw new RuntimeException(e);
}
};
}
@@ -327,10 +327,10 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
if (arrIdx >= 0 && arrIdx < nTerms) {
countAcc.incrementCount(arrIdx, 1);
if (collectAcc != null) {
collectAcc.collect(doc, arrIdx);
collectAcc.collect(doc, arrIdx, slotContext);
}
if (allBucketsAcc != null) {
allBucketsAcc.collect(doc, arrIdx);
allBucketsAcc.collect(doc, arrIdx, slotContext);
}
}
}
@@ -21,6 +21,7 @@ import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.function.IntFunction;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiPostingsEnum;

@@ -40,6 +41,7 @@ import org.apache.solr.search.DocSet;
import org.apache.solr.search.HashDocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SortedIntDocSet;
import org.apache.solr.search.facet.SlotAcc.SlotContext;

/**
* Enumerates indexed terms in order in a streaming fashion.

@@ -60,6 +62,13 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement
PostingsEnum postingsEnum;
BytesRef startTermBytes;
BytesRef term;

// at any point in processing where we need a SlotContext, all we care about is the current 'term'
private IntFunction<SlotContext> slotContext = (slotNum) -> {
assert null != term;
return new SlotAcc.SlotContext(new TermQuery(new Term(sf.getName(), term)));
};

LeafReaderContext[] leaves;

FacetFieldProcessorByEnumTermsStream(FacetContext fcontext, FacetField freq, SchemaField sf) {

@@ -195,7 +204,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement
private SimpleOrderedMap<Object> _nextBucket() throws IOException {
DocSet termSet = null;

try {
while (term != null) {

@@ -241,7 +250,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement
resetStats();

if (!countOnly) {
collect(termSet, 0);
collect(termSet, 0, slotContext);
}

} else {

@@ -278,7 +287,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement
while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid + base)) {
c++;
collect(docid, 0);
collect(docid, 0, slotContext);
}
}
}

@@ -295,7 +304,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement
while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid)) {
c++;
collect(docid, 0);
collect(docid, 0, slotContext);
}
}
}
@@ -37,6 +37,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSetUtil;
import org.apache.solr.search.facet.SlotAcc.SlotContext;

/**
* Facets numbers into a hash table. The number is either a raw numeric DocValues value, or

@@ -260,7 +261,7 @@ class FacetFieldProcessorByHashDV extends FacetFieldProcessor {
indexOrderAcc = new SlotAcc(fcontext) {
@Override
public void collect(int doc, int slot) throws IOException {
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
}

@Override

@@ -306,7 +307,7 @@ class FacetFieldProcessorByHashDV extends FacetFieldProcessor {
}

@Override
public void collect(int doc, int slot) throws IOException {
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
throw new UnsupportedOperationException();
}

@@ -429,7 +430,10 @@ class FacetFieldProcessorByHashDV extends FacetFieldProcessor {
// Our countAcc is virtual, so this is not needed:
// countAcc.incrementCount(slot, 1);

super.collectFirstPhase(segDoc, slot);
super.collectFirstPhase(segDoc, slot, slotNum -> {
Comparable value = calc.bitsToValue(val);
return new SlotContext(sf.getType().getFieldQuery(null, sf, calc.formatValue(value)));
});
}

private void doRehash(LongCounts table) {
@@ -120,8 +120,10 @@ public abstract class FacetMerger {

subs = null;
for (Map.Entry<String,FacetRequest> entry : freq.subFacets.entrySet()) {
Collection<String> childSubs = getSubsWithPartial(entry.getValue());
if (childSubs.size() > 0 || entry.getValue().returnsPartial()) {
final FacetRequest entryVal = entry.getValue();
Collection<String> childSubs = getSubsWithPartial(entryVal);
// TODO: should returnsPartial() check processEmpty internally?
if (childSubs.size() > 0 || entryVal.returnsPartial() || entryVal.processEmpty) {
if (subs == null) {
subs = new ArrayList<>(freq.getSubFacets().size());
}
@@ -24,6 +24,7 @@ import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.IntFunction;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;

@@ -42,6 +43,7 @@ import org.apache.solr.search.QParser;
import org.apache.solr.search.QueryContext;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.util.RTimer;

public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {

@@ -78,6 +80,7 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
FacetProcessor(FacetContext fcontext, FacetRequestT freq) {
this.fcontext = fcontext;
this.freq = freq;
fcontext.processor = this;
}

public Object getResponse() {

@@ -90,10 +93,13 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
private void evalFilters() throws IOException {
if (freq.domain.filters == null || freq.domain.filters.isEmpty()) return;

List<Query> qlist = new ArrayList<>(freq.domain.filters.size());
this.filter = fcontext.searcher.getDocSet(evalJSONFilterQueryStruct(fcontext, freq.domain.filters));
}

private static List<Query> evalJSONFilterQueryStruct(FacetContext fcontext, List<Object> filters) throws IOException {
List<Query> qlist = new ArrayList<>(filters.size());
// TODO: prevent parsing filters each time!
for (Object rawFilter : freq.domain.filters) {
for (Object rawFilter : filters) {
if (rawFilter instanceof String) {
QParser parser = null;
try {

@@ -149,13 +155,30 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
}

}

this.filter = fcontext.searcher.getDocSet(qlist);
return qlist;
}

private void handleDomainChanges() throws IOException {
if (freq.domain == null) return;
handleFilterExclusions();

if (null != freq.domain.explicitQueries) {
try {
final List<Query> domainQs = evalJSONFilterQueryStruct(fcontext, freq.domain.explicitQueries);
if (domainQs.isEmpty()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"'query' domain must not evaluate to an empty list of queries");
}
fcontext.base = fcontext.searcher.getDocSet(domainQs);
} catch (SolrException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Unable to parse domain 'query': " + freq.domain.explicitQueries +
" -- reason: " + e.getMessage(),
e);
}
} else {
// mutually exclusive to freq.domain.explicitQueries
handleFilterExclusions();
}

// Check filters... if we do have filters they apply after domain changes.
// We still calculate them first because we can use it in a parent->child domain change.

@@ -277,13 +300,13 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
return appliedFilters;
}

protected void processStats(SimpleOrderedMap<Object> bucket, DocSet docs, int docCount) throws IOException {
protected void processStats(SimpleOrderedMap<Object> bucket, Query bucketQ, DocSet docs, int docCount) throws IOException {
if (docCount == 0 && !freq.processEmpty || freq.getFacetStats().size() == 0) {
bucket.add("count", docCount);
return;
}
createAccs(docCount, 1);
int collected = collect(docs, 0);
int collected = collect(docs, 0, slotNum -> { return new SlotContext(bucketQ); });
countAcc.incrementCount(0, collected);
assert collected == docCount;
addStats(bucket, 0);

@@ -319,10 +342,22 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
}
}

int collect(DocSet docs, int slot) throws IOException {
int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
int count = 0;
SolrIndexSearcher searcher = fcontext.searcher;

if (0 == docs.size()) {
// we may be in a "processEmpty" type situation where the client still cares about this bucket
// either way, we should let our accumulators know about the empty set, so they can collect &
// compute the slot (ie: let them decide if they care even when it's size==0)
if (accs != null) {
for (SlotAcc acc : accs) {
acc.collect(docs, slot, slotContext); // NOT per-seg collectors
}
}
return count;
}

final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
LeafReaderContext ctx = null;

@@ -346,15 +381,15 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
setNextReader(ctx);
}
count++;
collect(doc - segBase, slot); // per-seg collectors
collect(doc - segBase, slot, slotContext); // per-seg collectors
}
return count;
}

void collect(int segDoc, int slot) throws IOException {
void collect(int segDoc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
if (accs != null) {
for (SlotAcc acc : accs) {
acc.collect(segDoc, slot);
acc.collect(segDoc, slot, slotContext);
}
}
}

@@ -402,7 +437,7 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {

try {
if (!skip) {
processStats(bucket, result, count);
processStats(bucket, q, result, count);
}
processSubs(bucket, q, result, skip, facetInfo);
} finally {
@@ -38,6 +38,7 @@ import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieDateField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.util.DateMathParser;

import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;

@@ -364,7 +365,7 @@ class FacetRangeProcessor extends FacetProcessor<FacetRange> {
DocSet intersection = fcontext.searcher.getDocSet(rangeQ, fcontext.base);
filters[slot] = rangeQ;
intersections[slot] = intersection; // save for later // TODO: only save if number of slots is small enough?
int num = collect(intersection, slot);
int num = collect(intersection, slot, slotNum -> { return new SlotContext(rangeQ); });
countAcc.incrementCount(slot, num); // TODO: roll this into collect()
}
@@ -87,6 +87,15 @@ public abstract class FacetRequest {

// domain changes
public static class Domain {
/**
* An explicit query domain, <em>ignoring all parent context</em>, expressed in JSON query format.
* Mutually exclusive to {@link #excludeTags}
*/
public List<Object> explicitQueries; // list of symbolic filters (JSON query format)
/**
* Specifies query/filter tags that should be excluded to re-compute the domain from the parent context.
* Mutually exclusive to {@link #explicitQueries}
*/
public List<String> excludeTags;
public JoinField joinField;
public boolean toParent;

@@ -96,12 +105,13 @@ public abstract class FacetRequest {

// True if a starting set of documents can be mapped onto a different set of documents not originally in the starting set.
public boolean canTransformDomain() {
return toParent || toChildren || (excludeTags != null) || (joinField != null);
return toParent || toChildren
|| (explicitQueries != null) || (excludeTags != null) || (joinField != null);
}

// Can this domain become non-empty if the input domain is empty? This does not check any sub-facets (see canProduceFromEmpty for that)
public boolean canBecomeNonEmpty() {
return excludeTags != null;
return (explicitQueries != null) || (excludeTags != null);
}

/** Are we doing a query time join across other documents */

@@ -197,6 +207,7 @@ public abstract class FacetRequest {
* This is normally true only for facets with a limit.
*/
public boolean returnsPartial() {
// TODO: should the default impl check processEmpty ?
return false;
}

@@ -242,6 +253,7 @@ class FacetContext {
public static final int IS_REFINEMENT=0x02;
public static final int SKIP_FACET=0x04; // refinement: skip calculating this immediate facet, but proceed to specific sub-facets based on facetInfo

FacetProcessor processor;
Map<String,Object> facetInfo; // refinement info for this node
QueryContext qcontext;
SolrQueryRequest req; // TODO: replace with params?

@@ -459,6 +471,17 @@ abstract class FacetParser<FacetRequestT extends FacetRequest> {
domain.excludeTags = excludeTags;
}

if (domainMap.containsKey("query")) {
domain.explicitQueries = parseJSONQueryStruct(domainMap.get("query"));
if (null == domain.explicitQueries) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"'query' domain can not be null or empty");
} else if (null != domain.excludeTags) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"'query' domain can not be combined with 'excludeTags'");
}
}

String blockParent = (String)domainMap.get("blockParent");
String blockChildren = (String)domainMap.get("blockChildren");

@@ -475,21 +498,29 @@ abstract class FacetParser<FacetRequestT extends FacetRequest> {
Object filterOrList = domainMap.get("filter");
if (filterOrList != null) {
assert domain.filters == null;
if (filterOrList instanceof List) {
domain.filters = (List<Object>)filterOrList;
} else {
domain.filters = new ArrayList<>(1);
domain.filters.add(filterOrList);
}
domain.filters = parseJSONQueryStruct(filterOrList);
}

} // end "domain"

}
}

/** returns null on null input, otherwise returns a list of the JSON query structures -- either
* directly from the raw (list) input, or if the raw input is not a list then it encapsulates
* it in a new list.
*/
private List<Object> parseJSONQueryStruct(Object raw) {
List<Object> result = null;
if (null == raw) {
return result;
} else if (raw instanceof List) {
result = (List<Object>) raw;
} else {
result = new ArrayList<>(1);
result.add(raw);
}
return result;
}

public String getField(Map<String,Object> args) {
Object fieldName = args.get("field"); // TODO: pull out into defined constant
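The explicitQueries parsing above means a facet's "domain" map may carry a "query" entry (a single JSON query or a list of them) that rebuilds the domain while ignoring the parent context, and that it cannot be combined with "excludeTags". A hedged sketch of what such a request fragment might look like (the facet and field names here are hypothetical, not taken from this commit):

"facet": {
  "all_genres": {
    "type": "terms",
    "field": "genre_s",
    "domain": { "query": "*:*" }
  }
}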
@@ -162,7 +162,7 @@ abstract class FacetRequestSortedMerger<FacetRequestT extends FacetRequestSorted

// TODO: add information in sub-shard response about dropped buckets (i.e. not all returned due to limit)
// If we know we've seen all the buckets from a shard, then we don't have to add to leafBuckets or partialBuckets, only skipBuckets
boolean isCommandPartial = freq.returnsPartial();
boolean isCommandPartial = freq.returnsPartial() || freq.processEmpty; // TODO: should returnsPartial() check processEmpty internally?
boolean returnedAllBuckets = !isCommandPartial && !thisMissing; // did the shard return all of the possible buckets?

if (returnedAllBuckets && tags.isEmpty() && tagsWithPartial.isEmpty()) {
@@ -17,6 +17,7 @@
package org.apache.solr.search.facet;

import java.io.IOException;
import java.util.function.IntFunction;

import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;

@@ -140,7 +141,7 @@ public class HLLAgg extends StrAggValueSource {
}

@Override
public void collect(int doc, int slot) throws IOException {
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
int valuesDocID = docIdSetIterator().docID();
if (valuesDocID < doc) {
valuesDocID = docIdSetIterator().advance(doc);
@@ -19,6 +19,7 @@ package org.apache.solr.search.facet;
import java.io.IOException;
import java.util.Arrays;
import java.util.Date;
import java.util.function.IntFunction;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;

@@ -142,7 +143,7 @@ public class MinMaxAgg extends SimpleAggValueSource {
}

@Override
public void collect(int doc, int slotNum) throws IOException {
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
double val = values.doubleVal(doc);
if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query

@@ -171,7 +172,7 @@ public class MinMaxAgg extends SimpleAggValueSource {
}

@Override
public void collect(int doc, int slotNum) throws IOException {
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
long val = values.longVal(doc);
if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query

@@ -230,7 +231,7 @@ public class MinMaxAgg extends SimpleAggValueSource {
}

@Override
public void collect(int doc, int slotNum) throws IOException {
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
long val = values.longVal(doc);
if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query

@@ -334,7 +335,7 @@ public class MinMaxAgg extends SimpleAggValueSource {
}

@Override
public void collect(int doc, int slotNum) throws IOException {
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
if (subDv.advanceExact(doc)) {
int segOrd = subDv.ordValue();
int ord = toGlobal==null ? segOrd : (int)toGlobal.get(segOrd);
@@ -21,6 +21,7 @@ import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.IntFunction;

import com.tdunning.math.stats.AVLTreeDigest;
import org.apache.lucene.queries.function.ValueSource;

@@ -109,7 +110,7 @@ public class PercentileAgg extends SimpleAggValueSource {
digests = new AVLTreeDigest[numSlots];
}

public void collect(int doc, int slotNum) throws IOException {
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
if (!values.exists(doc)) return;
double val = values.doubleVal(doc);
@@ -0,0 +1,449 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.facet;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Map;
import java.util.function.IntFunction;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.search.Query;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocSet;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An aggregation function designed to be nested under other (possibly deeply nested) facets for
 * the purposes of computing the "relatedness" of facet buckets relative to
 * "foreground" and "background" sets -- primarily for the purpose of building "Semantic Knowledge Graphs"
 *
 * @see <a href="https://arxiv.org/pdf/1609.00464.pdf">The Semantic Knowledge Graph:
 * A compact, auto-generated model for real-time traversal and ranking of any relationship
 * within a domain</a>
 */
public class RelatednessAgg extends AggValueSource {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

// end user values
private static final String RELATEDNESS = "relatedness";
private static final String FG_POP = "foreground_popularity";
private static final String BG_POP = "background_popularity";

// needed for distrib calculation
private static final String FG_SIZE = "foreground_size";
private static final String FG_COUNT = "foreground_count";
private static final String BG_SIZE = "background_size";
private static final String BG_COUNT = "background_count";

final protected Query fgQ;
final protected Query bgQ;

public static final String NAME = RELATEDNESS;
public RelatednessAgg(Query fgQ, Query bgQ) {
super(NAME);
// NOTE: ideally we don't want to assume any defaults *yet* if fgQ/bgQ are null
// keep them null until it's time to create a SlotAcc, at which point we might inherit values
// from an ancestor facet context w/same key -- see comments in createSlotAcc
this.fgQ = fgQ;
this.bgQ = bgQ;

// TODO: defaults not supported yet -- see comments in createSlotAcc
if (null == fgQ || null == bgQ) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
NAME + " aggregate function requires both foreground & background " +
"to be real (non-null) queries");
}
}

@Override
public String description() {
// TODO: need better output processing when we start supporting null fgQ/bgQ in constructor
return name +"(" + fgQ + "," + bgQ + ")";
}

@Override
public boolean equals(Object o) {
if (! Objects.equals(this.getClass(), o.getClass())) {
return false;
}
RelatednessAgg that = (RelatednessAgg) o;
return Objects.equals(fgQ, that.fgQ) && Objects.equals(bgQ, that.bgQ);
}

@Override
public int hashCode() {
return Objects.hash(getClass(), fgQ, bgQ);
}

@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
throw new UnsupportedOperationException("NOT IMPLEMENTED " + name + " " + this);
}

public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) throws IOException {
// TODO: Ideally this is where we should check fgQ/bgQ for 'null' and apply defaults...
//
// we want to walk up the fcontext and inherit the queries from any ancestor SKGAgg
// with the same "key" that we have in our own context -- and as a last resort use
// "$q" for the foreground and "*:*" for the bgQ (if no ancestors)
// (Hmmm... or maybe we should use the "Domain" of our FacetRequest as the default bg?)
//
// How do we find out what key we have in the current context?
// loop over all the stats in the current context until we find one that's '==' to this???

List<Query> fgFilters = new ArrayList<Query>(3);
fgFilters.add(fgQ);
for (FacetContext ctx = fcontext; ctx != null; ctx = ctx.parent) {
if (null != ctx.filter) {
fgFilters.add(ctx.filter);
} else {
// sanity check...
// the only way the filter on the current context should be null is...
assert (// 1) it's the actual top most context,
// (ie: the func is directly used w/o being nested under a facet)
(null == ctx.parent && fcontext == ctx) ||
// 2) it's a child of the top most context
// (ie: the context of a top level facet)
(null == ctx.parent.parent && null == ctx.parent.filter));
// either way, no reason to keep looping up the (0 or 1) remaining ancestors
// (which is why #1 can assert '&& fcontext == ctx')
break;
}
}

return new SKGSlotAcc(fcontext, numSlots, fgFilters, bgQ);
}

@Override
public FacetMerger createFacetMerger(Object prototype) {
return new Merger();
}

private static final class SKGSlotAcc extends SlotAcc {
private BucketData[] slotvalues;
private final DocSet fgSet;
private final DocSet bgSet;
private final long fgSize;
private final long bgSize;
private final List<Query> fgFilters;
private final Query bgQ;
public SKGSlotAcc(FacetContext fcontext, int numSlots,
List<Query> fgFilters, Query bgQ) throws IOException {
super(fcontext);
this.fgFilters = fgFilters;
this.bgQ = bgQ;
this.fgSet = fcontext.searcher.getDocSet(fgFilters);
this.bgSet = fcontext.searcher.getDocSet(bgQ);
// cache the set sizes for frequent re-use on every slot
this.fgSize = fgSet.size();
this.bgSize = bgSet.size();
this.slotvalues = new BucketData[numSlots];
reset();
}

private void processSlot(int slot, IntFunction<SlotContext> slotContext) throws IOException {

assert null != slotContext;

Query slotQ = slotContext.apply(slot).getSlotQuery();
if (null == slotQ) {
// extremely special edge case...
// the only way this should be possible is if our skg() function is used as a "top level" stat
// w/o being nested under any facet, in which case it should be a FacetQuery w/no parent...
assert fcontext.processor.freq instanceof FacetQuery : fcontext.processor.freq;
assert null == fcontext.parent;
assert null == fcontext.filter;
}
// ...and in which case we should just use the current base
final DocSet slotSet = null == slotQ ? fcontext.base : fcontext.searcher.getDocSet(slotQ);

final BucketData slotVal = new BucketData();
slotVal.incSizes(fgSize, bgSize);
slotVal.incCounts(fgSet.intersectionSize(slotSet),
bgSet.intersectionSize(slotSet));
slotvalues[slot] = slotVal;
}

@Override
public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext) throws IOException {
// NOTE: we don't actually care about the individual docs being collected
// (the only reason we even bother implementing this method is because it's needed for sorting
// buckets by a function)

// so we only worry about ensuring that every "slot" / bucket is processed the first time
// we're asked about it...
if (null == slotvalues[slot]) {
processSlot(slot, slotContext);
}
}

@Override
public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
// NOTE: we don't actually care about the doc set being collected for the bucket
// so we only worry about ensuring that every "slot" / bucket is processed exactly once

// if we're doing bulk collection, we better not be getting asked to re-use slots
assert null == slotvalues[slot];
processSlot(slot, slotContext);

// we don't do any filtering, we collect the whole docset, so return that as our collected count
// (as a stat, we're actually required to return this by assertions in FacetFieldProcessor.processStats)
return docs.size();
}

public int compare(int slotA, int slotB) {
final BucketData a = slotvalues[slotA];
final BucketData b = slotvalues[slotB];

// we initialize & reset() (unused) slotvalues elements to null
// but we should never be asked to compare a slot that hasn't been collected...
assert null != a;
assert null != b;
return a.compareTo(b);
}

@Override
public Object getValue(int slotNum) {
BucketData slotVal = slotvalues[slotNum];
if (null == slotVal) {
// since we haven't been told about any docs for this slot, use a slot w/no counts,
// just the known fg/bg sizes. (this is most likely a refinement request for a bucket we don't have)
slotVal = new BucketData();
slotVal.incSizes(fgSize, bgSize);
}

SimpleOrderedMap res = slotVal.externalize(fcontext.isShard());
return res;
}

@Override
public void reset() {
Arrays.fill(slotvalues, null);
}

@Override
public void resize(Resizer resizer) {
slotvalues = resizer.resize(slotvalues, null);
}

@Override
public void close() throws IOException {
slotvalues = null;
}
}

/**
 * Encapsulates all data needed for a single bucket/slot
 *
 * @see SKGSlotAcc
 * @see Merger
 */
private static final class BucketData implements Comparable<BucketData> {

private long fg_size = 0;
private long bg_size = 0;
private long fg_count = 0;
private long bg_count = 0;
private double relatedness = Double.NaN;

public BucketData() {
/* No-Op */
}

/**
 * Increment both the foreground & background <em>counts</em> for the current bucket, resetting any
 * derived values that may be cached
 */
public void incCounts(final long fgInc, final long bgInc) {
this.relatedness = Double.NaN;
fg_count += fgInc;
bg_count += bgInc;
}
/**
 * Increment both the foreground & background <em>sizes</em> for the current bucket, resetting any
 * derived values that may be cached
 */
public void incSizes(final long fgInc, final long bgInc) {
this.relatedness = Double.NaN;
fg_size += fgInc;
bg_size += bgInc;
}

@Override
public int hashCode() {
return Objects.hash(this.getClass(), fg_count, bg_count, fg_size, bg_size);
}

@Override
public boolean equals(Object other) {
if (!Objects.equals(this.getClass(), other.getClass())) {
return false;
}
BucketData that = (BucketData)other;
// we will most certainly be compared to other buckets of the same Agg instance, so compare counts first
return Objects.equals(this.fg_count, that.fg_count)
&& Objects.equals(this.bg_count, that.bg_count)
&& Objects.equals(this.fg_size, that.fg_size)
&& Objects.equals(this.bg_size, that.bg_size);
}

/**
 * Computes (and caches) the derived relatedness score for this bucket
 */
private double getRelatedness() {
if (Double.isNaN(this.relatedness)) {
this.relatedness = computeRelatedness(this.fg_count, this.fg_size,
this.bg_count, this.bg_size);
// TODO: add support for a "min_pop" option...
//
// if min_pop is configured, and either (fg|bg) popularity is lower than that value
// then "this.relatedness=-Infinity" so it sorts at the bottom
// this logic should be ignored on isShard requests -- similar to how shards ignore 'mincount'
}
return this.relatedness;
}

@Override
public int compareTo(BucketData that) {
// TODO: add support for a "sort_val" option...
//
// default should be "relatedness" but also support "foreground" and "background" ...
// either of those should sort by the corresponding ratio
// To do this, we should probably precompute the ratios in incCounts

int r = Double.compare(this.getRelatedness(), that.getRelatedness());
if (0 == r) {
r = Long.compare(this.fg_count, that.fg_count);
}
if (0 == r) {
r = Long.compare(this.bg_count, that.bg_count);
}
return r;
}

/**
 * @see SlotAcc#getValue
 * @see Merger#getMergedResult
 */
public SimpleOrderedMap externalize(final boolean isShardRequest) {
SimpleOrderedMap result = new SimpleOrderedMap<Number>();

if (isShardRequest) {
result.add(FG_COUNT, fg_count);
result.add(BG_COUNT, bg_count);
// NOTE: sizes will be the same for every slot...
// TODO: it would be nice to put them directly in the parent facet, instead of every bucket,
// in order to reduce the size of the response.
result.add(FG_SIZE, fg_size);
result.add(BG_SIZE, bg_size);
} else {
// there's no need to bother computing these when returning results *to* a shard coordinator
// only useful to external clients
result.add(RELATEDNESS, this.getRelatedness());
result.add(FG_POP, roundTo5Digits((double) fg_count / bg_size)); // yes, BACKGROUND size is intentional
result.add(BG_POP, roundTo5Digits((double) bg_count / bg_size));
}

return result;
}
}

/**
 * Merges in the per shard {@link BucketData} output into a unified {@link BucketData}
 */
private static final class Merger extends FacetSortableMerger {
private final BucketData mergedData = new BucketData();

@Override
public void merge(Object facetResult, Context mcontext) {
NamedList<Object> shardData = (NamedList<Object>)facetResult;
mergedData.incSizes((Long)shardData.remove(FG_SIZE), (Long)shardData.remove(BG_SIZE));
mergedData.incCounts((Long)shardData.remove(FG_COUNT), (Long)shardData.remove(BG_COUNT));
}

@Override
public int compareTo(FacetSortableMerger other, FacetRequest.SortDirection direction) {
// NOTE: regardless of the SortDirection hint, we want normal comparison of the BucketData

assert other instanceof Merger;
Merger that = (Merger)other;
return mergedData.compareTo(that.mergedData);
}

@Override
public Object getMergedResult() {
return mergedData.externalize(false);
}
}

/**
 * This is an approximated Z-Score, as described in the "Scoring Semantic Relationships"
 * section of "<a href="https://arxiv.org/pdf/1609.00464.pdf">The Semantic Knowledge Graph:
 * A compact, auto-generated model for real-time traversal and ranking of any relationship
 * within a domain</a>"
 *
 * See Also:<ul>
 * <li><a href="https://s.apache.org/Mfu2">java-user@lucene Message-ID: 449AEB60.4070300@alias-i.com</a></li>
 * <li><a href="https://lingpipe-blog.com/2006/03/29/interesting-phrase-extraction-binomial-hypothesis-testing-vs-coding-loss/">Phrase Extraction: Binomial Hypothesis Testing vs. Coding Loss</a></li>
 * </ul>
 */
// NOTE: javadoc linter freaks out if we try doing those links as '@see <a href=...' tags
public static double computeRelatedness(final long fg_count, final long fg_size,
final long bg_count, final long bg_size) {
final double fg_size_d = (double) fg_size;
final double bg_size_d = (double) bg_size;
final double bg_prob = (bg_count / bg_size_d);
final double num = fg_count - fg_size_d * bg_prob;
double denom = Math.sqrt(fg_size_d * bg_prob * (1 - bg_prob));
denom = (denom == 0) ? 1e-10 : denom;
final double z = num / denom;
final double result = 0.2 * sigmoidHelper(z, -80, 50)
+ 0.2 * sigmoidHelper(z, -30, 30)
+ 0.2 * sigmoidHelper(z, 0, 30)
+ 0.2 * sigmoidHelper(z, 30, 30)
+ 0.2 * sigmoidHelper(z, 80, 50);
return roundTo5Digits(result);
}
/**
 * Helper function for rounding/truncating relatedness & popularity values to
 * 5 decimal digits; since these values are all probabilistic, more than 5 digits aren't really relevant
 * and may give a misleading impression of added precision.
 */
public static double roundTo5Digits(final double val) {
return Math.round(val * 1e5) / 1e5;
}

/** A helper function for scaling values */
private static double sigmoidHelper(final double x, final double offset, final double scale) {
return (x+offset) / (scale + Math.abs(x+offset));
}
}
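Restated as formulas (an editorial summary derived directly from computeRelatedness and sigmoidHelper above, using the code's own variable names):

\[
p = \frac{bg\_count}{bg\_size}, \qquad
z = \frac{fg\_count - fg\_size \cdot p}{\sqrt{fg\_size \cdot p \,(1 - p)}}
\]
\[
\mathrm{relatedness} = 0.2 \bigl( s(z,-80,50) + s(z,-30,30) + s(z,0,30) + s(z,30,30) + s(z,80,50) \bigr),
\qquad s(x,o,c) = \frac{x + o}{c + \lvert x + o \rvert}
\]

with the denominator of z replaced by 1e-10 when it is zero, and the final value rounded to 5 decimal digits via roundTo5Digits.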
@@ -23,11 +23,13 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.function.IntFunction;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocIterator;

@@ -65,9 +67,45 @@ public abstract class SlotAcc implements Closeable {
}
}

public abstract void collect(int doc, int slot) throws IOException;
/**
 * @deprecated This method exists only for back-compatibility; developers of new {@link SlotAcc}
 * implementations should not implement this method, and should instead override
 * {@link #collect(int,int,IntFunction)}
 */
@Deprecated
public void collect(int doc, int slot) throws IOException {
throw new UnsupportedOperationException
("SlotAcc implementations must implement 'collect(int,int,IntFunction<SlotContext>)' or the (deprecated)"
+ "'collect(int,int)'");
}

/**
 * All subclasses should override this method; for back-compatibility the default implementation
 * delegates to the (deprecated) {@link #collect(int,int)}
 *
 * @param doc Single Segment docId (relative to the current {@link LeafReaderContext}) to collect
 * @param slot The slot number to collect this document in
 * @param slotContext A callback that can be used for Accumulators that would like additional info
 *        about the current slot -- the {@link IntFunction} is only guaranteed to be valid for
 *        the current slot, and the {@link SlotContext} returned is only valid for the duration
 *        of the <code>collect()</code> call.
 */
@Deprecated
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
collect(doc,slot);
}

public int collect(DocSet docs, int slot) throws IOException {
/**
 * Bulk collection of all documents in a slot. The default implementation calls {@link #collect(int,int,IntFunction)}
 *
 * @param docs (global) Documents to collect
 * @param slot The slot number to collect these documents in
 * @param slotContext A callback that can be used for Accumulators that would like additional info
 *        about the current slot -- the {@link IntFunction} is only guaranteed to be valid for
 *        the current slot, and the {@link SlotContext} returned is only valid for the duration
 *        of the <code>collect()</code> call.
 */
public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
int count = 0;
SolrIndexSearcher searcher = fcontext.searcher;

@@ -94,7 +132,7 @@ public abstract class SlotAcc implements Closeable {
setNextReader(ctx);
}
count++;
collect(doc - segBase, slot); // per-seg collectors
collect(doc - segBase, slot, slotContext); // per-seg collectors
}
return count;
}

@@ -212,6 +250,19 @@ public abstract class SlotAcc implements Closeable {

} // end class Resizer

/**
 * Encapsulates information about the current slot, for Accumulators that may want
 * additional info during collection.
 */
public static final class SlotContext {
private final Query slotQuery;
public SlotContext(Query slotQuery) {
this.slotQuery = slotQuery;
}
public Query getSlotQuery() {
return slotQuery;
}
}
}

// TODO: we should really have a decoupled value provider...

@@ -349,7 +400,7 @@ class SumSlotAcc extends DoubleFuncSlotAcc {
super(values, fcontext, numSlots);
}

public void collect(int doc, int slotNum) throws IOException {
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
double val = values.doubleVal(doc); // todo: worth trying to share this value across multiple stats that need it?
result[slotNum] += val;
}

@@ -361,7 +412,7 @@ class SumsqSlotAcc extends DoubleFuncSlotAcc {
}

@Override
public void collect(int doc, int slotNum) throws IOException {
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
double val = values.doubleVal(doc);
val = val * val;
result[slotNum] += val;

@@ -386,7 +437,7 @@ class AvgSlotAcc extends DoubleFuncSlotAcc {
}

@Override
public void collect(int doc, int slotNum) throws IOException {
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
double val = values.doubleVal(doc);
if (val != 0 || values.exists(doc)) {
result[slotNum] += val;

@@ -479,7 +530,7 @@ class VarianceSlotAcc extends DoubleFuncSlotAcc {
}

@Override
public void collect(int doc, int slot) throws IOException {
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
double val = values.doubleVal(doc);
if (values.exists(doc)) {
counts[slot]++;

@@ -541,7 +592,7 @@ class StddevSlotAcc extends DoubleFuncSlotAcc {
}

@Override
public void collect(int doc, int slot) throws IOException {
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
double val = values.doubleVal(doc);
if (values.exists(doc)) {
counts[slot]++;

@@ -570,8 +621,9 @@ class CountSlotArrAcc extends CountSlotAcc {
}

@Override
public void collect(int doc, int slotNum) { // TODO: count arrays can use fewer bytes based on the number of docs in
// the base set (that's the upper bound for single valued) - look at ttf?
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) {
// TODO: count arrays can use fewer bytes based on the number of docs in
// the base set (that's the upper bound for single valued) - look at ttf?
result[slotNum]++;
}

@@ -615,7 +667,7 @@ class SortSlotAcc extends SlotAcc {
}

@Override
public void collect(int doc, int slot) throws IOException {
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
// no-op
}
@@ -44,6 +44,7 @@ import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.uninverting.DocTermOrds;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;

@@ -431,7 +432,8 @@ public class UnInvertedField extends DocTermOrds {
if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
// handle the biggest terms
DocSet intersection = searcher.getDocSet(tt.termQuery, docs);
int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex);
int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex,
slotNum -> { return new SlotContext(tt.termQuery); });
countAcc.incrementCount(tt.termNum - startTermIndex, collected);
if (collected > 0) {
uniqueTerms++;

@@ -493,7 +495,7 @@ public class UnInvertedField extends DocTermOrds {
if (arrIdx < 0) continue;
if (arrIdx >= nTerms) break;
countAcc.incrementCount(arrIdx, 1);
processor.collectFirstPhase(segDoc, arrIdx);
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
}
} else {
int tnum = 0;

@@ -507,7 +509,7 @@ public class UnInvertedField extends DocTermOrds {
if (arrIdx >= 0) {
if (arrIdx >= nTerms) break;
countAcc.incrementCount(arrIdx, 1);
processor.collectFirstPhase(segDoc, arrIdx);
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
}
delta = 0;
}
@ -21,6 +21,7 @@ import java.util.ArrayList;
|
|||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@ -145,7 +146,7 @@ public class UniqueAgg extends StrAggValueSource {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc, int slot) throws IOException {
|
||||
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
int valuesDocID = docIdSetIterator().docID();
|
||||
if (valuesDocID < doc) {
|
||||
valuesDocID = docIdSetIterator().advance(doc);
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.MultiDocValues;
|
||||
|
@ -70,7 +71,7 @@ class UniqueMultiDvSlotAcc extends UniqueSlotAcc {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc, int slotNum) throws IOException {
|
||||
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
if (subDv.advanceExact(doc)) {
|
||||
|
||||
int segOrd = (int) subDv.nextOrd();
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
@ -50,7 +51,7 @@ class UniqueMultivaluedSlotAcc extends UniqueSlotAcc implements UnInvertedField.
|
|||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc, int slotNum) throws IOException {
|
||||
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
bits = arr[slotNum];
|
||||
if (bits == null) {
|
||||
bits = new FixedBitSet(nTerms);
|
||||
|
@ -67,4 +68,4 @@ class UniqueMultivaluedSlotAcc extends UniqueSlotAcc implements UnInvertedField.
|
|||
docToTerm = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.MultiDocValues;
|
||||
|
@ -73,7 +74,7 @@ class UniqueSinglevaluedSlotAcc extends UniqueSlotAcc {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc, int slotNum) throws IOException {
|
||||
public void collect(int doc, int slotNum, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
if (doc > subDv.docID()) {
|
||||
subDv.advance(doc);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,654 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.cloud;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.solr.client.solrj.SolrClient;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
||||
import org.apache.solr.client.solrj.impl.HttpSolrClient;
|
||||
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
|
||||
import org.apache.solr.client.solrj.request.QueryRequest;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.search.facet.FacetField;
|
||||
import static org.apache.solr.search.facet.RelatednessAgg.computeRelatedness;
|
||||
import static org.apache.solr.search.facet.RelatednessAgg.roundTo5Digits;
|
||||
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
 * <p>
 * A randomized test of nested facets using the <code>relatedness()</code> function that asserts the
 * accuracy of the results for all the buckets returned, using verification queries of the (expected)
 * foreground & background queries based on the nested facet terms.
 * <p>
 * Note that unlike normal facet "count" verification, using a high limit + overrequest isn't a substitute
 * for refinement in order to ensure accurate "skg" computation across shards. For that reason, this
 * test forces <code>refine: true</code> (unlike {@link TestCloudJSONFacetJoinDomain}) and specifies a
 * <code>domain: { 'query':'{!v=$back}' }</code> for every facet, in order to guarantee that all popularity
 * & relatedness values returned can be proven with validation requests.
 * </p>
 * <p>
 * (Refinement alone is not enough. Using the background query as the facet domain is necessary to
 * prevent situations where a single shardX may return a candidate bucket with no child-buckets due to
 * the normal facet intersections, but when refined on other shardY(s), can produce "high scoring"
 * SKG child-buckets, which would then be missing the foreground/background "size" contributions from
 * shardX.)
 * </p>
 *
 * @see TestCloudJSONFacetJoinDomain
 */
|
||||
@Slow
|
||||
public class TestCloudJSONFacetSKG extends SolrCloudTestCase {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||
|
||||
private static final String DEBUG_LABEL = MethodHandles.lookup().lookupClass().getName();
|
||||
private static final String COLLECTION_NAME = DEBUG_LABEL + "_collection";
|
||||
|
||||
private static final int DEFAULT_LIMIT = FacetField.DEFAULT_FACET_LIMIT;
|
||||
private static final int MAX_FIELD_NUM = 15;
|
||||
private static final int UNIQUE_FIELD_VALS = 50;
|
||||
|
||||
/** Multivalued string field suffixes that can be randomized for testing diff facet/join code paths */
|
||||
private static final String[] STR_FIELD_SUFFIXES = new String[] { "_ss", "_sds", "_sdsS" };
|
||||
/** Multivalued int field suffixes that can be randomized for testing diff facet/join code paths */
|
||||
private static final String[] INT_FIELD_SUFFIXES = new String[] { "_is", "_ids", "_idsS" };
|
||||
|
||||
/** A basic client for operations at the cloud level, default collection will be set */
|
||||
private static CloudSolrClient CLOUD_CLIENT;
|
||||
/** One client per node */
|
||||
private static ArrayList<HttpSolrClient> CLIENTS = new ArrayList<>(5);
|
||||
|
||||
@BeforeClass
|
||||
private static void createMiniSolrCloudCluster() throws Exception {
|
||||
// sanity check constants
|
||||
assertTrue("bad test constants: some suffixes will never be tested",
|
||||
(STR_FIELD_SUFFIXES.length < MAX_FIELD_NUM) && (INT_FIELD_SUFFIXES.length < MAX_FIELD_NUM));
|
||||
|
||||
// we need DVs on point fields to compute stats & facets
|
||||
if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) System.setProperty(NUMERIC_DOCVALUES_SYSPROP,"true");
|
||||
|
||||
// multi replicas should not matter...
|
||||
final int repFactor = usually() ? 1 : 2;
|
||||
// ... but we definitely want to test multiple shards
|
||||
final int numShards = TestUtil.nextInt(random(), 1, (usually() ? 2 :3));
|
||||
final int numNodes = (numShards * repFactor);
|
||||
|
||||
final String configName = DEBUG_LABEL + "_config-set";
|
||||
final Path configDir = Paths.get(TEST_HOME(), "collection1", "conf");
|
||||
|
||||
configureCluster(numNodes).addConfig(configName, configDir).configure();
|
||||
|
||||
Map<String, String> collectionProperties = new LinkedHashMap<>();
|
||||
collectionProperties.put("config", "solrconfig-tlog.xml");
|
||||
collectionProperties.put("schema", "schema_latest.xml");
|
||||
CollectionAdminRequest.createCollection(COLLECTION_NAME, configName, numShards, repFactor)
|
||||
.setProperties(collectionProperties)
|
||||
.process(cluster.getSolrClient());
|
||||
|
||||
CLOUD_CLIENT = cluster.getSolrClient();
|
||||
CLOUD_CLIENT.setDefaultCollection(COLLECTION_NAME);
|
||||
|
||||
waitForRecoveriesToFinish(CLOUD_CLIENT);
|
||||
|
||||
for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
|
||||
CLIENTS.add(getHttpSolrClient(jetty.getBaseUrl() + "/" + COLLECTION_NAME + "/"));
|
||||
}
|
||||
|
||||
final int numDocs = atLeast(100);
|
||||
for (int id = 0; id < numDocs; id++) {
|
||||
SolrInputDocument doc = sdoc("id", ""+id);
|
||||
for (int fieldNum = 0; fieldNum < MAX_FIELD_NUM; fieldNum++) {
|
||||
// NOTE: some docs may have no value in a field
|
||||
final int numValsThisDoc = TestUtil.nextInt(random(), 0, (usually() ? 5 : 10));
|
||||
for (int v = 0; v < numValsThisDoc; v++) {
|
||||
final String fieldValue = randFieldValue(fieldNum);
|
||||
|
||||
// for each fieldNum, there are actually two fields: one string, and one integer
|
||||
doc.addField(field(STR_FIELD_SUFFIXES, fieldNum), fieldValue);
|
||||
doc.addField(field(INT_FIELD_SUFFIXES, fieldNum), fieldValue);
|
||||
}
|
||||
}
|
||||
CLOUD_CLIENT.add(doc);
|
||||
if (random().nextInt(100) < 1) {
|
||||
CLOUD_CLIENT.commit(); // commit 1% of the time to create new segments
|
||||
}
|
||||
if (random().nextInt(100) < 5) {
|
||||
CLOUD_CLIENT.add(doc); // duplicate the doc 5% of the time to create deleted docs
|
||||
}
|
||||
}
|
||||
CLOUD_CLIENT.commit();
|
||||
}
|
||||
|
||||
/**
|
||||
 * Given a (random) number and a (static) array of possible suffixes, returns a consistent field name that
|
||||
 * uses that number and one of the specified suffixes in its name.
|
||||
*
|
||||
* @see #STR_FIELD_SUFFIXES
|
||||
* @see #INT_FIELD_SUFFIXES
|
||||
* @see #MAX_FIELD_NUM
|
||||
* @see #randFieldValue
|
||||
*/
|
||||
private static String field(final String[] suffixes, final int fieldNum) {
|
||||
assert fieldNum < MAX_FIELD_NUM;
|
||||
|
||||
final String suffix = suffixes[fieldNum % suffixes.length];
|
||||
return "field_" + fieldNum + suffix;
|
||||
}
|
||||
private static String strfield(final int fieldNum) {
|
||||
return field(STR_FIELD_SUFFIXES, fieldNum);
|
||||
}
|
||||
private static String intfield(final int fieldNum) {
|
||||
return field(INT_FIELD_SUFFIXES, fieldNum);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a (random) field number, returns a random (integer based) value for that field.
|
||||
 * NOTE: The number of unique values in each field is constant according to {@link #UNIQUE_FIELD_VALS}
|
||||
* but the precise <em>range</em> of values will vary for each unique field number, such that cross field joins
|
||||
* will match fewer documents based on how far apart the field numbers are.
|
||||
*
|
||||
* @see #UNIQUE_FIELD_VALS
|
||||
* @see #field
|
||||
*/
|
||||
private static String randFieldValue(final int fieldNum) {
|
||||
return "" + (fieldNum + TestUtil.nextInt(random(), 1, UNIQUE_FIELD_VALS));
|
||||
}
|
||||
|
||||
|
||||
@AfterClass
|
||||
private static void afterClass() throws Exception {
|
||||
CLOUD_CLIENT.close(); CLOUD_CLIENT = null;
|
||||
for (HttpSolrClient client : CLIENTS) {
|
||||
client.close();
|
||||
}
|
||||
CLIENTS = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test some small, hand crafted, but non-trivial queries that are
|
||||
 * easier to trace/debug than a pure random monstrosity.
|
||||
 * (ie: if something obvious gets broken, this test may fail faster and in a more obvious way than testRandom)
|
||||
*/
|
||||
public void testBespoke() throws Exception {
|
||||
{ // trivial single level facet
|
||||
Map<String,TermFacet> facets = new LinkedHashMap<>();
|
||||
TermFacet top = new TermFacet(strfield(9), UNIQUE_FIELD_VALS, 0, null);
|
||||
facets.put("top1", top);
|
||||
final AtomicInteger maxBuckets = new AtomicInteger(UNIQUE_FIELD_VALS);
|
||||
assertFacetSKGsAreCorrect(maxBuckets, facets, strfield(7)+":11", strfield(5)+":9", "*:*");
|
||||
assertTrue("Didn't check a single bucket???", maxBuckets.get() < UNIQUE_FIELD_VALS);
|
||||
}
|
||||
|
||||
{ // trivial single level facet w/sorting on skg
|
||||
Map<String,TermFacet> facets = new LinkedHashMap<>();
|
||||
TermFacet top = new TermFacet(strfield(9), UNIQUE_FIELD_VALS, 0, "skg desc");
|
||||
facets.put("top2", top);
|
||||
final AtomicInteger maxBuckets = new AtomicInteger(UNIQUE_FIELD_VALS);
|
||||
assertFacetSKGsAreCorrect(maxBuckets, facets, strfield(7)+":11", strfield(5)+":9", "*:*");
|
||||
assertTrue("Didn't check a single bucket???", maxBuckets.get() < UNIQUE_FIELD_VALS);
|
||||
}
|
||||
|
||||
{ // trivial single level facet w/ 2 diff ways to request "limit = (effectively) Infinite"
|
||||
// to sanity check refinement of buckets missing from other shard in both cases
|
||||
|
||||
// NOTE that these two queries & facets *should* be effectively identical given that the
|
||||
// very large limit value is big enough that no shard will ever return that many terms,
|
||||
// but the "limit=-1" case it actaully triggers slightly different code paths
|
||||
// because it causes FacetField.returnsPartial() to be "true"
|
||||
for (int limit : new int[] { 999999999, -1 }) {
|
||||
Map<String,TermFacet> facets = new LinkedHashMap<>();
|
||||
facets.put("top_facet_limit__" + limit, new TermFacet(strfield(9), limit, 0, "skg desc"));
|
||||
final AtomicInteger maxBuckets = new AtomicInteger(UNIQUE_FIELD_VALS);
|
||||
assertFacetSKGsAreCorrect(maxBuckets, facets, strfield(7)+":11", strfield(5)+":9", "*:*");
|
||||
assertTrue("Didn't check a single bucket???", maxBuckets.get() < UNIQUE_FIELD_VALS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
|
||||
// since the "cost" of verifying the stats for each bucket is so high (see TODO in verifySKGResults())
|
||||
// we put a safety valve in place on the maximum number of buckets that we are willing to verify
|
||||
// across *all* the queries that we do.
|
||||
// that way if the randomized queries we build all have relatively small facets, so be it, but if
|
||||
// we get a really big one early on, we can test as much as possible and skip other iterations.
|
||||
//
|
||||
// (deeply nested facets may contain more buckets than the max, but we won't *check* all of them)
|
||||
final int maxBucketsAllowed = atLeast(2000);
|
||||
final AtomicInteger maxBucketsToCheck = new AtomicInteger(maxBucketsAllowed);
|
||||
|
||||
final int numIters = atLeast(10);
|
||||
for (int iter = 0; iter < numIters && 0 < maxBucketsToCheck.get(); iter++) {
|
||||
|
||||
assertFacetSKGsAreCorrect(maxBucketsToCheck, TermFacet.buildRandomFacets(),
|
||||
buildRandomQuery(), buildRandomQuery(), buildRandomQuery());
|
||||
}
|
||||
assertTrue("Didn't check a single bucket???", maxBucketsToCheck.get() < maxBucketsAllowed);
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a random query string across the randomized fields/values in the index
|
||||
*
|
||||
* @see #randFieldValue
|
||||
* @see #field
|
||||
*/
|
||||
private static String buildRandomQuery() {
|
||||
if (0 == TestUtil.nextInt(random(), 0,10)) {
|
||||
return "*:*";
|
||||
}
|
||||
final int numClauses = TestUtil.nextInt(random(), 3, 10);
|
||||
final String[] clauses = new String[numClauses];
|
||||
for (int c = 0; c < numClauses; c++) {
|
||||
final int fieldNum = random().nextInt(MAX_FIELD_NUM);
|
||||
// keep queries simple, just use str fields - not point of test
|
||||
clauses[c] = strfield(fieldNum) + ":" + randFieldValue(fieldNum);
|
||||
}
|
||||
return buildORQuery(clauses);
|
||||
}
|
||||
|
||||
private static String buildORQuery(String... clauses) {
|
||||
assert 0 < clauses.length;
|
||||
return "(" + StringUtils.join(clauses, " OR ") + ")";
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a set of term facets, and top level query strings, asserts that
|
||||
* the SKG stats for each facet term returned when executing that query with those foreground/background
|
||||
 * queries match the expected results of executing the equivalent queries in isolation.
|
||||
*
|
||||
* @see #verifySKGResults
|
||||
*/
|
||||
private void assertFacetSKGsAreCorrect(final AtomicInteger maxBucketsToCheck,
|
||||
Map<String,TermFacet> expected,
|
||||
final String query,
|
||||
final String foreQ,
|
||||
final String backQ) throws SolrServerException, IOException {
|
||||
final SolrParams baseParams = params("rows","0", "fore", foreQ, "back", backQ);
|
||||
|
||||
final SolrParams facetParams = params("q", query,
|
||||
"json.facet", ""+TermFacet.toJSONFacetParamValue(expected,null));
|
||||
final SolrParams initParams = SolrParams.wrapAppended(facetParams, baseParams);
|
||||
|
||||
log.info("Doing full run: {}", initParams);
|
||||
|
||||
QueryResponse rsp = null;
|
||||
// JSON Facets not (currently) available from QueryResponse...
|
||||
NamedList topNamedList = null;
|
||||
try {
|
||||
rsp = (new QueryRequest(initParams)).process(getRandClient(random()));
|
||||
assertNotNull(initParams + " is null rsp?", rsp);
|
||||
topNamedList = rsp.getResponse();
|
||||
assertNotNull(initParams + " is null topNamedList?", topNamedList);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("init query failed: " + initParams + ": " +
|
||||
e.getMessage(), e);
|
||||
}
|
||||
try {
|
||||
final NamedList facetResponse = (NamedList) topNamedList.get("facets");
|
||||
assertNotNull("null facet results?", facetResponse);
|
||||
assertEquals("numFound mismatch with top count?",
|
||||
rsp.getResults().getNumFound(), ((Number)facetResponse.get("count")).longValue());
|
||||
if (0 == rsp.getResults().getNumFound()) {
|
||||
// when the query matches nothing, we should expect no top level facets
|
||||
expected = Collections.emptyMap();
|
||||
}
|
||||
assertFacetSKGsAreCorrect(maxBucketsToCheck, expected, baseParams, facetResponse);
|
||||
} catch (AssertionError e) {
|
||||
throw new AssertionError(initParams + " ===> " + topNamedList + " --> " + e.getMessage(), e);
|
||||
} finally {
|
||||
log.info("Ending full run");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursive helper method that walks the actual facet response, comparing the SKG results to
|
||||
 * the expected output based on the equivalent filters generated from the original TermFacet.
|
||||
*/
|
||||
private void assertFacetSKGsAreCorrect(final AtomicInteger maxBucketsToCheck,
|
||||
final Map<String,TermFacet> expected,
|
||||
final SolrParams baseParams,
|
||||
final NamedList actualFacetResponse) throws SolrServerException, IOException {
|
||||
|
||||
for (Map.Entry<String,TermFacet> entry : expected.entrySet()) {
|
||||
final String facetKey = entry.getKey();
|
||||
final TermFacet facet = entry.getValue();
|
||||
final NamedList results = (NamedList) actualFacetResponse.get(facetKey);
|
||||
assertNotNull(facetKey + " key missing from: " + actualFacetResponse, results);
|
||||
final List<NamedList> buckets = (List<NamedList>) results.get("buckets");
|
||||
assertNotNull(facetKey + " has null buckets: " + actualFacetResponse, buckets);
|
||||
|
||||
if (buckets.isEmpty()) {
|
||||
// should only happen if the background query does not match any docs with field X
|
||||
final long docsWithField = getNumFound(params("_trace", "noBuckets",
|
||||
"rows", "0",
|
||||
"q", facet.field+":[* TO *]",
|
||||
"fq", baseParams.get("back")));
|
||||
|
||||
assertEquals(facetKey + " has no buckets, but docs in background exist with field: " + facet.field,
|
||||
0, docsWithField);
|
||||
}
|
||||
|
||||
// NOTE: it's important that we do this depth first -- not just because it's the easiest way to do it,
|
||||
// but because it means that our maxBucketsToCheck will ensure we do a lot of deep sub-bucket checking,
|
||||
// not just all the buckets of the top level(s) facet(s)
|
||||
for (NamedList bucket : buckets) {
|
||||
final String fieldVal = bucket.get("val").toString(); // int or stringified int
|
||||
|
||||
verifySKGResults(facetKey, facet, baseParams, fieldVal, bucket);
|
||||
if (maxBucketsToCheck.decrementAndGet() <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
final SolrParams verifyParams = SolrParams.wrapAppended(baseParams,
|
||||
params("fq", facet.field + ":" + fieldVal));
|
||||
|
||||
// recursively check subFacets
|
||||
if (! facet.subFacets.isEmpty()) {
|
||||
assertFacetSKGsAreCorrect(maxBucketsToCheck, facet.subFacets, verifyParams, bucket);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
 * Verifies that the popularity & relatedness values contained in a single SKG bucket
|
||||
* match the expected values based on the facet field & bucket value, as well the existing
|
||||
* filterParams.
|
||||
*
|
||||
* @see #assertFacetSKGsAreCorrect
|
||||
*/
|
||||
private void verifySKGResults(String facetKey, TermFacet facet, SolrParams filterParams,
|
||||
String fieldVal, NamedList<Object> bucket)
|
||||
throws SolrServerException, IOException {
|
||||
|
||||
final String bucketQ = facet.field+":"+fieldVal;
|
||||
final NamedList<Object> skgBucket = (NamedList<Object>) bucket.get("skg");
|
||||
assertNotNull(facetKey + "/bucket:" + bucket.toString(), skgBucket);
|
||||
|
||||
// TODO: make this more efficient?
|
||||
// ideally we'd do a single query w/4 facet.queries, one for each count
|
||||
// but formatting the queries is a pain, currently we leverage the accumulated fq's
|
||||
final long fgSize = getNumFound(SolrParams.wrapAppended(params("_trace", "fgSize",
|
||||
"rows","0",
|
||||
"q","{!query v=$fore}"),
|
||||
filterParams));
|
||||
final long bgSize = getNumFound(params("_trace", "bgSize",
|
||||
"rows","0",
|
||||
"q", filterParams.get("back")));
|
||||
|
||||
final long fgCount = getNumFound(SolrParams.wrapAppended(params("_trace", "fgCount",
|
||||
"rows","0",
|
||||
"q","{!query v=$fore}",
|
||||
"fq", bucketQ),
|
||||
filterParams));
|
||||
final long bgCount = getNumFound(params("_trace", "bgCount",
|
||||
"rows","0",
|
||||
"q", bucketQ,
|
||||
"fq", filterParams.get("back")));
|
||||
|
||||
assertEquals(facetKey + "/bucket:" + bucket + " => fgPop should be: " + fgCount + " / " + bgSize,
|
||||
roundTo5Digits((double) fgCount / bgSize),
|
||||
skgBucket.get("foreground_popularity"));
|
||||
assertEquals(facetKey + "/bucket:" + bucket + " => bgPop should be: " + bgCount + " / " + bgSize,
|
||||
roundTo5Digits((double) bgCount / bgSize),
|
||||
skgBucket.get("background_popularity"));
|
||||
assertEquals(facetKey + "/bucket:" + bucket + " => relatedness is wrong",
|
||||
roundTo5Digits(computeRelatedness(fgCount, fgSize, bgCount, bgSize)),
|
||||
skgBucket.get("relatedness"));
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Trivial data structure for modeling a simple terms facet that can be written out as a json.facet param.
|
||||
*
|
||||
* Doesn't do any string escaping or quoting, so don't use whitespace or reserved json characters
|
||||
*/
|
||||
private static final class TermFacet {
|
||||
public final String field;
|
||||
public final Map<String,TermFacet> subFacets = new LinkedHashMap<>();
|
||||
public final Integer limit; // may be null
|
||||
public final Integer overrequest; // may be null
|
||||
public final String sort; // may be null
|
||||
/** Simplified constructor asks for limit = # unique vals */
|
||||
public TermFacet(String field) {
|
||||
this(field, UNIQUE_FIELD_VALS, 0, "skg desc");
|
||||
|
||||
}
|
||||
public TermFacet(String field, Integer limit, Integer overrequest, String sort) {
|
||||
assert null != field;
|
||||
this.field = field;
|
||||
this.limit = limit;
|
||||
this.overrequest = overrequest;
|
||||
this.sort = sort;
|
||||
}
|
||||
|
||||
/**
|
||||
* recursively generates the <code>json.facet</code> param value to use for testing this facet
|
||||
*/
|
||||
private CharSequence toJSONFacetParamValue() {
|
||||
final String limitStr = (null == limit) ? "" : (", limit:" + limit);
|
||||
final String overrequestStr = (null == overrequest) ? "" : (", overrequest:" + overrequest);
|
||||
final String sortStr = (null == sort) ? "" : (", sort: '" + sort + "'");
|
||||
final StringBuilder sb
|
||||
= new StringBuilder("{ type:terms, field:" + field + limitStr + overrequestStr + sortStr);
|
||||
|
||||
// see class javadocs for why we always use refine:true & the query:$back domain for this test.
|
||||
sb.append(", refine: true, domain: { query: '{!v=$back}' }, facet:");
|
||||
sb.append(toJSONFacetParamValue(subFacets, "skg : 'relatedness($fore,$back)'"));
|
||||
sb.append("}");
|
||||
return sb;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a set of (possibly nested) facets, generates a suitable <code>json.facet</code> param value to
|
||||
* use for testing them against in a solr request.
|
||||
*/
|
||||
public static CharSequence toJSONFacetParamValue(final Map<String,TermFacet> facets,
|
||||
final String extraJson) {
|
||||
assert null != facets;
|
||||
if (0 == facets.size() && null == extraJson) {
|
||||
return "";
|
||||
}
|
||||
|
||||
StringBuilder sb = new StringBuilder("{ processEmpty: true, ");
|
||||
for (String key : facets.keySet()) {
|
||||
sb.append(key).append(" : ").append(facets.get(key).toJSONFacetParamValue());
|
||||
sb.append(" ,");
|
||||
}
|
||||
if (null == extraJson) {
|
||||
sb.setLength(sb.length() - 1);
|
||||
} else {
|
||||
sb.append(extraJson);
|
||||
}
|
||||
sb.append("}");
|
||||
return sb;
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory method for generating some random facets.
|
||||
*
|
||||
* For simplicity, each facet will have a unique key name.
|
||||
*/
|
||||
public static Map<String,TermFacet> buildRandomFacets() {
|
||||
// for simplicity, use a unique facet key regardless of depth - simplifies verification
|
||||
// and lets us enforce a hard limit on the total number of facets in a request
|
||||
AtomicInteger keyCounter = new AtomicInteger(0);
|
||||
|
||||
final int maxDepth = TestUtil.nextInt(random(), 0, (usually() ? 2 : 3));
|
||||
return buildRandomFacets(keyCounter, maxDepth);
|
||||
}
|
||||
|
||||
/**
|
||||
* picks a random value for the "sort" param, biased in favor of interesting test cases
|
||||
*
|
||||
* @return a sort string (w/direction), or null to specify nothing (trigger default behavior)
|
||||
* @see #randomLimitParam
|
||||
*/
|
||||
public static String randomSortParam(Random r) {
|
||||
|
||||
// IMPORTANT!!!
|
||||
// if this method is modified to produce new sorts, make sure to update
|
||||
// randomLimitParam to account for them if they are impacted by SOLR-12343
|
||||
final String dir = random().nextBoolean() ? "asc" : "desc";
|
||||
switch(r.nextInt(4)) {
|
||||
case 0: return null;
|
||||
case 1: return "count " + dir;
|
||||
case 2: return "skg " + dir;
|
||||
case 3: return "index " + dir;
|
||||
default: throw new RuntimeException("Broken case statement");
|
||||
}
|
||||
}
|
||||
/**
|
||||
* picks a random value for the "limit" param, biased in favor of interesting test cases
|
||||
*
|
||||
* <p>
|
||||
* <b>NOTE:</b> Due to SOLR-12343, we have to force an overrequest of "all" possible terms for
|
||||
* some sort values.
|
||||
* </p>
|
||||
*
|
||||
* @return a number to specify in the request, or null to specify nothing (trigger default behavior)
|
||||
* @see #UNIQUE_FIELD_VALS
|
||||
* @see #randomSortParam
|
||||
*/
|
||||
public static Integer randomLimitParam(Random r, final String sort) {
|
||||
if (null != sort) {
|
||||
if (sort.equals("count asc") || sort.startsWith("skg")) {
|
||||
// of the known types of sorts produced, these are at risk of SOLR-12343
|
||||
// so request (effectively) unlimited num buckets
|
||||
return r.nextBoolean() ? UNIQUE_FIELD_VALS : -1;
|
||||
}
|
||||
}
|
||||
final int limit = 1 + r.nextInt((int) (UNIQUE_FIELD_VALS * 1.5F));
|
||||
if (limit >= UNIQUE_FIELD_VALS && r.nextBoolean()) {
|
||||
return -1; // unlimited
|
||||
} else if (limit == DEFAULT_LIMIT && r.nextBoolean()) {
|
||||
return null; // sometimes, don't specify limit if it's the default
|
||||
}
|
||||
return limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* picks a random value for the "overrequest" param, biased in favor of interesting test cases.
|
||||
*
|
||||
* @return a number to specify in the request, or null to specify nothing (trigger default behavior)
|
||||
* @see #UNIQUE_FIELD_VALS
|
||||
*/
|
||||
public static Integer randomOverrequestParam(Random r) {
|
||||
switch(r.nextInt(10)) {
|
||||
case 0:
|
||||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
return 0; // 40% of the time, disable overrequest to better stress refinement
|
||||
case 4:
|
||||
case 5:
|
||||
return r.nextInt(UNIQUE_FIELD_VALS); // 20% ask for less than what's needed
|
||||
case 6:
|
||||
return r.nextInt(Integer.MAX_VALUE); // 10%: completely random value, statistically more than enough
|
||||
default: break;
|
||||
}
|
||||
// else.... either leave param unspecified (or redundantly specify the -1 default)
|
||||
return r.nextBoolean() ? null : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* recursive helper method for building random facets
|
||||
*
|
||||
* @param keyCounter used to ensure every generated facet has a unique key name
|
||||
 * @param maxDepth max possible depth allowed for the recursion, a lower value may be used depending on how many facets are returned at the current level.
|
||||
*/
|
||||
private static Map<String,TermFacet> buildRandomFacets(AtomicInteger keyCounter, int maxDepth) {
|
||||
final int numFacets = Math.max(1, TestUtil.nextInt(random(), -1, 3)); // 3/5th chance of being '1'
|
||||
Map<String,TermFacet> results = new LinkedHashMap<>();
|
||||
for (int i = 0; i < numFacets; i++) {
|
||||
if (keyCounter.get() < 3) { // a hard limit on the total number of facets (regardless of depth) to reduce OOM risk
|
||||
|
||||
final String sort = randomSortParam(random());
|
||||
final Integer limit = randomLimitParam(random(), sort);
|
||||
final Integer overrequest = randomOverrequestParam(random());
|
||||
final TermFacet facet = new TermFacet(field((random().nextBoolean()
|
||||
? STR_FIELD_SUFFIXES : INT_FIELD_SUFFIXES),
|
||||
random().nextInt(MAX_FIELD_NUM)),
|
||||
limit, overrequest, sort);
|
||||
results.put("facet_" + keyCounter.incrementAndGet(), facet);
|
||||
if (0 < maxDepth) {
|
||||
// if we're going wide, don't go deep
|
||||
final int nextMaxDepth = Math.max(0, maxDepth - numFacets);
|
||||
facet.subFacets.putAll(buildRandomFacets(keyCounter, TestUtil.nextInt(random(), 0, nextMaxDepth)));
|
||||
}
|
||||
}
|
||||
}
|
||||
return results;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* returns a random SolrClient -- either a CloudSolrClient, or an HttpSolrClient pointed
|
||||
* at a node in our cluster
|
||||
*/
|
||||
public static SolrClient getRandClient(Random rand) {
|
||||
int numClients = CLIENTS.size();
|
||||
int idx = TestUtil.nextInt(rand, 0, numClients);
|
||||
|
||||
return (idx == numClients) ? CLOUD_CLIENT : CLIENTS.get(idx);
|
||||
}
|
||||
|
||||
/**
|
||||
 * Uses a random SolrClient to execute a request and returns only the numFound
|
||||
* @see #getRandClient
|
||||
*/
|
||||
public static long getNumFound(final SolrParams req) throws SolrServerException, IOException {
|
||||
return getRandClient(random()).query(req).getResults().getNumFound();
|
||||
}
|
||||
|
||||
public static void waitForRecoveriesToFinish(CloudSolrClient client) throws Exception {
|
||||
assert null != client.getDefaultCollection();
|
||||
AbstractDistribZkTestBase.waitForRecoveriesToFinish(client.getDefaultCollection(),
|
||||
client.getZkStateReader(),
|
||||
true, true, 330);
|
||||
}
|
||||
|
||||
}
|
|
@ -1015,6 +1015,16 @@ public class QueryEqualityTest extends SolrTestCaseJ4 {
|
|||
"currency(amount,USD)",
|
||||
"currency('amount',USD)");
|
||||
}
|
||||
public void testFuncRelatedness() throws Exception {
|
||||
SolrQueryRequest req = req("fore","foo_s:front", "back","foo_s:back");
|
||||
try {
|
||||
assertFuncEquals(req,
|
||||
"agg_relatedness({!query v='foo_s:front'}, {!query v='foo_s:back'})",
|
||||
"agg_relatedness($fore, $back)");
|
||||
} finally {
|
||||
req.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testTestFuncs() throws Exception {
|
||||
assertFuncEquals("sleep(1,5)", "sleep(1,5)");
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.solr.search.facet;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.function.IntFunction;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.queries.function.ValueSource;
|
||||
|
@ -88,8 +89,8 @@ public class DebugAgg extends AggValueSource {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc, int slot) throws IOException {
|
||||
sub.collect(doc, slot);
|
||||
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
sub.collect(doc, slot, slotContext);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -126,8 +127,8 @@ public class DebugAgg extends AggValueSource {
|
|||
}
|
||||
|
||||
@Override
|
||||
public int collect(DocSet docs, int slot) throws IOException {
|
||||
return sub.collect(docs, slot);
|
||||
public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
return sub.collect(docs, slot, slotContext);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -373,7 +373,10 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
|
|||
NamedList<NamedList> all_facets = (NamedList) queryServer
|
||||
( params( "q", "*:*", "shards", getShardsString(), "rows" , "0", "json.facet",
|
||||
"{ foo : { " + commonJson + " field: foo_s, facet: { " +
|
||||
ALL_STATS_JSON + " bar: { " + commonJson + " field: bar_s, facet: { " + ALL_STATS_JSON + "} } } } }"
|
||||
ALL_STATS_JSON + " bar: { " + commonJson + " field: bar_s, facet: { " + ALL_STATS_JSON +
|
||||
// under bar, in addition to "ALL" simple stats, we also ask for skg...
|
||||
", skg : 'relatedness($skg_fore,$skg_back)' } } } } }",
|
||||
"skg_fore", STAT_FIELD+":[0 TO 40]", "skg_back", STAT_FIELD+":[-10000 TO 10000]"
|
||||
) ).getResponse().get("facets");
|
||||
|
||||
assertNotNull(all_facets);
|
||||
|
@ -411,7 +414,7 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
|
|||
List<NamedList> tail_bar_buckets = (List) ((NamedList)tail_Bucket.get("bar")).get("buckets");
|
||||
|
||||
NamedList tailB_Bucket = tail_bar_buckets.get(0);
|
||||
assertEquals(ALL_STATS.size() + 2, tailB_Bucket.size()); // val,count ... NO SUB FACETS
|
||||
assertEquals(ALL_STATS.size() + 3, tailB_Bucket.size()); // val,count,skg ... NO SUB FACETS
|
||||
assertEquals("tailB", tailB_Bucket.get("val"));
|
||||
assertEquals(17L, tailB_Bucket.get("count"));
|
||||
assertEquals(35L, tailB_Bucket.get("min"));
|
||||
|
@ -423,6 +426,18 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute
|
|||
assertEquals(16910.0D, (double) tailB_Bucket.get("sumsq"), 0.1E-7);
|
||||
// assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
|
||||
assertEquals(1.70782513D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
|
||||
|
||||
// check the SKG stats on our tailB bucket
|
||||
NamedList tailB_skg = (NamedList) tailB_Bucket.get("skg");
|
||||
assertEquals(tailB_skg.toString(),
|
||||
3, tailB_skg.size());
|
||||
assertEquals(0.19990D, tailB_skg.get("relatedness"));
|
||||
assertEquals(0.00334D, tailB_skg.get("foreground_popularity"));
|
||||
assertEquals(0.00334D, tailB_skg.get("background_popularity"));
|
||||
//assertEquals(12L, tailB_skg.get("foreground_count"));
|
||||
//assertEquals(82L, tailB_skg.get("foreground_size"));
|
||||
//assertEquals(12L, tailB_skg.get("background_count"));
|
||||
//assertEquals(3591L, tailB_skg.get("background_size"));
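// (sanity check, not an assertion added by this commit: with the raw shard-level numbers in the
//  commented-out lines above, foreground_popularity == background_popularity == 12 / 3591,
//  which rounds to the 0.00334 value asserted against both stats above)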
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -407,6 +407,75 @@ public class TestJsonFacetRefinement extends SolrTestCaseHS {
|
|||
"}"
|
||||
);
|
||||
|
||||
// test that SKG stat reflects merged refinement
|
||||
client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]",
|
||||
"json.facet", "{"
|
||||
+ " cat0:{ ${terms} type:terms, field: ${cat_s}, "
|
||||
+ " sort:'count desc', limit:1, overrequest:0, refine:true, "
|
||||
+ " facet:{ s:'relatedness($fore,$back)'} } }")
|
||||
, "facets=={ count:8, cat0:{ buckets:[ "
|
||||
+ " { val:A, count:4, "
|
||||
+ " s : { relatedness: 0.00496, "
|
||||
//+ " foreground_count: 3, "
|
||||
//+ " foreground_size: 5, "
|
||||
//+ " background_count: 2, "
|
||||
//+ " background_size: 4, "
|
||||
+ " foreground_popularity: 0.75, "
|
||||
+ " background_popularity: 0.5, "
|
||||
+ " } } ] }" +
|
||||
"}"
|
||||
);
|
||||
|
||||
// SKG under nested facet where some terms only exist on one shard
|
||||
{
|
||||
// sub-bucket order should change as sort direction changes
|
||||
final String jsonFacet = ""
|
||||
+ "{ processEmpty:true, "
|
||||
+ " cat0:{ ${terms} type:terms, field: ${cat_s}, "
|
||||
+ " sort:'count desc', limit:1, overrequest:0, refine:true, "
|
||||
+ " facet:{ processEmpty:true, "
|
||||
+ " qw1: { ${terms} type:terms, field: ${qw_s}, mincount:0, "
|
||||
+ " sort:'${skg_sort}', limit:100, overrequest:0, refine:true, "
|
||||
+ " facet:{ processEmpty:true, skg:'relatedness($fore,$back)' } } } } }";
|
||||
final String bucketQ = ""
|
||||
+ " { val:Q, count:1, "
|
||||
+ " skg : { relatedness: 1.0, "
|
||||
+ " foreground_popularity: 0.25, "
|
||||
+ " background_popularity: 0.0, "
|
||||
// + " foreground_count: 1, "
|
||||
// + " foreground_size: 3, "
|
||||
// + " background_count: 0, "
|
||||
// + " background_size: 4, "
|
||||
+ " } },";
|
||||
final String bucketW = ""
|
||||
+ " { val:W, count:1, "
|
||||
+ " skg : { relatedness: 0.0037, "
|
||||
+ " foreground_popularity: 0.25, "
|
||||
+ " background_popularity: 0.25, "
|
||||
// + " foreground_count: 1, "
|
||||
// + " foreground_size: 3, "
|
||||
// + " background_count: 1, "
|
||||
// + " background_size: 4, "
|
||||
+ " } },";
|
||||
|
||||
client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]",
|
||||
"skg_sort", "skg desc", "json.facet", jsonFacet)
|
||||
, "facets=={ count:8, cat0:{ buckets:[ "
|
||||
+ " { val:A, count:4, "
|
||||
+ " qw1 : { buckets:["
|
||||
+ bucketQ
|
||||
+ bucketW
|
||||
+ " ] } } ] } }");
|
||||
client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]",
|
||||
"skg_sort", "skg asc", "json.facet", jsonFacet)
|
||||
, "facets=={ count:8, cat0:{ buckets:[ "
|
||||
+ " { val:A, count:4, "
|
||||
+ " qw1 : { buckets:["
|
||||
+ bucketW
|
||||
+ bucketQ
|
||||
+ " ] } } ] } }");
|
||||
}
|
||||
|
||||
// test partial buckets (field facet within field facet)
|
||||
client.testJQ(params(p, "q", "*:*",
|
||||
"json.facet", "{" +
|
||||
|
|
|
@ -212,6 +212,245 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
|||
client.commit();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testExplicitQueryDomain() throws Exception {
|
||||
Client client = Client.localClient();
|
||||
indexSimple(client);
|
||||
|
||||
{ // simple 'query' domain
|
||||
|
||||
// the facet buckets for all of the requests below should be identical
|
||||
// only the numFound & top level facet count should differ
|
||||
final String expectedFacets
|
||||
= "facets/w=={ buckets:["
|
||||
+ " { val:'NJ', count:2}, "
|
||||
+ " { val:'NY', count:1} ] }";
|
||||
|
||||
assertJQ(req("rows", "0", "q", "cat_s:B", "json.facet",
|
||||
"{w: {type:terms, field:'where_s'}}"),
|
||||
"response/numFound==3",
|
||||
"facets/count==3",
|
||||
expectedFacets);
|
||||
assertJQ(req("rows", "0", "q", "id:3", "json.facet",
|
||||
"{w: {type:terms, field:'where_s', domain: { query:'cat_s:B' }}}"),
|
||||
"response/numFound==1",
|
||||
"facets/count==1",
|
||||
expectedFacets);
|
||||
assertJQ(req("rows", "0", "q", "*:*", "fq", "-*:*", "json.facet",
|
||||
"{w: {type:terms, field:'where_s', domain: { query:'cat_s:B' }}}"),
|
||||
"response/numFound==0",
|
||||
"facets/count==0",
|
||||
expectedFacets);
|
||||
assertJQ(req("rows", "0", "q", "*:*", "fq", "-*:*", "domain_q", "cat_s:B", "json.facet",
|
||||
"{w: {type:terms, field:'where_s', domain: { query:{param:domain_q} }}}"),
|
||||
"response/numFound==0",
|
||||
"facets/count==0",
|
||||
expectedFacets);
|
||||
}
|
||||
|
||||
{ // a nested explicit query domain
|
||||
|
||||
// for all of the "top" buckets, the subfacet should have identical sub-buckets
|
||||
final String expectedSubBuckets = "{ buckets:[ { val:'B', count:3}, { val:'A', count:2} ] }";
|
||||
assertJQ(req("rows", "0", "q", "num_i:[0 TO *]", "json.facet",
|
||||
"{w: {type:terms, field:'where_s', " +
|
||||
" facet: { c: { type:terms, field:'cat_s', domain: { query:'*:*' }}}}}")
|
||||
, "facets/w=={ buckets:["
|
||||
+ " { val:'NJ', count:2, c: " + expectedSubBuckets + "}, "
|
||||
+ " { val:'NY', count:1, c: " + expectedSubBuckets + "} "
|
||||
+ "] }"
|
||||
);
|
||||
}
|
||||
|
||||
{ // an (effectively) empty query should produce an error
|
||||
ignoreException("'query' domain can not be null");
|
||||
ignoreException("'query' domain must not evaluate to an empty list");
|
||||
for (String raw : Arrays.asList("null", "[ ]", "{param:bogus}")) {
|
||||
expectThrows(SolrException.class, () -> {
|
||||
assertJQ(req("rows", "0", "q", "num_i:[0 TO *]", "json.facet",
|
||||
"{w: {type:terms, field:'where_s', " +
|
||||
" facet: { c: { type:terms, field:'cat_s', domain: { query: "+raw+" }}}}}"));
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testSimpleSKG() throws Exception {
|
||||
Client client = Client.localClient();
|
||||
indexSimple(client);
|
||||
|
||||
// using relatedness() as a top level stat, not nested under any facet
|
||||
// (not particularly useful, but shouldn't error either)
|
||||
assertJQ(req("q", "cat_s:[* TO *]", "rows", "0",
|
||||
"fore", "where_s:NY", "back", "*:*",
|
||||
"json.facet", " { skg: 'relatedness($fore,$back)' }")
|
||||
, "facets=={"
|
||||
+ " count:5, "
|
||||
+ " skg : { relatedness: 0.00699,"
|
||||
+ " foreground_popularity: 0.33333,"
|
||||
+ " background_popularity: 0.83333,"
|
||||
+ " } }"
|
||||
);
|
||||
|
||||
// simple single level facet w/skg stat & sorting
|
||||
for (String sort : Arrays.asList("index asc", "skg desc")) {
|
||||
// the relatedness score of each of our cat_s values is (conveniently) also in alphabetical order
|
||||
// so both of these sort options should produce identical output
|
||||
// and testing "index" sort allows the randomized use of the "stream" processor as default to be tested
|
||||
assertJQ(req("q", "cat_s:[* TO *]", "rows", "0",
|
||||
"fore", "where_s:NY", "back", "*:*",
|
||||
"json.facet", ""
|
||||
+ "{x: { type: terms, field: 'cat_s', sort: '"+sort+"', "
|
||||
+ " facet: { skg: 'relatedness($fore,$back)' } } }")
|
||||
, "facets=={count:5, x:{ buckets:["
|
||||
+ " { val:'A', count:2, "
|
||||
+ " skg : { relatedness: 0.00554, "
|
||||
//+ " foreground_count: 1, "
|
||||
//+ " foreground_size: 2, "
|
||||
//+ " background_count: 2, "
|
||||
//+ " background_size: 6,"
|
||||
+ " foreground_popularity: 0.16667,"
|
||||
+ " background_popularity: 0.33333, },"
|
||||
+ " }, "
|
||||
+ " { val:'B', count:3, "
|
||||
+ " skg : { relatedness: 0.0, " // perfectly average and uncorrolated
|
||||
//+ " foreground_count: 1, "
|
||||
//+ " foreground_size: 2, "
|
||||
//+ " background_count: 3, "
|
||||
//+ " background_size: 6,"
|
||||
+ " foreground_popularity: 0.16667,"
|
||||
+ " background_popularity: 0.5 },"
|
||||
+ " } ] } } "
|
||||
);
|
||||
}
|
||||
|
||||
// SKG used in multiple nested facets
|
||||
//
|
||||
// we'll re-use these params in 2 requests, one will simulate a shard request
|
||||
final SolrParams nestedSKG = params
|
||||
("q", "cat_s:[* TO *]", "rows", "0", "fore", "num_i:[-1000 TO 0]", "back", "*:*", "json.facet"
|
||||
, "{x: { type: terms, field: 'cat_s', sort: 'skg desc', "
|
||||
+ " facet: { skg: 'relatedness($fore,$back)', "
|
||||
+ " y: { type: terms, field: 'where_s', sort: 'skg desc', "
|
||||
+ " facet: { skg: 'relatedness($fore,$back)' } } } } }");
|
||||
|
||||
// plain old request
|
||||
assertJQ(req(nestedSKG)
|
||||
, "facets=={count:5, x:{ buckets:["
|
||||
+ " { val:'B', count:3, "
|
||||
+ " skg : { relatedness: 0.01539, "
|
||||
//+ " foreground_count: 2, "
|
||||
//+ " foreground_size: 2, "
|
||||
//+ " background_count: 3, "
|
||||
//+ " background_size: 6, "
|
||||
+ " foreground_popularity: 0.33333,"
|
||||
+ " background_popularity: 0.5 },"
|
||||
+ " y : { buckets:["
|
||||
+ " { val:'NY', count: 1, "
|
||||
+ " skg : { relatedness: 0.00554, "
|
||||
//+ " foreground_count: 1, "
|
||||
//+ " foreground_size: 2, "
|
||||
//+ " background_count: 2, "
|
||||
//+ " background_size: 6, "
|
||||
+ " foreground_popularity: 0.16667, "
|
||||
+ " background_popularity: 0.33333, "
|
||||
+ " } }, "
|
||||
+ " { val:'NJ', count: 2, "
|
||||
+ " skg : { relatedness: 0.0, " // perfectly average and uncorrolated
|
||||
//+ " foreground_count: 1, "
|
||||
//+ " foreground_size: 2, "
|
||||
//+ " background_count: 3, "
|
||||
//+ " background_size: 6, "
|
||||
+ " foreground_popularity: 0.16667, "
|
||||
+ " background_popularity: 0.5, "
|
||||
+ " } }, "
|
||||
+ " ] } "
|
||||
+ " }, "
|
||||
+ " { val:'A', count:2, "
|
||||
+ " skg : { relatedness:-0.01097, "
|
||||
//+ " foreground_count: 0, "
|
||||
//+ " foreground_size: 2, "
|
||||
//+ " background_count: 2, "
|
||||
//+ " background_size: 6,"
|
||||
+ " foreground_popularity: 0.0,"
|
||||
+ " background_popularity: 0.33333 },"
|
||||
+ " y : { buckets:["
|
||||
+ " { val:'NJ', count: 1, "
|
||||
+ " skg : { relatedness: 0.0, " // perfectly average and uncorrolated
|
||||
//+ " foreground_count: 0, "
|
||||
//+ " foreground_size: 0, "
|
||||
//+ " background_count: 3, "
|
||||
//+ " background_size: 6, "
|
||||
+ " foreground_popularity: 0.0, "
|
||||
+ " background_popularity: 0.5, "
|
||||
+ " } }, "
|
||||
+ " { val:'NY', count: 1, "
|
||||
+ " skg : { relatedness: 0.0, " // perfectly average and uncorrolated
|
||||
//+ " foreground_count: 0, "
|
||||
//+ " foreground_size: 0, "
|
||||
//+ " background_count: 2, "
|
||||
//+ " background_size: 6, "
|
||||
+ " foreground_popularity: 0.0, "
|
||||
+ " background_popularity: 0.33333, "
|
||||
+ " } }, "
|
||||
+ " ] } } ] } } ");
|
||||
|
||||
// same request, but with whitebox params testing isShard
|
||||
// to verify the raw counts/sizes
|
||||
assertJQ(req(nestedSKG,
|
||||
// fake an initial shard request
|
||||
"distrib", "false", "isShard", "true", "_facet_", "{}", "shards.purpose", "2097216")
|
||||
, "facets=={count:5, x:{ buckets:["
|
||||
+ " { val:'B', count:3, "
|
||||
+ " skg : { "
|
||||
+ " foreground_count: 2, "
|
||||
+ " foreground_size: 2, "
|
||||
+ " background_count: 3, "
|
||||
+ " background_size: 6 }, "
|
||||
+ " y : { buckets:["
|
||||
+ " { val:'NY', count: 1, "
|
||||
+ " skg : { "
|
||||
+ " foreground_count: 1, "
|
||||
+ " foreground_size: 2, "
|
||||
+ " background_count: 2, "
|
||||
+ " background_size: 6, "
|
||||
+ " } }, "
|
||||
+ " { val:'NJ', count: 2, "
|
||||
+ " skg : { "
|
||||
+ " foreground_count: 1, "
|
||||
+ " foreground_size: 2, "
|
||||
+ " background_count: 3, "
|
||||
+ " background_size: 6, "
|
||||
+ " } }, "
|
||||
+ " ] } "
|
||||
+ " }, "
|
||||
+ " { val:'A', count:2, "
|
||||
+ " skg : { "
|
||||
+ " foreground_count: 0, "
|
||||
+ " foreground_size: 2, "
|
||||
+ " background_count: 2, "
|
||||
+ " background_size: 6 },"
|
||||
+ " y : { buckets:["
|
||||
+ " { val:'NJ', count: 1, "
|
||||
+ " skg : { "
|
||||
+ " foreground_count: 0, "
|
||||
+ " foreground_size: 0, "
|
||||
+ " background_count: 3, "
|
||||
+ " background_size: 6, "
|
||||
+ " } }, "
|
||||
+ " { val:'NY', count: 1, "
|
||||
+ " skg : { "
|
||||
+ " foreground_count: 0, "
|
||||
+ " foreground_size: 0, "
|
||||
+ " background_count: 2, "
|
||||
+ " background_size: 6, "
|
||||
+ " } }, "
|
||||
+ " ] } } ] } } ");
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRepeatedNumerics() throws Exception {
|
||||
Client client = Client.localClient();
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
= JSON Facet API
|
||||
:page-tocclass: right
|
||||
|
||||
[[JSONFacetAPI]]
|
||||
== Facet & Analytics Module
|
||||
|
@ -338,17 +339,18 @@ Aggregation functions, also called *facet functions, analytic functions,* or **m
|
|||
[width="100%",cols="10%,30%,60%",options="header",]
|
||||
|===
|
||||
|Aggregation |Example |Description
|
||||
|sum |sum(sales) |summation of numeric values
|
||||
|avg |avg(popularity) |average of numeric values
|
||||
|min |min(salary) |minimum value
|
||||
|max |max(mul(price,popularity)) |maximum value
|
||||
|unique |unique(author) |number of unique values of the given field. Beyond 100 values it yields not exact estimate
|
||||
|uniqueBlock |uniqueBlock(\_root_) |same as above with smaller footprint strictly requires <<uploading-data-with-index-handlers.adoc#nested-child-documents, block index>>. The given field is expected to be unique across blocks, now only singlevalued string fields are supported, docValues are recommended.
|
||||
|hll |hll(author) |distributed cardinality estimate via hyper-log-log algorithm
|
||||
|percentile |percentile(salary,50,75,99,99.9) |Percentile estimates via t-digest algorithm. When sorting by this metric, the first percentile listed is used as the sort value.
|
||||
|sumsq |sumsq(rent) |sum of squares of field or function
|
||||
|variance |variance(rent) |variance of numeric field or function
|
||||
|stddev |stddev(rent) |standard deviation of field or function
|
||||
|sum |`sum(sales)` |summation of numeric values
|
||||
|avg |`avg(popularity)` |average of numeric values
|
||||
|min |`min(salary)` |minimum value
|
||||
|max |`max(mul(price,popularity))` |maximum value
|
||||
|unique |`unique(author)` |number of unique values of the given field. Beyond 100 values it yields an estimate that is not exact
|
||||
|uniqueBlock |`uniqueBlock(\_root_)` |same as above with a smaller footprint; strictly requires a <<uploading-data-with-index-handlers.adoc#nested-child-documents, block index>>. The given field is expected to be unique across blocks; currently only single-valued string fields are supported, and docValues are recommended.
|
||||
|hll |`hll(author)` |distributed cardinality estimate via hyper-log-log algorithm
|
||||
|percentile |`percentile(salary,50,75,99,99.9)` |Percentile estimates via t-digest algorithm. When sorting by this metric, the first percentile listed is used as the sort value.
|
||||
|sumsq |`sumsq(rent)` |sum of squares of field or function
|
||||
|variance |`variance(rent)` |variance of numeric field or function
|
||||
|stddev |`stddev(rent)` |standard deviation of field or function
|
||||
|relatedness |`relatedness('popularity:[100 TO \*]','inStock:true')`|A function for computing a relatedness score of the documents in the domain to a Foreground set, relative to a Background set (both defined as queries). This is primarily for use when building <<Semantic Knowledge Graphs>>.
|
||||
|===
|
||||
|
||||
Numeric aggregation functions such as `avg` can be on any numeric field, or on another function of multiple numeric fields such as `avg(mul(price,popularity))`.
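For illustration only (this sketch is not part of the commit's documentation changes), such a compound function can be requested like any other aggregation, here using the `price` and `popularity` fields from the table above:

[source,bash]
----
curl -sS -X POST http://localhost:8983/solr/gettingstarted/query -d 'rows=0&q=*:*
 &json.facet={
   weighted_avg : "avg(mul(price,popularity))"
 }'
----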
|
||||
|
@ -514,6 +516,126 @@ Aggregation `uniqueBlock(\_root_)` is functionally equivalent to `unique(\_root_
|
|||
It's recommended to define `limit: -1` for `uniqueBlock` calculation, like in above example,
|
||||
since default value of `limit` parameter is `10`, while `uniqueBlock` is supposed to be much faster with `-1`.
|
||||
|
||||
== Semantic Knowledge Graphs
|
||||
|
||||
The `relatedness(...)` aggregation function allows sets of documents to be scored relative to Foreground and Background sets of documents, for the purposes of finding ad-hoc relationships that make up a "Semantic Knowledge Graph":
|
||||
|
||||
[quote, Grainger et al., 'https://arxiv.org/abs/1609.00464[The Semantic Knowledge Graph]']
|
||||
____
|
||||
At its heart, the Semantic Knowledge Graph leverages an inverted index, along with a complementary uninverted index, to represent nodes (terms) and edges (the documents within intersecting postings lists for multiple terms/nodes). This provides a layer of indirection between each pair of nodes and their corresponding edge, enabling edges to materialize dynamically from underlying corpus statistics. As a result, any combination of nodes can have edges to any other nodes materialize and be scored to reveal latent relationships between the nodes.
|
||||
____
|
||||
|
||||
The `relatedness(...)` function is used to "score" these relationships, relative to "Foreground" and "Background" sets of documents, specified in the function params as queries.
|
||||
|
||||
Unlike most aggregation functions, the `relatedness(...)` function is aware of whether and how it is used in <<NestedFacets,Nested Facets>>. It evaluates the query defining the current bucket _independently_ from its parent/ancestor buckets, and intersects those documents with a "Foreground Set" defined by the foreground query _combined with the ancestor buckets_. The result is then compared to a similar intersection done against the "Background Set" (defined exclusively by the background query) to see if there is a positive, or negative, correlation between the current bucket and the Foreground Set, relative to the Background Set.
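The `foreground_popularity` and `background_popularity` values reported alongside each `relatedness` score are simple ratios against the size of the Background Set. (This summary is inferred from the verification logic in the accompanying `TestCloudJSONFacetSKG` test in this commit, not from separate documentation.)

----
foreground_popularity = docs(bucket AND foreground query [AND ancestor buckets]) / docs(background query)
background_popularity = docs(bucket AND background query) / docs(background query)
----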
|
||||
|
||||
=== Semantic Knowledge Graph Example
.Sample Documents
[source,bash,subs="verbatim,callouts"]
----
curl -sS -X POST 'http://localhost:8983/solr/gettingstarted/update?commit=true' -d '[
{"id":"01",age:15,"state":"AZ","hobbies":["soccer","painting","cycling"]},
{"id":"02",age:22,"state":"AZ","hobbies":["swimming","darts","cycling"]},
{"id":"03",age:27,"state":"AZ","hobbies":["swimming","frisbee","painting"]},
{"id":"04",age:33,"state":"AZ","hobbies":["darts"]},
{"id":"05",age:42,"state":"AZ","hobbies":["swimming","golf","painting"]},
{"id":"06",age:54,"state":"AZ","hobbies":["swimming","golf"]},
{"id":"07",age:67,"state":"AZ","hobbies":["golf","painting"]},
{"id":"08",age:71,"state":"AZ","hobbies":["painting"]},
{"id":"09",age:14,"state":"CO","hobbies":["soccer","frisbee","skiing","swimming","skating"]},
{"id":"10",age:23,"state":"CO","hobbies":["skiing","darts","cycling","swimming"]},
{"id":"11",age:26,"state":"CO","hobbies":["skiing","golf"]},
{"id":"12",age:35,"state":"CO","hobbies":["golf","frisbee","painting","skiing"]},
{"id":"13",age:47,"state":"CO","hobbies":["skiing","darts","painting","skating"]},
{"id":"14",age:51,"state":"CO","hobbies":["skiing","golf"]},
{"id":"15",age:64,"state":"CO","hobbies":["skating","cycling"]},
{"id":"16",age:73,"state":"CO","hobbies":["painting"]},
]'
----
.Example Query
[source,bash,subs="verbatim,callouts"]
----
curl -sS -X POST http://localhost:8983/solr/gettingstarted/query -d 'rows=0&q=*:*
&back=*:*                                  # <1>
&fore=age:[35 TO *]                        # <2>
&json.facet={
  hobby : {
    type : terms,
    field : hobbies,
    limit : 5,
    sort : { r1: desc },                   # <3>
    facet : {
      r1 : "relatedness($fore,$back)",     # <4>
      location : {
        type : terms,
        field : state,
        limit : 2,
        sort : { r2: desc },               # <3>
        facet : {
          r2 : "relatedness($fore,$back)"  # <4>
        }
      }
    }
  }
}'
----
<1> Use the entire collection as our "Background Set"
<2> Use a query for "age >= 35" to define our (initial) "Foreground Set"
<3> For both the top level `hobbies` facet and the sub-facet on `state`, we will be sorting on the `relatedness(...)` values
<4> In both calls to the `relatedness(...)` function, we use <<local-parameters-in-queries.adoc#parameter-dereferencing,Parameter Variables>> to refer to the previously defined `fore` and `back` queries.
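Note that parameter dereferencing is a convenience here, not a requirement: the foreground and background queries can also be given inline as literal query strings, e.g. `relatedness('age:[35 TO \*]','\*:*')`, as in the syntax shown in the aggregation functions table above.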
.The Facet Response
[source,javascript,subs="verbatim,callouts"]
----
"facets":{
  "count":16,
  "hobby":{
    "buckets":[{
        "val":"golf",
        "count":6,                               // <1>
        "r1":{
          "relatedness":0.01225,
          "foreground_popularity":0.3125,        // <2>
          "background_popularity":0.375},        // <3>
        "location":{
          "buckets":[{
              "val":"az",
              "count":3,
              "r2":{
                "relatedness":0.00496,           // <4>
                "foreground_popularity":0.1875,  // <6>
                "background_popularity":0.5}},   // <7>
            {
              "val":"co",
              "count":3,
              "r2":{
                "relatedness":-0.00496,          // <5>
                "foreground_popularity":0.125,
                "background_popularity":0.5}}]}},
      {
        "val":"painting",
        "count":8,                               // <1>
        "r1":{
          "relatedness":0.01097,
          "foreground_popularity":0.375,
          "background_popularity":0.5},
        "location":{
          "buckets":[{
            ...
----
<1> Even though `hobbies:golf` has a lower total facet `count` than `hobbies:painting`, it has a higher `relatedness` score, indicating that, relative to the Background Set (the entire collection), Golf has a stronger correlation to our Foreground Set (people age 35+) than Painting.
<2> The number of documents matching `age:[35 TO *]` _and_ `hobbies:golf` is 31.25% of the total number of documents in the Background Set (see the worked arithmetic below)
<3> 37.5% of the documents in the Background Set match `hobbies:golf`
<4> The state of Arizona (AZ) has a _positive_ relatedness correlation with the _nested_ Foreground Set (people ages 35+ who play Golf) compared to the Background Set, i.e., "People in Arizona are statistically more likely to be '35+ year old Golfers' than the country as a whole."
<5> The state of Colorado (CO) has a _negative_ correlation with the nested Foreground Set, i.e., "People in Colorado are statistically less likely to be '35+ year old Golfers' than the country as a whole."
<6> The number of documents matching `age:[35 TO *]` _and_ `hobbies:golf` _and_ `state:AZ` is 18.75% of the total number of documents in the Background Set
<7> 50% of the documents in the Background Set match `state:AZ`
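To make those percentages concrete using the sample documents indexed above: 5 of the 16 documents (ids 05, 06, 07, 12, and 14) match both `age:[35 TO *]` and `hobbies:golf`, giving `foreground_popularity` = 5/16 = 0.3125, while 6 of the 16 documents (those five plus id 11) match `hobbies:golf`, giving `background_popularity` = 6/16 = 0.375. Loosely speaking, the `relatedness` score for the `golf` bucket is positive because those 5 documents are a larger share of the 8 document Foreground Set (5/8 = 0.625) than Golf's share of the Background Set (0.375), even though `foreground_popularity` is numerically smaller than `background_popularity`.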
NOTE: While it's very common to define the Background Set as `\*:*`, or some other super-set of the Foreground Query, it is not strictly required. The `relatedness(...)` function can be used to compare the statistical relatedness of sets of documents to orthogonal foreground/background queries.
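For instance, against the sample documents above one could hypothetically score how related each `hobbies` term is to a Foreground Set of Colorado residents (`fore=state:CO`) relative to a Background Set of people aged 30 or younger (`back=age:[* TO 30]`), even though neither query is a subset of the other.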
[[References]]
== References