diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1a433289532..38cd54529cc 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -116,6 +116,9 @@ New Features * SOLR-11277: Add auto hard-commit settings based on tlog size (Rupa Shankar, Anshum Gupta) +* SOLR-9480: A new 'relatedness()' aggregate function for JSON Faceting to enable building Semantic + Knowledge Graphs. (Trey Grainger, hossman) + Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java index 683cf4a6605..b7c68157a0b 100644 --- a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java +++ b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java @@ -63,6 +63,7 @@ import org.apache.solr.search.facet.PercentileAgg; import org.apache.solr.search.facet.StddevAgg; import org.apache.solr.search.facet.SumAgg; import org.apache.solr.search.facet.SumsqAgg; +import org.apache.solr.search.facet.RelatednessAgg; import org.apache.solr.search.facet.UniqueAgg; import org.apache.solr.search.facet.UniqueBlockAgg; import org.apache.solr.search.facet.VarianceAgg; @@ -1039,6 +1040,17 @@ public abstract class ValueSourceParser implements NamedListInitializedPlugin { addParser("agg_percentile", new PercentileAgg.Parser()); + addParser("agg_" + RelatednessAgg.NAME, new ValueSourceParser() { + @Override + public ValueSource parse(FunctionQParser fp) throws SyntaxError { + // TODO: (fore & back)-ground should be optional -- use hasMoreArguments + // if only one arg, assume it's the foreground + // (background is the one that will most commonly just be "*:*") + // see notes in RelatednessAgg constructor about why we don't do this yet + return new RelatednessAgg(fp.parseNestedQuery(), fp.parseNestedQuery()); + } + }); + addParser("childfield", new ChildFieldValueSourceParser()); } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetField.java b/solr/core/src/java/org/apache/solr/search/facet/FacetField.java index 1a595491a3a..4de04110f44 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetField.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetField.java @@ -42,7 +42,7 @@ abstract class FacetRequestSorted extends FacetRequest { @Override public boolean returnsPartial() { - return limit > 0; + return super.returnsPartial() || (limit > 0); } } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java index f872db35e24..727659367c7 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java @@ -36,6 +36,7 @@ import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.DocSet; +import org.apache.solr.search.facet.SlotAcc.SlotContext; import static org.apache.solr.search.facet.FacetContext.SKIP_FACET; @@ -226,23 +227,23 @@ abstract class FacetFieldProcessor extends FacetProcessor { } } - int collectFirstPhase(DocSet docs, int slot) throws IOException { + int collectFirstPhase(DocSet docs, int slot, IntFunction slotContext) throws IOException { int num = -1; if (collectAcc != null) { - num = collectAcc.collect(docs, slot); + num = collectAcc.collect(docs, slot, slotContext); } if (allBucketsAcc != null) { - num = allBucketsAcc.collect(docs, slot); + num = allBucketsAcc.collect(docs, slot, slotContext); } return num >= 0 ? num : docs.size(); } - void collectFirstPhase(int segDoc, int slot) throws IOException { + void collectFirstPhase(int segDoc, int slot, IntFunction slotContext) throws IOException { if (collectAcc != null) { - collectAcc.collect(segDoc, slot); + collectAcc.collect(segDoc, slot, slotContext); } if (allBucketsAcc != null) { - allBucketsAcc.collect(segDoc, slot); + allBucketsAcc.collect(segDoc, slot, slotContext); } } @@ -373,7 +374,7 @@ abstract class FacetFieldProcessor extends FacetProcessor { Comparable val = bucketValFromSlotNumFunc.apply(slotNum); bucket.add("val", val); - Query filter = needFilter ? sf.getType().getFieldQuery(null, sf, fieldQueryValFunc.apply(val)) : null; + Query filter = needFilter ? makeBucketQuery(fieldQueryValFunc.apply(val)) : null; fillBucket(bucket, countAcc.getCount(slotNum), slotNum, null, filter); @@ -388,6 +389,15 @@ abstract class FacetFieldProcessor extends FacetProcessor { return res; } + /** + * Trivial helper method for building up a bucket query given the (Stringified) bucket value + */ + protected Query makeBucketQuery(final String bucketValue) { + // TODO: this isn't viable for things like text fields w/ analyzers that are non-idempotent (ie: stemmers) + // TODO: but changing it to just use TermQuery isn't safe for things like numerics, dates, etc... + return sf.getType().getFieldQuery(null, sf, bucketValue); + } + private void calculateNumBuckets(SimpleOrderedMap target) throws IOException { DocSet domain = fcontext.base; if (freq.prefix != null) { @@ -397,7 +407,7 @@ abstract class FacetFieldProcessor extends FacetProcessor { HLLAgg agg = new HLLAgg(freq.field); SlotAcc acc = agg.createSlotAcc(fcontext, domain.size(), 1); - acc.collect(domain, 0); + acc.collect(domain, 0, null); // we know HLL doesn't care about the bucket query acc.key = "numBuckets"; acc.setValues(target, 0); } @@ -433,7 +443,7 @@ abstract class FacetFieldProcessor extends FacetProcessor { // do acc at a time (traversing domain each time) or do all accs for each doc? for (SlotAcc acc : otherAccs) { acc.reset(); // TODO: only needed if we previously used for allBuckets or missing - acc.collect(subDomain, 0); + acc.collect(subDomain, 0, slot -> { return new SlotContext(filter); }); acc.setValues(target, 0); } } @@ -442,13 +452,14 @@ abstract class FacetFieldProcessor extends FacetProcessor { } @Override - protected void processStats(SimpleOrderedMap bucket, DocSet docs, int docCount) throws IOException { + protected void processStats(SimpleOrderedMap bucket, Query bucketQ, DocSet docs, int docCount) throws IOException { if (docCount == 0 && !freq.processEmpty || freq.getFacetStats().size() == 0) { bucket.add("count", docCount); return; } createAccs(docCount, 1); - int collected = collect(docs, 0); + assert null != bucketQ; + int collected = collect(docs, 0, slotNum -> { return new SlotContext(bucketQ); }); // countAcc.incrementCount(0, collected); // should we set the counton the acc instead of just passing it? @@ -499,9 +510,9 @@ abstract class FacetFieldProcessor extends FacetProcessor { } @Override - public void collect(int doc, int slot) throws IOException { + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { for (SlotAcc acc : subAccs) { - acc.collect(doc, slot); + acc.collect(doc, slot, slotContext); } } @@ -561,15 +572,15 @@ abstract class FacetFieldProcessor extends FacetProcessor { } @Override - public void collect(int doc, int slot) throws IOException { + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { assert slot != collectAccSlot || slot < 0; count++; if (collectAcc != null) { - collectAcc.collect(doc, collectAccSlot); + collectAcc.collect(doc, collectAccSlot, slotContext); } if (otherAccs != null) { for (SlotAcc otherAcc : otherAccs) { - otherAcc.collect(doc, otherAccsSlot); + otherAcc.collect(doc, otherAccsSlot, slotContext); } } } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java index 43bafd46ca2..e5ee181b5cb 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java @@ -19,11 +19,14 @@ package org.apache.solr.search.facet; import java.io.IOException; import java.util.Date; +import java.util.function.IntFunction; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.search.Query; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.schema.SchemaField; +import org.apache.solr.search.facet.SlotAcc.SlotContext; import static org.apache.solr.search.facet.FacetContext.SKIP_FACET; @@ -116,8 +119,28 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor { throw new RuntimeException(e); } }, - obj -> obj instanceof Date ? ((Date)obj).toInstant().toString() : obj.toString() + obj -> valueObjToString(obj) ); } + private static String valueObjToString(Object obj) { + return (obj instanceof Date) ? ((Date)obj).toInstant().toString() : obj.toString(); + } + + + /** + * SlotContext to use during all {@link SlotAcc} collection. + * + * @see #lookupOrd + */ + public IntFunction slotContext = (slotNum) -> { + try { + Object value = sf.getType().toObject(sf, lookupOrd(slotNum)); + Query q = makeBucketQuery(valueObjToString(value)); + assert null != q : "null query for: '" + value + "'"; + return new SlotContext(q); + } catch (IOException e) { + throw new RuntimeException(e); + } + }; } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java index fe7a3f23937..4a0d13e2fa3 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java @@ -327,10 +327,10 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { if (arrIdx >= 0 && arrIdx < nTerms) { countAcc.incrementCount(arrIdx, 1); if (collectAcc != null) { - collectAcc.collect(doc, arrIdx); + collectAcc.collect(doc, arrIdx, slotContext); } if (allBucketsAcc != null) { - allBucketsAcc.collect(doc, arrIdx); + allBucketsAcc.collect(doc, arrIdx, slotContext); } } } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByEnumTermsStream.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByEnumTermsStream.java index bbc2973c8af..f939bba52ff 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByEnumTermsStream.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByEnumTermsStream.java @@ -21,6 +21,7 @@ import java.io.Closeable; import java.io.IOException; import java.util.Iterator; import java.util.List; +import java.util.function.IntFunction; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiPostingsEnum; @@ -40,6 +41,7 @@ import org.apache.solr.search.DocSet; import org.apache.solr.search.HashDocSet; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SortedIntDocSet; +import org.apache.solr.search.facet.SlotAcc.SlotContext; /** * Enumerates indexed terms in order in a streaming fashion. @@ -60,6 +62,13 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement PostingsEnum postingsEnum; BytesRef startTermBytes; BytesRef term; + + // at any point in processing where we need a SlotContext, all we care about is the current 'term' + private IntFunction slotContext = (slotNum) -> { + assert null != term; + return new SlotAcc.SlotContext(new TermQuery(new Term(sf.getName(), term))); + }; + LeafReaderContext[] leaves; FacetFieldProcessorByEnumTermsStream(FacetContext fcontext, FacetField freq, SchemaField sf) { @@ -195,7 +204,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement private SimpleOrderedMap _nextBucket() throws IOException { DocSet termSet = null; - + try { while (term != null) { @@ -241,7 +250,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement resetStats(); if (!countOnly) { - collect(termSet, 0); + collect(termSet, 0, slotContext); } } else { @@ -278,7 +287,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) { c++; - collect(docid, 0); + collect(docid, 0, slotContext); } } } @@ -295,7 +304,7 @@ class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implement while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) { c++; - collect(docid, 0); + collect(docid, 0, slotContext); } } } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByHashDV.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByHashDV.java index 0cafd3afc5d..4c0b244c1dd 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByHashDV.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByHashDV.java @@ -37,6 +37,7 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.DocSetUtil; +import org.apache.solr.search.facet.SlotAcc.SlotContext; /** * Facets numbers into a hash table. The number is either a raw numeric DocValues value, or @@ -260,7 +261,7 @@ class FacetFieldProcessorByHashDV extends FacetFieldProcessor { indexOrderAcc = new SlotAcc(fcontext) { @Override - public void collect(int doc, int slot) throws IOException { + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { } @Override @@ -306,7 +307,7 @@ class FacetFieldProcessorByHashDV extends FacetFieldProcessor { } @Override - public void collect(int doc, int slot) throws IOException { + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { throw new UnsupportedOperationException(); } @@ -429,7 +430,10 @@ class FacetFieldProcessorByHashDV extends FacetFieldProcessor { // Our countAcc is virtual, so this is not needed: // countAcc.incrementCount(slot, 1); - super.collectFirstPhase(segDoc, slot); + super.collectFirstPhase(segDoc, slot, slotNum -> { + Comparable value = calc.bitsToValue(val); + return new SlotContext(sf.getType().getFieldQuery(null, sf, calc.formatValue(value))); + }); } private void doRehash(LongCounts table) { diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetMerger.java b/solr/core/src/java/org/apache/solr/search/facet/FacetMerger.java index 9499d2caf25..84ffbf09ad9 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetMerger.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetMerger.java @@ -120,8 +120,10 @@ public abstract class FacetMerger { subs = null; for (Map.Entry entry : freq.subFacets.entrySet()) { - Collection childSubs = getSubsWithPartial(entry.getValue()); - if (childSubs.size() > 0 || entry.getValue().returnsPartial()) { + final FacetRequest entryVal = entry.getValue(); + Collection childSubs = getSubsWithPartial(entryVal); + // TODO: should returnsPartial() check processEmpty internally? + if (childSubs.size() > 0 || entryVal.returnsPartial() || entryVal.processEmpty) { if (subs == null) { subs = new ArrayList<>(freq.getSubFacets().size()); } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java index 27cdaec1ec2..c0625df70e8 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java @@ -24,6 +24,7 @@ import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.function.IntFunction; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BooleanClause; @@ -42,6 +43,7 @@ import org.apache.solr.search.QParser; import org.apache.solr.search.QueryContext; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SyntaxError; +import org.apache.solr.search.facet.SlotAcc.SlotContext; import org.apache.solr.util.RTimer; public abstract class FacetProcessor { @@ -78,6 +80,7 @@ public abstract class FacetProcessor { FacetProcessor(FacetContext fcontext, FacetRequestT freq) { this.fcontext = fcontext; this.freq = freq; + fcontext.processor = this; } public Object getResponse() { @@ -90,10 +93,13 @@ public abstract class FacetProcessor { private void evalFilters() throws IOException { if (freq.domain.filters == null || freq.domain.filters.isEmpty()) return; - - List qlist = new ArrayList<>(freq.domain.filters.size()); + this.filter = fcontext.searcher.getDocSet(evalJSONFilterQueryStruct(fcontext, freq.domain.filters)); + } + + private static List evalJSONFilterQueryStruct(FacetContext fcontext, List filters) throws IOException { + List qlist = new ArrayList<>(filters.size()); // TODO: prevent parsing filters each time! - for (Object rawFilter : freq.domain.filters) { + for (Object rawFilter : filters) { if (rawFilter instanceof String) { QParser parser = null; try { @@ -149,13 +155,30 @@ public abstract class FacetProcessor { } } - - this.filter = fcontext.searcher.getDocSet(qlist); + return qlist; } private void handleDomainChanges() throws IOException { if (freq.domain == null) return; - handleFilterExclusions(); + + if (null != freq.domain.explicitQueries) { + try { + final List domainQs = evalJSONFilterQueryStruct(fcontext, freq.domain.explicitQueries); + if (domainQs.isEmpty()) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "'query' domain must not evaluate to an empty list of queries"); + } + fcontext.base = fcontext.searcher.getDocSet(domainQs); + } catch (SolrException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Unable to parse domain 'query': " + freq.domain.explicitQueries + + " -- reason: " + e.getMessage(), + e); + } + } else { + // mutualy exclusive to freq.domain.explicitQueries + handleFilterExclusions(); + } // Check filters... if we do have filters they apply after domain changes. // We still calculate them first because we can use it in a parent->child domain change. @@ -277,13 +300,13 @@ public abstract class FacetProcessor { return appliedFilters; } - protected void processStats(SimpleOrderedMap bucket, DocSet docs, int docCount) throws IOException { + protected void processStats(SimpleOrderedMap bucket, Query bucketQ, DocSet docs, int docCount) throws IOException { if (docCount == 0 && !freq.processEmpty || freq.getFacetStats().size() == 0) { bucket.add("count", docCount); return; } createAccs(docCount, 1); - int collected = collect(docs, 0); + int collected = collect(docs, 0, slotNum -> { return new SlotContext(bucketQ); }); countAcc.incrementCount(0, collected); assert collected == docCount; addStats(bucket, 0); @@ -319,10 +342,22 @@ public abstract class FacetProcessor { } } - int collect(DocSet docs, int slot) throws IOException { + int collect(DocSet docs, int slot, IntFunction slotContext) throws IOException { int count = 0; SolrIndexSearcher searcher = fcontext.searcher; + if (0 == docs.size()) { + // we may be in a "processEmpty" type situation where the client still cares about this bucket + // either way, we should let our accumulators know about the empty set, so they can collect & + // compute the slot (ie: let them decide if they care even when it's size==0) + if (accs != null) { + for (SlotAcc acc : accs) { + acc.collect(docs, slot, slotContext); // NOT per-seg collectors + } + } + return count; + } + final List leaves = searcher.getIndexReader().leaves(); final Iterator ctxIt = leaves.iterator(); LeafReaderContext ctx = null; @@ -346,15 +381,15 @@ public abstract class FacetProcessor { setNextReader(ctx); } count++; - collect(doc - segBase, slot); // per-seg collectors + collect(doc - segBase, slot, slotContext); // per-seg collectors } return count; } - void collect(int segDoc, int slot) throws IOException { + void collect(int segDoc, int slot, IntFunction slotContext) throws IOException { if (accs != null) { for (SlotAcc acc : accs) { - acc.collect(segDoc, slot); + acc.collect(segDoc, slot, slotContext); } } } @@ -402,7 +437,7 @@ public abstract class FacetProcessor { try { if (!skip) { - processStats(bucket, result, count); + processStats(bucket, q, result, count); } processSubs(bucket, q, result, skip, facetInfo); } finally { diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetRange.java b/solr/core/src/java/org/apache/solr/search/facet/FacetRange.java index 09b8ec057a5..817ada6c4cc 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetRange.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetRange.java @@ -38,6 +38,7 @@ import org.apache.solr.schema.SchemaField; import org.apache.solr.schema.TrieDateField; import org.apache.solr.schema.TrieField; import org.apache.solr.search.DocSet; +import org.apache.solr.search.facet.SlotAcc.SlotContext; import org.apache.solr.util.DateMathParser; import static org.apache.solr.search.facet.FacetContext.SKIP_FACET; @@ -364,7 +365,7 @@ class FacetRangeProcessor extends FacetProcessor { DocSet intersection = fcontext.searcher.getDocSet(rangeQ, fcontext.base); filters[slot] = rangeQ; intersections[slot] = intersection; // save for later // TODO: only save if number of slots is small enough? - int num = collect(intersection, slot); + int num = collect(intersection, slot, slotNum -> { return new SlotContext(rangeQ); }); countAcc.incrementCount(slot, num); // TODO: roll this into collect() } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetRequest.java b/solr/core/src/java/org/apache/solr/search/facet/FacetRequest.java index 4cf8a689778..6337bcbc3b3 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetRequest.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetRequest.java @@ -87,6 +87,15 @@ public abstract class FacetRequest { // domain changes public static class Domain { + /** + * An explicit query domain, ignoring all parent context, expressed in JSON query format. + * Mutually exclusive to {@link #excludeTags} + */ + public List explicitQueries; // list of symbolic filters (JSON query format) + /** + * Specifies query/filter tags that should be excluded to re-compute the domain from the parent context. + * Mutually exclusive to {@link #explicitQueries} + */ public List excludeTags; public JoinField joinField; public boolean toParent; @@ -96,12 +105,13 @@ public abstract class FacetRequest { // True if a starting set of documents can be mapped onto a different set of documents not originally in the starting set. public boolean canTransformDomain() { - return toParent || toChildren || (excludeTags != null) || (joinField != null); + return toParent || toChildren + || (explicitQueries != null) || (excludeTags != null) || (joinField != null); } // Can this domain become non-empty if the input domain is empty? This does not check any sub-facets (see canProduceFromEmpty for that) public boolean canBecomeNonEmpty() { - return excludeTags != null; + return (explicitQueries != null) || (excludeTags != null); } /** Are we doing a query time join across other documents */ @@ -197,6 +207,7 @@ public abstract class FacetRequest { * This is normally true only for facets with a limit. */ public boolean returnsPartial() { + // TODO: should the default impl check processEmpty ? return false; } @@ -242,6 +253,7 @@ class FacetContext { public static final int IS_REFINEMENT=0x02; public static final int SKIP_FACET=0x04; // refinement: skip calculating this immediate facet, but proceed to specific sub-facets based on facetInfo + FacetProcessor processor; Map facetInfo; // refinement info for this node QueryContext qcontext; SolrQueryRequest req; // TODO: replace with params? @@ -459,6 +471,17 @@ abstract class FacetParser { domain.excludeTags = excludeTags; } + if (domainMap.containsKey("query")) { + domain.explicitQueries = parseJSONQueryStruct(domainMap.get("query")); + if (null == domain.explicitQueries) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "'query' domain can not be null or empty"); + } else if (null != domain.excludeTags) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "'query' domain can not be combined with 'excludeTags'"); + } + } + String blockParent = (String)domainMap.get("blockParent"); String blockChildren = (String)domainMap.get("blockChildren"); @@ -475,21 +498,29 @@ abstract class FacetParser { Object filterOrList = domainMap.get("filter"); if (filterOrList != null) { assert domain.filters == null; - if (filterOrList instanceof List) { - domain.filters = (List)filterOrList; - } else { - domain.filters = new ArrayList<>(1); - domain.filters.add(filterOrList); - } + domain.filters = parseJSONQueryStruct(filterOrList); } - } // end "domain" - - } } + /** returns null on null input, otherwise returns a list of the JSON query structures -- either + * directly from the raw (list) input, or if raw input is a not a list then it encapsulates + * it in a new list. + */ + private List parseJSONQueryStruct(Object raw) { + List result = null; + if (null == raw) { + return result; + } else if (raw instanceof List) { + result = (List) raw; + } else { + result = new ArrayList<>(1); + result.add(raw); + } + return result; + } public String getField(Map args) { Object fieldName = args.get("field"); // TODO: pull out into defined constant diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetRequestSortedMerger.java b/solr/core/src/java/org/apache/solr/search/facet/FacetRequestSortedMerger.java index 9ffdea7835a..d7378855887 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetRequestSortedMerger.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetRequestSortedMerger.java @@ -162,7 +162,7 @@ abstract class FacetRequestSortedMerger slotContext) throws IOException { int valuesDocID = docIdSetIterator().docID(); if (valuesDocID < doc) { valuesDocID = docIdSetIterator().advance(doc); diff --git a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java index 8d4dc4dee19..1c961e06279 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java +++ b/solr/core/src/java/org/apache/solr/search/facet/MinMaxAgg.java @@ -19,6 +19,7 @@ package org.apache.solr.search.facet; import java.io.IOException; import java.util.Arrays; import java.util.Date; +import java.util.function.IntFunction; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiDocValues; @@ -142,7 +143,7 @@ public class MinMaxAgg extends SimpleAggValueSource { } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { double val = values.doubleVal(doc); if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query @@ -171,7 +172,7 @@ public class MinMaxAgg extends SimpleAggValueSource { } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { long val = values.longVal(doc); if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query @@ -230,7 +231,7 @@ public class MinMaxAgg extends SimpleAggValueSource { } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { long val = values.longVal(doc); if (val == 0 && !values.exists(doc)) return; // depend on fact that non existing values return 0 for func query @@ -334,7 +335,7 @@ public class MinMaxAgg extends SimpleAggValueSource { } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { if (subDv.advanceExact(doc)) { int segOrd = subDv.ordValue(); int ord = toGlobal==null ? segOrd : (int)toGlobal.get(segOrd); diff --git a/solr/core/src/java/org/apache/solr/search/facet/PercentileAgg.java b/solr/core/src/java/org/apache/solr/search/facet/PercentileAgg.java index ea46a916ddf..efdef553a80 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/PercentileAgg.java +++ b/solr/core/src/java/org/apache/solr/search/facet/PercentileAgg.java @@ -21,6 +21,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.function.IntFunction; import com.tdunning.math.stats.AVLTreeDigest; import org.apache.lucene.queries.function.ValueSource; @@ -109,7 +110,7 @@ public class PercentileAgg extends SimpleAggValueSource { digests = new AVLTreeDigest[numSlots]; } - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { if (!values.exists(doc)) return; double val = values.doubleVal(doc); diff --git a/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java b/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java new file mode 100644 index 00000000000..5bc1108a759 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java @@ -0,0 +1,449 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.search.facet; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.Map; +import java.util.function.IntFunction; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.queries.function.FunctionValues; +import org.apache.lucene.search.Query; + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; +import org.apache.solr.search.DocSet; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An aggregation function designed to be nested under other (possibly deeply nested) facets for + * the purposes of computing the "relatedness" of facet buckets relative to + * "foreground" and "background" sets -- primarily for the purpose of building "Semantic Knowledge Graphs" + * + * @see The Semantic Knowledge Graph: + * A compact, auto-generated model for real-time traversal and ranking of any relationship + * within a domain + */ +public class RelatednessAgg extends AggValueSource { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + // end user values + private static final String RELATEDNESS = "relatedness"; + private static final String FG_POP = "foreground_popularity"; + private static final String BG_POP = "background_popularity"; + + // needed for distrib calculation + private static final String FG_SIZE = "foreground_size"; + private static final String FG_COUNT = "foreground_count"; + private static final String BG_SIZE = "background_size"; + private static final String BG_COUNT = "background_count"; + + final protected Query fgQ; + final protected Query bgQ; + + public static final String NAME = RELATEDNESS; + public RelatednessAgg(Query fgQ, Query bgQ) { + super(NAME); + // NOTE: ideally we don't want to assume any defaults *yet* if fgQ/bgQ are null + // keep them null until it's time to created a SlotAcc, at which point we might inherit values + // from an ancestor facet context w/same key -- see comments in createSlotAcc + this.fgQ = fgQ; + this.bgQ = bgQ; + + // TODO: defaults not supported yet -- see comments in createSlotAcc + if (null == fgQ || null == bgQ) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + NAME + " aggregate function requires both foreground & background " + + "to be real (non-null) queries"); + } + } + + @Override + public String description() { + // TODO: need better output processing when we start supporting null fgQ/bgQ in constructor + return name +"(" + fgQ + "," + bgQ + ")"; + } + + @Override + public boolean equals(Object o) { + if (! Objects.equals(this.getClass(), o.getClass())) { + return false; + } + RelatednessAgg that = (RelatednessAgg) o; + return Objects.equals(fgQ, that.fgQ) && Objects.equals(bgQ, that.bgQ); + } + + @Override + public int hashCode() { + return Objects.hash(getClass(), fgQ, bgQ); + } + + @Override + public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException { + throw new UnsupportedOperationException("NOT IMPLEMENTED " + name + " " + this); + } + + + public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) throws IOException { + // TODO: Ideally this is where we should check fgQ/bgQ for 'null' and apply defaults... + // + // we want to walk up the fcontext and inherit the queries from any ancestor SKGAgg + // with the same "key" that we have in our own context -- and as a last resort use + // "$q" for the foreground and "*:*" for the bgQ (if no ancestors) + // (Hmmm... or maybe we should use the "Domain" of our FacetRequest as the default bg?) + // + // How do we find our what key we have in the current context? + // loop over all the stats in the current context until we find one that's '==' to this??? + + List fgFilters = new ArrayList(3); + fgFilters.add(fgQ); + for (FacetContext ctx = fcontext; ctx != null; ctx = ctx.parent) { + if (null != ctx.filter) { + fgFilters.add(ctx.filter); + } else { + // sanity check... + // the only way the filter on the current context should be null is... + assert (// 1) it's the actual top most context, + // (ie: the func is directly used w/o being nested under a facet) + (null == ctx.parent && fcontext == ctx) || + // 2) it's a child of the top most context + // (ie: the context of a top level facet) + (null == ctx.parent.parent && null == ctx.parent.filter)); + // either way, no reason to keep looping up the (0 or 1) remaining ancestors + // (which is why #1 can assert '&& fcontext == ctx') + break; + } + } + + return new SKGSlotAcc(fcontext, numSlots, fgFilters, bgQ); + } + + @Override + public FacetMerger createFacetMerger(Object prototype) { + return new Merger(); + } + + private static final class SKGSlotAcc extends SlotAcc { + private BucketData[] slotvalues; + private final DocSet fgSet; + private final DocSet bgSet; + private final long fgSize; + private final long bgSize; + private final List fgFilters; + private final Query bgQ; + public SKGSlotAcc(FacetContext fcontext, int numSlots, + List fgFilters, Query bgQ) throws IOException { + super(fcontext); + this.fgFilters = fgFilters; + this.bgQ = bgQ; + this.fgSet = fcontext.searcher.getDocSet(fgFilters); + this.bgSet = fcontext.searcher.getDocSet(bgQ); + // cache the set sizes for frequent re-use on every slot + this.fgSize = fgSet.size(); + this.bgSize = bgSet.size(); + this.slotvalues = new BucketData[numSlots]; + reset(); + } + + private void processSlot(int slot, IntFunction slotContext) throws IOException { + + assert null != slotContext; + + Query slotQ = slotContext.apply(slot).getSlotQuery(); + if (null == slotQ) { + // extremeley special edge case... + // the only way this should be possible is if our skg() function is used as a "top level" stat + // w/o being nested under any facet, in which case it should be a FacetQuery w/no parent... + assert fcontext.processor.freq instanceof FacetQuery : fcontext.processor.freq; + assert null == fcontext.parent; + assert null == fcontext.filter; + } + // ...and in which case we should just use the current base + final DocSet slotSet = null == slotQ ? fcontext.base : fcontext.searcher.getDocSet(slotQ); + + final BucketData slotVal = new BucketData(); + slotVal.incSizes(fgSize, bgSize); + slotVal.incCounts(fgSet.intersectionSize(slotSet), + bgSet.intersectionSize(slotSet)); + slotvalues[slot] = slotVal; + } + + @Override + public void collect(int perSegDocId, int slot, IntFunction slotContext) throws IOException { + // NOTE: we don't actaully care about the individual docs being collected + // (the only reason we even bother implementing this method is because it's needed for sorting + // buckets by a function) + + // so we only worry about ensuring that every "slot" / bucket is processed the first time + // we're asked about it... + if (null == slotvalues[slot]) { + processSlot(slot, slotContext); + } + } + + @Override + public int collect(DocSet docs, int slot, IntFunction slotContext) throws IOException { + // NOTE: we don't actaully care about the doc set being collected for the bucket + // so we only worry about ensuring that every "slot" / bucket is processed exactly once + + // if we're doing bulk collection, we better not be getting asked to re-use slots + assert null == slotvalues[slot]; + processSlot(slot, slotContext); + + // we don't do any filtering, we collect the whole docset, so return that as out collected count + // (as a stat, we're actually required to return this by assertions in FacetFieldProcessor.processStats) + return docs.size(); + } + + public int compare(int slotA, int slotB) { + final BucketData a = slotvalues[slotA]; + final BucketData b = slotvalues[slotB]; + + // we initialize & reset() (unused) slotvalues elements to null + // but we should never be asked to compare a slot that hasn't been collected... + assert null != a; + assert null != b; + return a.compareTo(b); + } + + @Override + public Object getValue(int slotNum) { + BucketData slotVal = slotvalues[slotNum]; + if (null == slotVal) { + // since we haven't been told about any docs for this slot, use a slot w/no counts, + // just the known fg/bg sizes. (this is most likely a refinement request for a bucket we dont have) + slotVal = new BucketData(); + slotVal.incSizes(fgSize, bgSize); + } + + SimpleOrderedMap res = slotVal.externalize(fcontext.isShard()); + return res; + } + + @Override + public void reset() { + Arrays.fill(slotvalues, null); + } + + @Override + public void resize(Resizer resizer) { + slotvalues = resizer.resize(slotvalues, null); + } + + @Override + public void close() throws IOException { + slotvalues = null; + } + } + + /** + * Encapsulates all data needed for a single bucket/slot + * + * @see SKGSlotAcc + * @see Merger + */ + private static final class BucketData implements Comparable { + + private long fg_size = 0; + private long bg_size = 0; + private long fg_count = 0; + private long bg_count = 0; + private double relatedness = Double.NaN; + + public BucketData() { + /* No-Op */ + } + + /** + * Increment both the foreground & background counts for the current bucket, reseting any + * derived values that may be cached + */ + public void incCounts(final long fgInc, final long bgInc) { + this.relatedness = Double.NaN; + fg_count += fgInc; + bg_count += bgInc; + } + /** + * Increment both the foreground & background sizes for the current bucket, reseting any + * derived values that may be cached + */ + public void incSizes(final long fgInc, final long bgInc) { + this.relatedness = Double.NaN; + fg_size += fgInc; + bg_size += bgInc; + } + + @Override + public int hashCode() { + return Objects.hash(this.getClass(), fg_count, bg_count, fg_size, bg_size); + } + + @Override + public boolean equals(Object other) { + if (!Objects.equals(this.getClass(), other.getClass())) { + return false; + } + BucketData that = (BucketData)other; + // we will most certainly be compared to other buckets of the same Agg instance, so compare counts first + return Objects.equals(this.fg_count, that.fg_count) + && Objects.equals(this.bg_count, that.bg_count) + && Objects.equals(this.fg_size, that.fg_size) + && Objects.equals(this.bg_size, that.bg_size); + } + + /** + * Computes (and caches) the derived relatedness score for this bucket + */ + private double getRelatedness() { + if (Double.isNaN(this.relatedness)) { + this.relatedness = computeRelatedness(this.fg_count, this.fg_size, + this.bg_count, this.bg_size); + // TODO: add support for a "min_pop" option... + // + // if min_pop is configured, and either (fg|bg) popularity is lower then that value + // then "this.relatedness=-Infinity" so it sorts at the bottom + // this logic be ignored on isShard requests -- similar to how shards ignore 'mincount' + } + return this.relatedness; + } + + @Override + public int compareTo(BucketData that) { + // TODO: add support for a "sort_val" option... + // + // default should be "relatedness" but also support "foreground" and "background" ... + // either of those should sort by the corrisponding ratio + // To do this, we should probably precommpute the ratios in incCounts + + int r = Double.compare(this.getRelatedness(), that.getRelatedness()); + if (0 == r) { + r = Long.compare(this.fg_count, that.fg_count); + } + if (0 == r) { + r = Long.compare(this.bg_count, that.bg_count); + } + return r; + } + + /** + * @see SlotAcc#getValue + * @see Merger#getMergedResult + */ + public SimpleOrderedMap externalize(final boolean isShardRequest) { + SimpleOrderedMap result = new SimpleOrderedMap(); + + if (isShardRequest) { + result.add(FG_COUNT, fg_count); + result.add(BG_COUNT, bg_count); + // NOTE: sizes will be the same for every slot... + // TODO: it would be nice to put them directly in the parent facet, instead of every bucket, + // in order to reduce the size of the response. + result.add(FG_SIZE, fg_size); + result.add(BG_SIZE, bg_size); + } else { + // there's no need to bother computing these when returning results *to* a shard coordinator + // only useful to external clients + result.add(RELATEDNESS, this.getRelatedness()); + result.add(FG_POP, roundTo5Digits((double) fg_count / bg_size)); // yes, BACKGROUND size is intentional + result.add(BG_POP, roundTo5Digits((double) bg_count / bg_size)); + } + + return result; + } + } + + /** + * Merges in the per shard {@link BucketData} output into a unified {@link BucketData} + */ + private static final class Merger extends FacetSortableMerger { + private final BucketData mergedData = new BucketData(); + + @Override + public void merge(Object facetResult, Context mcontext) { + NamedList shardData = (NamedList)facetResult; + mergedData.incSizes((Long)shardData.remove(FG_SIZE), (Long)shardData.remove(BG_SIZE)); + mergedData.incCounts((Long)shardData.remove(FG_COUNT), (Long)shardData.remove(BG_COUNT)); + } + + @Override + public int compareTo(FacetSortableMerger other, FacetRequest.SortDirection direction) { + // NOTE: regardless of the SortDirection hint, we want normal comparison of the BucketData + + assert other instanceof Merger; + Merger that = (Merger)other; + return mergedData.compareTo(that.mergedData); + } + + @Override + public Object getMergedResult() { + return mergedData.externalize(false); + } + } + + + /** + * This is an aproximated Z-Score, as described in the "Scoring Semantic Relationships" + * section of "The Semantic Knowledge Graph: + * A compact, auto-generated model for real-time traversal and ranking of any relationship + * within a domain" + * + * See Also: + */ + // NOTE: javadoc linter freaks out if we try doing those links as '@see )' or the (deprecated)" + + "'collect(int,int)'"); + } + + /** + * All subclasses should override this method, for backcompatability the default implementaion + * delegates to the (deprecated) {@link #collect(int,int)} + * + * @param doc Single Segment docId (relative to the current {@link LeafReaderContext} to collect + * @param slot The slot number to collect this document in + * @param slotContext A callback that can be used for Accumulators that would like additional info + * about the current slot -- the {@link IntFunction} is only garunteed to be valid for + * the current slot, and the {@link SlotContext} returned is only valid for the duration + * of the collect() call. + */ + @Deprecated + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { + collect(doc,slot); + } - public int collect(DocSet docs, int slot) throws IOException { + /** + * Bulk collection of all documents in a slot. The default implementation calls {@link #collect(int,int,IntFunction)} + * + * @param docs (global) Documents to collect + * @param slot The slot number to collect these documents in + * @param slotContext A callback that can be used for Accumulators that would like additional info + * about the current slot -- the {@link IntFunction} is only garunteed to be valid for + * the current slot, and the {@link SlotContext} returned is only valid for the duration + * of the collect() call. + */ + public int collect(DocSet docs, int slot, IntFunction slotContext) throws IOException { int count = 0; SolrIndexSearcher searcher = fcontext.searcher; @@ -94,7 +132,7 @@ public abstract class SlotAcc implements Closeable { setNextReader(ctx); } count++; - collect(doc - segBase, slot); // per-seg collectors + collect(doc - segBase, slot, slotContext); // per-seg collectors } return count; } @@ -212,6 +250,19 @@ public abstract class SlotAcc implements Closeable { } // end class Resizer + /** + * Incapsulates information about the current slot, for Accumulators that may want + * additional info during collection. + */ + public static final class SlotContext { + private final Query slotQuery; + public SlotContext(Query slotQuery) { + this.slotQuery = slotQuery; + } + public Query getSlotQuery() { + return slotQuery; + } + } } // TODO: we should really have a decoupled value provider... @@ -349,7 +400,7 @@ class SumSlotAcc extends DoubleFuncSlotAcc { super(values, fcontext, numSlots); } - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { double val = values.doubleVal(doc); // todo: worth trying to share this value across multiple stats that need it? result[slotNum] += val; } @@ -361,7 +412,7 @@ class SumsqSlotAcc extends DoubleFuncSlotAcc { } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { double val = values.doubleVal(doc); val = val * val; result[slotNum] += val; @@ -386,7 +437,7 @@ class AvgSlotAcc extends DoubleFuncSlotAcc { } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { double val = values.doubleVal(doc); if (val != 0 || values.exists(doc)) { result[slotNum] += val; @@ -479,7 +530,7 @@ class VarianceSlotAcc extends DoubleFuncSlotAcc { } @Override - public void collect(int doc, int slot) throws IOException { + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { double val = values.doubleVal(doc); if (values.exists(doc)) { counts[slot]++; @@ -541,7 +592,7 @@ class StddevSlotAcc extends DoubleFuncSlotAcc { } @Override - public void collect(int doc, int slot) throws IOException { + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { double val = values.doubleVal(doc); if (values.exists(doc)) { counts[slot]++; @@ -570,8 +621,9 @@ class CountSlotArrAcc extends CountSlotAcc { } @Override - public void collect(int doc, int slotNum) { // TODO: count arrays can use fewer bytes based on the number of docs in - // the base set (that's the upper bound for single valued) - look at ttf? + public void collect(int doc, int slotNum, IntFunction slotContext) { + // TODO: count arrays can use fewer bytes based on the number of docs in + // the base set (that's the upper bound for single valued) - look at ttf? result[slotNum]++; } @@ -615,7 +667,7 @@ class SortSlotAcc extends SlotAcc { } @Override - public void collect(int doc, int slot) throws IOException { + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { // no-op } diff --git a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java index 69b341a29c6..3349bb2115b 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java +++ b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java @@ -44,6 +44,7 @@ import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocSet; import org.apache.solr.search.SolrCache; import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.search.facet.SlotAcc.SlotContext; import org.apache.solr.uninverting.DocTermOrds; import org.apache.solr.util.TestInjection; import org.slf4j.Logger; @@ -431,7 +432,8 @@ public class UnInvertedField extends DocTermOrds { if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) { // handle the biggest terms DocSet intersection = searcher.getDocSet(tt.termQuery, docs); - int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex); + int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex, + slotNum -> { return new SlotContext(tt.termQuery); }); countAcc.incrementCount(tt.termNum - startTermIndex, collected); if (collected > 0) { uniqueTerms++; @@ -493,7 +495,7 @@ public class UnInvertedField extends DocTermOrds { if (arrIdx < 0) continue; if (arrIdx >= nTerms) break; countAcc.incrementCount(arrIdx, 1); - processor.collectFirstPhase(segDoc, arrIdx); + processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext); } } else { int tnum = 0; @@ -507,7 +509,7 @@ public class UnInvertedField extends DocTermOrds { if (arrIdx >= 0) { if (arrIdx >= nTerms) break; countAcc.incrementCount(arrIdx, 1); - processor.collectFirstPhase(segDoc, arrIdx); + processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext); } delta = 0; } diff --git a/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java b/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java index d858b5bd45c..5d9cf90b10e 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java +++ b/solr/core/src/java/org/apache/solr/search/facet/UniqueAgg.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.function.IntFunction; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.LeafReaderContext; @@ -145,7 +146,7 @@ public class UniqueAgg extends StrAggValueSource { } @Override - public void collect(int doc, int slot) throws IOException { + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { int valuesDocID = docIdSetIterator().docID(); if (valuesDocID < doc) { valuesDocID = docIdSetIterator().advance(doc); diff --git a/solr/core/src/java/org/apache/solr/search/facet/UniqueMultiDvSlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/UniqueMultiDvSlotAcc.java index af419a4d96a..839fc520a17 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/UniqueMultiDvSlotAcc.java +++ b/solr/core/src/java/org/apache/solr/search/facet/UniqueMultiDvSlotAcc.java @@ -18,6 +18,7 @@ package org.apache.solr.search.facet; import java.io.IOException; +import java.util.function.IntFunction; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiDocValues; @@ -70,7 +71,7 @@ class UniqueMultiDvSlotAcc extends UniqueSlotAcc { } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { if (subDv.advanceExact(doc)) { int segOrd = (int) subDv.nextOrd(); diff --git a/solr/core/src/java/org/apache/solr/search/facet/UniqueMultivaluedSlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/UniqueMultivaluedSlotAcc.java index 10adcf06dd2..508da384045 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/UniqueMultivaluedSlotAcc.java +++ b/solr/core/src/java/org/apache/solr/search/facet/UniqueMultivaluedSlotAcc.java @@ -18,6 +18,7 @@ package org.apache.solr.search.facet; import java.io.IOException; +import java.util.function.IntFunction; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; @@ -50,7 +51,7 @@ class UniqueMultivaluedSlotAcc extends UniqueSlotAcc implements UnInvertedField. } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { bits = arr[slotNum]; if (bits == null) { bits = new FixedBitSet(nTerms); @@ -67,4 +68,4 @@ class UniqueMultivaluedSlotAcc extends UniqueSlotAcc implements UnInvertedField. docToTerm = null; } } -} \ No newline at end of file +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/UniqueSinglevaluedSlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/UniqueSinglevaluedSlotAcc.java index 9a1b51e9e65..ed51c5bc76b 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/UniqueSinglevaluedSlotAcc.java +++ b/solr/core/src/java/org/apache/solr/search/facet/UniqueSinglevaluedSlotAcc.java @@ -18,6 +18,7 @@ package org.apache.solr.search.facet; import java.io.IOException; +import java.util.function.IntFunction; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiDocValues; @@ -73,7 +74,7 @@ class UniqueSinglevaluedSlotAcc extends UniqueSlotAcc { } @Override - public void collect(int doc, int slotNum) throws IOException { + public void collect(int doc, int slotNum, IntFunction slotContext) throws IOException { if (doc > subDv.docID()) { subDv.advance(doc); } diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudJSONFacetSKG.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudJSONFacetSKG.java new file mode 100644 index 00000000000..35f23d082af --- /dev/null +++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudJSONFacetSKG.java @@ -0,0 +1,654 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.cloud; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.commons.lang.StringUtils; + +import org.apache.lucene.util.LuceneTestCase.Slow; +import org.apache.lucene.util.TestUtil; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.embedded.JettySolrRunner; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.request.QueryRequest; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.search.facet.FacetField; +import static org.apache.solr.search.facet.RelatednessAgg.computeRelatedness; +import static org.apache.solr.search.facet.RelatednessAgg.roundTo5Digits; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + *

+ * A randomized test of nested facets using the relatedness() function, that asserts the + * accuracy the results for all the buckets returned using verification queries of the (expected) + * foreground & background queries based on the nested facet terms. + *

+ * Note that unlike normal facet "count" verification, using a high limit + overrequest isn't a substitute + * for refinement in order to ensure accurate "skg" computation across shards. For that reason, this + * tests forces refine: true (unlike {@link TestCloudJSONFacetJoinDomain}) and specifices a + * domain: { 'query':'{!v=$back' } for every facet, in order to garuntee that all popularity + * & relatedness values returned can be proven with validation requests. + *

+ *

+ * (Refinement alone is not enough. Using the background query as the facet domain is neccessary to + * prevent situations where a single shardX may return candidate bucket with no child-buckets due to + * the normal facet intersections, but when refined on other shardY(s), can produce "high scoring" + * SKG child-buckets, which would then be missing the foreground/background "size" contributions from + * shardX. + *

+ * + * + * + * @see TestCloudJSONFacetJoinDomain + */ +@Slow +public class TestCloudJSONFacetSKG extends SolrCloudTestCase { + + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static final String DEBUG_LABEL = MethodHandles.lookup().lookupClass().getName(); + private static final String COLLECTION_NAME = DEBUG_LABEL + "_collection"; + + private static final int DEFAULT_LIMIT = FacetField.DEFAULT_FACET_LIMIT; + private static final int MAX_FIELD_NUM = 15; + private static final int UNIQUE_FIELD_VALS = 50; + + /** Multivalued string field suffixes that can be randomized for testing diff facet/join code paths */ + private static final String[] STR_FIELD_SUFFIXES = new String[] { "_ss", "_sds", "_sdsS" }; + /** Multivalued int field suffixes that can be randomized for testing diff facet/join code paths */ + private static final String[] INT_FIELD_SUFFIXES = new String[] { "_is", "_ids", "_idsS" }; + + /** A basic client for operations at the cloud level, default collection will be set */ + private static CloudSolrClient CLOUD_CLIENT; + /** One client per node */ + private static ArrayList CLIENTS = new ArrayList<>(5); + + @BeforeClass + private static void createMiniSolrCloudCluster() throws Exception { + // sanity check constants + assertTrue("bad test constants: some suffixes will never be tested", + (STR_FIELD_SUFFIXES.length < MAX_FIELD_NUM) && (INT_FIELD_SUFFIXES.length < MAX_FIELD_NUM)); + + // we need DVs on point fields to compute stats & facets + if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) System.setProperty(NUMERIC_DOCVALUES_SYSPROP,"true"); + + // multi replicas should not matter... + final int repFactor = usually() ? 1 : 2; + // ... but we definitely want to test multiple shards + final int numShards = TestUtil.nextInt(random(), 1, (usually() ? 2 :3)); + final int numNodes = (numShards * repFactor); + + final String configName = DEBUG_LABEL + "_config-set"; + final Path configDir = Paths.get(TEST_HOME(), "collection1", "conf"); + + configureCluster(numNodes).addConfig(configName, configDir).configure(); + + Map collectionProperties = new LinkedHashMap<>(); + collectionProperties.put("config", "solrconfig-tlog.xml"); + collectionProperties.put("schema", "schema_latest.xml"); + CollectionAdminRequest.createCollection(COLLECTION_NAME, configName, numShards, repFactor) + .setProperties(collectionProperties) + .process(cluster.getSolrClient()); + + CLOUD_CLIENT = cluster.getSolrClient(); + CLOUD_CLIENT.setDefaultCollection(COLLECTION_NAME); + + waitForRecoveriesToFinish(CLOUD_CLIENT); + + for (JettySolrRunner jetty : cluster.getJettySolrRunners()) { + CLIENTS.add(getHttpSolrClient(jetty.getBaseUrl() + "/" + COLLECTION_NAME + "/")); + } + + final int numDocs = atLeast(100); + for (int id = 0; id < numDocs; id++) { + SolrInputDocument doc = sdoc("id", ""+id); + for (int fieldNum = 0; fieldNum < MAX_FIELD_NUM; fieldNum++) { + // NOTE: some docs may have no value in a field + final int numValsThisDoc = TestUtil.nextInt(random(), 0, (usually() ? 5 : 10)); + for (int v = 0; v < numValsThisDoc; v++) { + final String fieldValue = randFieldValue(fieldNum); + + // for each fieldNum, there are actaully two fields: one string, and one integer + doc.addField(field(STR_FIELD_SUFFIXES, fieldNum), fieldValue); + doc.addField(field(INT_FIELD_SUFFIXES, fieldNum), fieldValue); + } + } + CLOUD_CLIENT.add(doc); + if (random().nextInt(100) < 1) { + CLOUD_CLIENT.commit(); // commit 1% of the time to create new segments + } + if (random().nextInt(100) < 5) { + CLOUD_CLIENT.add(doc); // duplicate the doc 5% of the time to create deleted docs + } + } + CLOUD_CLIENT.commit(); + } + + /** + * Given a (random) number, and a (static) array of possible suffixes returns a consistent field name that + * uses that number and one of hte specified suffixes in it's name. + * + * @see #STR_FIELD_SUFFIXES + * @see #INT_FIELD_SUFFIXES + * @see #MAX_FIELD_NUM + * @see #randFieldValue + */ + private static String field(final String[] suffixes, final int fieldNum) { + assert fieldNum < MAX_FIELD_NUM; + + final String suffix = suffixes[fieldNum % suffixes.length]; + return "field_" + fieldNum + suffix; + } + private static String strfield(final int fieldNum) { + return field(STR_FIELD_SUFFIXES, fieldNum); + } + private static String intfield(final int fieldNum) { + return field(INT_FIELD_SUFFIXES, fieldNum); + } + + /** + * Given a (random) field number, returns a random (integer based) value for that field. + * NOTE: The number of unique values in each field is constant acording to {@link #UNIQUE_FIELD_VALS} + * but the precise range of values will vary for each unique field number, such that cross field joins + * will match fewer documents based on how far apart the field numbers are. + * + * @see #UNIQUE_FIELD_VALS + * @see #field + */ + private static String randFieldValue(final int fieldNum) { + return "" + (fieldNum + TestUtil.nextInt(random(), 1, UNIQUE_FIELD_VALS)); + } + + + @AfterClass + private static void afterClass() throws Exception { + CLOUD_CLIENT.close(); CLOUD_CLIENT = null; + for (HttpSolrClient client : CLIENTS) { + client.close(); + } + CLIENTS = null; + } + + /** + * Test some small, hand crafted, but non-trivial queries that are + * easier to trace/debug then a pure random monstrosity. + * (ie: if something obvious gets broken, this test may fail faster and in a more obvious way then testRandom) + */ + public void testBespoke() throws Exception { + { // trivial single level facet + Map facets = new LinkedHashMap<>(); + TermFacet top = new TermFacet(strfield(9), UNIQUE_FIELD_VALS, 0, null); + facets.put("top1", top); + final AtomicInteger maxBuckets = new AtomicInteger(UNIQUE_FIELD_VALS); + assertFacetSKGsAreCorrect(maxBuckets, facets, strfield(7)+":11", strfield(5)+":9", "*:*"); + assertTrue("Didn't check a single bucket???", maxBuckets.get() < UNIQUE_FIELD_VALS); + } + + { // trivial single level facet w/sorting on skg + Map facets = new LinkedHashMap<>(); + TermFacet top = new TermFacet(strfield(9), UNIQUE_FIELD_VALS, 0, "skg desc"); + facets.put("top2", top); + final AtomicInteger maxBuckets = new AtomicInteger(UNIQUE_FIELD_VALS); + assertFacetSKGsAreCorrect(maxBuckets, facets, strfield(7)+":11", strfield(5)+":9", "*:*"); + assertTrue("Didn't check a single bucket???", maxBuckets.get() < UNIQUE_FIELD_VALS); + } + + { // trivial single level facet w/ 2 diff ways to request "limit = (effectively) Infinite" + // to sanity check refinement of buckets missing from other shard in both cases + + // NOTE that these two queries & facets *should* effectively identical given that the + // very large limit value is big enough no shard will ever return that may terms, + // but the "limit=-1" case it actaully triggers slightly different code paths + // because it causes FacetField.returnsPartial() to be "true" + for (int limit : new int[] { 999999999, -1 }) { + Map facets = new LinkedHashMap<>(); + facets.put("top_facet_limit__" + limit, new TermFacet(strfield(9), limit, 0, "skg desc")); + final AtomicInteger maxBuckets = new AtomicInteger(UNIQUE_FIELD_VALS); + assertFacetSKGsAreCorrect(maxBuckets, facets, strfield(7)+":11", strfield(5)+":9", "*:*"); + assertTrue("Didn't check a single bucket???", maxBuckets.get() < UNIQUE_FIELD_VALS); + } + } + } + + public void testRandom() throws Exception { + + // since the "cost" of verifying the stats for each bucket is so high (see TODO in verifySKGResults()) + // we put a safety valve in place on the maximum number of buckets that we are willing to verify + // across *all* the queries that we do. + // that way if the randomized queries we build all have relatively small facets, so be it, but if + // we get a really big one early on, we can test as much as possible, skip other iterations. + // + // (deeply nested facets may contain more buckets then the max, but we won't *check* all of them) + final int maxBucketsAllowed = atLeast(2000); + final AtomicInteger maxBucketsToCheck = new AtomicInteger(maxBucketsAllowed); + + final int numIters = atLeast(10); + for (int iter = 0; iter < numIters && 0 < maxBucketsToCheck.get(); iter++) { + + assertFacetSKGsAreCorrect(maxBucketsToCheck, TermFacet.buildRandomFacets(), + buildRandomQuery(), buildRandomQuery(), buildRandomQuery()); + } + assertTrue("Didn't check a single bucket???", maxBucketsToCheck.get() < maxBucketsAllowed); + + + } + + /** + * Generates a random query string across the randomized fields/values in the index + * + * @see #randFieldValue + * @see #field + */ + private static String buildRandomQuery() { + if (0 == TestUtil.nextInt(random(), 0,10)) { + return "*:*"; + } + final int numClauses = TestUtil.nextInt(random(), 3, 10); + final String[] clauses = new String[numClauses]; + for (int c = 0; c < numClauses; c++) { + final int fieldNum = random().nextInt(MAX_FIELD_NUM); + // keep queries simple, just use str fields - not point of test + clauses[c] = strfield(fieldNum) + ":" + randFieldValue(fieldNum); + } + return buildORQuery(clauses); + } + + private static String buildORQuery(String... clauses) { + assert 0 < clauses.length; + return "(" + StringUtils.join(clauses, " OR ") + ")"; + } + + /** + * Given a set of term facets, and top level query strings, asserts that + * the SKG stats for each facet term returned when executing that query with those foreground/background + * queries match the expected results of executing the equivilent queries in isolation. + * + * @see #verifySKGResults + */ + private void assertFacetSKGsAreCorrect(final AtomicInteger maxBucketsToCheck, + Map expected, + final String query, + final String foreQ, + final String backQ) throws SolrServerException, IOException { + final SolrParams baseParams = params("rows","0", "fore", foreQ, "back", backQ); + + final SolrParams facetParams = params("q", query, + "json.facet", ""+TermFacet.toJSONFacetParamValue(expected,null)); + final SolrParams initParams = SolrParams.wrapAppended(facetParams, baseParams); + + log.info("Doing full run: {}", initParams); + + QueryResponse rsp = null; + // JSON Facets not (currently) available from QueryResponse... + NamedList topNamedList = null; + try { + rsp = (new QueryRequest(initParams)).process(getRandClient(random())); + assertNotNull(initParams + " is null rsp?", rsp); + topNamedList = rsp.getResponse(); + assertNotNull(initParams + " is null topNamedList?", topNamedList); + } catch (Exception e) { + throw new RuntimeException("init query failed: " + initParams + ": " + + e.getMessage(), e); + } + try { + final NamedList facetResponse = (NamedList) topNamedList.get("facets"); + assertNotNull("null facet results?", facetResponse); + assertEquals("numFound mismatch with top count?", + rsp.getResults().getNumFound(), ((Number)facetResponse.get("count")).longValue()); + if (0 == rsp.getResults().getNumFound()) { + // when the query matches nothing, we should expect no top level facets + expected = Collections.emptyMap(); + } + assertFacetSKGsAreCorrect(maxBucketsToCheck, expected, baseParams, facetResponse); + } catch (AssertionError e) { + throw new AssertionError(initParams + " ===> " + topNamedList + " --> " + e.getMessage(), e); + } finally { + log.info("Ending full run"); + } + } + + /** + * Recursive helper method that walks the actual facet response, comparing the SKG results to + * the expected output based on the equivilent filters generated from the original TermFacet. + */ + private void assertFacetSKGsAreCorrect(final AtomicInteger maxBucketsToCheck, + final Map expected, + final SolrParams baseParams, + final NamedList actualFacetResponse) throws SolrServerException, IOException { + + for (Map.Entry entry : expected.entrySet()) { + final String facetKey = entry.getKey(); + final TermFacet facet = entry.getValue(); + final NamedList results = (NamedList) actualFacetResponse.get(facetKey); + assertNotNull(facetKey + " key missing from: " + actualFacetResponse, results); + final List buckets = (List) results.get("buckets"); + assertNotNull(facetKey + " has null buckets: " + actualFacetResponse, buckets); + + if (buckets.isEmpty()) { + // should only happen if the background query does not match any docs with field X + final long docsWithField = getNumFound(params("_trace", "noBuckets", + "rows", "0", + "q", facet.field+":[* TO *]", + "fq", baseParams.get("back"))); + + assertEquals(facetKey + " has no buckets, but docs in background exist with field: " + facet.field, + 0, docsWithField); + } + + // NOTE: it's important that we do this depth first -- not just because it's the easiest way to do it, + // but because it means that our maxBucketsToCheck will ensure we do a lot of deep sub-bucket checking, + // not just all the buckets of the top level(s) facet(s) + for (NamedList bucket : buckets) { + final String fieldVal = bucket.get("val").toString(); // int or stringified int + + verifySKGResults(facetKey, facet, baseParams, fieldVal, bucket); + if (maxBucketsToCheck.decrementAndGet() <= 0) { + return; + } + + final SolrParams verifyParams = SolrParams.wrapAppended(baseParams, + params("fq", facet.field + ":" + fieldVal)); + + // recursively check subFacets + if (! facet.subFacets.isEmpty()) { + assertFacetSKGsAreCorrect(maxBucketsToCheck, facet.subFacets, verifyParams, bucket); + } + } + } + } + + /** + * Verifies that the popularity & relatedness values containined in a single SKG bucket + * match the expected values based on the facet field & bucket value, as well the existing + * filterParams. + * + * @see #assertFacetSKGsAreCorrect + */ + private void verifySKGResults(String facetKey, TermFacet facet, SolrParams filterParams, + String fieldVal, NamedList bucket) + throws SolrServerException, IOException { + + final String bucketQ = facet.field+":"+fieldVal; + final NamedList skgBucket = (NamedList) bucket.get("skg"); + assertNotNull(facetKey + "/bucket:" + bucket.toString(), skgBucket); + + // TODO: make this more efficient? + // ideally we'd do a single query w/4 facet.queries, one for each count + // but formatting the queries is a pain, currently we leverage the accumulated fq's + final long fgSize = getNumFound(SolrParams.wrapAppended(params("_trace", "fgSize", + "rows","0", + "q","{!query v=$fore}"), + filterParams)); + final long bgSize = getNumFound(params("_trace", "bgSize", + "rows","0", + "q", filterParams.get("back"))); + + final long fgCount = getNumFound(SolrParams.wrapAppended(params("_trace", "fgCount", + "rows","0", + "q","{!query v=$fore}", + "fq", bucketQ), + filterParams)); + final long bgCount = getNumFound(params("_trace", "bgCount", + "rows","0", + "q", bucketQ, + "fq", filterParams.get("back"))); + + assertEquals(facetKey + "/bucket:" + bucket + " => fgPop should be: " + fgCount + " / " + bgSize, + roundTo5Digits((double) fgCount / bgSize), + skgBucket.get("foreground_popularity")); + assertEquals(facetKey + "/bucket:" + bucket + " => bgPop should be: " + bgCount + " / " + bgSize, + roundTo5Digits((double) bgCount / bgSize), + skgBucket.get("background_popularity")); + assertEquals(facetKey + "/bucket:" + bucket + " => relatedness is wrong", + roundTo5Digits(computeRelatedness(fgCount, fgSize, bgCount, bgSize)), + skgBucket.get("relatedness")); + + } + + + /** + * Trivial data structure for modeling a simple terms facet that can be written out as a json.facet param. + * + * Doesn't do any string escaping or quoting, so don't use whitespace or reserved json characters + */ + private static final class TermFacet { + public final String field; + public final Map subFacets = new LinkedHashMap<>(); + public final Integer limit; // may be null + public final Integer overrequest; // may be null + public final String sort; // may be null + /** Simplified constructor asks for limit = # unique vals */ + public TermFacet(String field) { + this(field, UNIQUE_FIELD_VALS, 0, "skg desc"); + + } + public TermFacet(String field, Integer limit, Integer overrequest, String sort) { + assert null != field; + this.field = field; + this.limit = limit; + this.overrequest = overrequest; + this.sort = sort; + } + + /** + * recursively generates the json.facet param value to use for testing this facet + */ + private CharSequence toJSONFacetParamValue() { + final String limitStr = (null == limit) ? "" : (", limit:" + limit); + final String overrequestStr = (null == overrequest) ? "" : (", overrequest:" + overrequest); + final String sortStr = (null == sort) ? "" : (", sort: '" + sort + "'"); + final StringBuilder sb + = new StringBuilder("{ type:terms, field:" + field + limitStr + overrequestStr + sortStr); + + // see class javadocs for why we always use refine:true & the query:$back domain for this test. + sb.append(", refine: true, domain: { query: '{!v=$back}' }, facet:"); + sb.append(toJSONFacetParamValue(subFacets, "skg : 'relatedness($fore,$back)'")); + sb.append("}"); + return sb; + } + + /** + * Given a set of (possibly nested) facets, generates a suitable json.facet param value to + * use for testing them against in a solr request. + */ + public static CharSequence toJSONFacetParamValue(final Map facets, + final String extraJson) { + assert null != facets; + if (0 == facets.size() && null == extraJson) { + return ""; + } + + StringBuilder sb = new StringBuilder("{ processEmpty: true, "); + for (String key : facets.keySet()) { + sb.append(key).append(" : ").append(facets.get(key).toJSONFacetParamValue()); + sb.append(" ,"); + } + if (null == extraJson) { + sb.setLength(sb.length() - 1); + } else { + sb.append(extraJson); + } + sb.append("}"); + return sb; + } + + /** + * Factory method for generating some random facets. + * + * For simplicity, each facet will have a unique key name. + */ + public static Map buildRandomFacets() { + // for simplicity, use a unique facet key regardless of depth - simplifies verification + // and le's us enforce a hard limit on the total number of facets in a request + AtomicInteger keyCounter = new AtomicInteger(0); + + final int maxDepth = TestUtil.nextInt(random(), 0, (usually() ? 2 : 3)); + return buildRandomFacets(keyCounter, maxDepth); + } + + /** + * picks a random value for the "sort" param, biased in favor of interesting test cases + * + * @return a sort string (w/direction), or null to specify nothing (trigger default behavior) + * @see #randomLimitParam + */ + public static String randomSortParam(Random r) { + + // IMPORTANT!!! + // if this method is modified to produce new sorts, make sure to update + // randomLimitParam to account for them if they are impacted by SOLR-12343 + final String dir = random().nextBoolean() ? "asc" : "desc"; + switch(r.nextInt(4)) { + case 0: return null; + case 1: return "count " + dir; + case 2: return "skg " + dir; + case 3: return "index " + dir; + default: throw new RuntimeException("Broken case statement"); + } + } + /** + * picks a random value for the "limit" param, biased in favor of interesting test cases + * + *

+ * NOTE: Due to SOLR-12343, we have to force an overrequest of "all" possible terms for + * some sort values. + *

+ * + * @return a number to specify in the request, or null to specify nothing (trigger default behavior) + * @see #UNIQUE_FIELD_VALS + * @see #randomSortParam + */ + public static Integer randomLimitParam(Random r, final String sort) { + if (null != sort) { + if (sort.equals("count asc") || sort.startsWith("skg")) { + // of the known types of sorts produced, these are at risk of SOLR-12343 + // so request (effectively) unlimited num buckets + return r.nextBoolean() ? UNIQUE_FIELD_VALS : -1; + } + } + final int limit = 1 + r.nextInt((int) (UNIQUE_FIELD_VALS * 1.5F)); + if (limit >= UNIQUE_FIELD_VALS && r.nextBoolean()) { + return -1; // unlimited + } else if (limit == DEFAULT_LIMIT && r.nextBoolean()) { + return null; // sometimes, don't specify limit if it's the default + } + return limit; + } + + /** + * picks a random value for the "overrequest" param, biased in favor of interesting test cases. + * + * @return a number to specify in the request, or null to specify nothing (trigger default behavior) + * @see #UNIQUE_FIELD_VALS + */ + public static Integer randomOverrequestParam(Random r) { + switch(r.nextInt(10)) { + case 0: + case 1: + case 2: + case 3: + return 0; // 40% of the time, disable overrequest to better stress refinement + case 4: + case 5: + return r.nextInt(UNIQUE_FIELD_VALS); // 20% ask for less them what's needed + case 6: + return r.nextInt(Integer.MAX_VALUE); // 10%: completley random value, statisticaly more then enough + default: break; + } + // else.... either leave param unspecified (or redundently specify the -1 default) + return r.nextBoolean() ? null : -1; + } + + /** + * recursive helper method for building random facets + * + * @param keyCounter used to ensure every generated facet has a unique key name + * @param maxDepth max possible depth allowed for the recusion, a lower value may be used depending on how many facets are returned at the current level. + */ + private static Map buildRandomFacets(AtomicInteger keyCounter, int maxDepth) { + final int numFacets = Math.max(1, TestUtil.nextInt(random(), -1, 3)); // 3/5th chance of being '1' + Map results = new LinkedHashMap<>(); + for (int i = 0; i < numFacets; i++) { + if (keyCounter.get() < 3) { // a hard limit on the total number of facets (regardless of depth) to reduce OOM risk + + final String sort = randomSortParam(random()); + final Integer limit = randomLimitParam(random(), sort); + final Integer overrequest = randomOverrequestParam(random()); + final TermFacet facet = new TermFacet(field((random().nextBoolean() + ? STR_FIELD_SUFFIXES : INT_FIELD_SUFFIXES), + random().nextInt(MAX_FIELD_NUM)), + limit, overrequest, sort); + results.put("facet_" + keyCounter.incrementAndGet(), facet); + if (0 < maxDepth) { + // if we're going wide, don't go deep + final int nextMaxDepth = Math.max(0, maxDepth - numFacets); + facet.subFacets.putAll(buildRandomFacets(keyCounter, TestUtil.nextInt(random(), 0, nextMaxDepth))); + } + } + } + return results; + } + } + + /** + * returns a random SolrClient -- either a CloudSolrClient, or an HttpSolrClient pointed + * at a node in our cluster + */ + public static SolrClient getRandClient(Random rand) { + int numClients = CLIENTS.size(); + int idx = TestUtil.nextInt(rand, 0, numClients); + + return (idx == numClients) ? CLOUD_CLIENT : CLIENTS.get(idx); + } + + /** + * Uses a random SolrClient to execture a request and returns only the numFound + * @see #getRandClient + */ + public static long getNumFound(final SolrParams req) throws SolrServerException, IOException { + return getRandClient(random()).query(req).getResults().getNumFound(); + } + + public static void waitForRecoveriesToFinish(CloudSolrClient client) throws Exception { + assert null != client.getDefaultCollection(); + AbstractDistribZkTestBase.waitForRecoveriesToFinish(client.getDefaultCollection(), + client.getZkStateReader(), + true, true, 330); + } + +} diff --git a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java index d5620767946..84c34e1c95b 100644 --- a/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java +++ b/solr/core/src/test/org/apache/solr/search/QueryEqualityTest.java @@ -1015,6 +1015,16 @@ public class QueryEqualityTest extends SolrTestCaseJ4 { "currency(amount,USD)", "currency('amount',USD)"); } + public void testFuncRelatedness() throws Exception { + SolrQueryRequest req = req("fore","foo_s:front", "back","foo_s:back"); + try { + assertFuncEquals(req, + "agg_relatedness({!query v='foo_s:front'}, {!query v='foo_s:back'})", + "agg_relatedness($fore, $back)"); + } finally { + req.close(); + } + } public void testTestFuncs() throws Exception { assertFuncEquals("sleep(1,5)", "sleep(1,5)"); diff --git a/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java index 8eed4c71ba9..c7ce7c31e7e 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java +++ b/solr/core/src/test/org/apache/solr/search/facet/DebugAgg.java @@ -19,6 +19,7 @@ package org.apache.solr.search.facet; import java.io.IOException; import java.util.concurrent.atomic.AtomicLong; +import java.util.function.IntFunction; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.queries.function.ValueSource; @@ -88,8 +89,8 @@ public class DebugAgg extends AggValueSource { } @Override - public void collect(int doc, int slot) throws IOException { - sub.collect(doc, slot); + public void collect(int doc, int slot, IntFunction slotContext) throws IOException { + sub.collect(doc, slot, slotContext); } @Override @@ -126,8 +127,8 @@ public class DebugAgg extends AggValueSource { } @Override - public int collect(DocSet docs, int slot) throws IOException { - return sub.collect(docs, slot); + public int collect(DocSet docs, int slot, IntFunction slotContext) throws IOException { + return sub.collect(docs, slot, slotContext); } @Override diff --git a/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java b/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java index 0612755fb4c..ea3b5ef54e1 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java +++ b/solr/core/src/test/org/apache/solr/search/facet/DistributedFacetSimpleRefinementLongTailTest.java @@ -373,7 +373,10 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute NamedList all_facets = (NamedList) queryServer ( params( "q", "*:*", "shards", getShardsString(), "rows" , "0", "json.facet", "{ foo : { " + commonJson + " field: foo_s, facet: { " + - ALL_STATS_JSON + " bar: { " + commonJson + " field: bar_s, facet: { " + ALL_STATS_JSON + "} } } } }" + ALL_STATS_JSON + " bar: { " + commonJson + " field: bar_s, facet: { " + ALL_STATS_JSON + + // under bar, in addition to "ALL" simple stats, we also ask for skg... + ", skg : 'relatedness($skg_fore,$skg_back)' } } } } }", + "skg_fore", STAT_FIELD+":[0 TO 40]", "skg_back", STAT_FIELD+":[-10000 TO 10000]" ) ).getResponse().get("facets"); assertNotNull(all_facets); @@ -411,7 +414,7 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute List tail_bar_buckets = (List) ((NamedList)tail_Bucket.get("bar")).get("buckets"); NamedList tailB_Bucket = tail_bar_buckets.get(0); - assertEquals(ALL_STATS.size() + 2, tailB_Bucket.size()); // val,count ... NO SUB FACETS + assertEquals(ALL_STATS.size() + 3, tailB_Bucket.size()); // val,count,skg ... NO SUB FACETS assertEquals("tailB", tailB_Bucket.get("val")); assertEquals(17L, tailB_Bucket.get("count")); assertEquals(35L, tailB_Bucket.get("min")); @@ -423,6 +426,18 @@ public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistribute assertEquals(16910.0D, (double) tailB_Bucket.get("sumsq"), 0.1E-7); // assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725 assertEquals(1.70782513D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev" + + // check the SKG stats on our tailB bucket + NamedList tailB_skg = (NamedList) tailB_Bucket.get("skg"); + assertEquals(tailB_skg.toString(), + 3, tailB_skg.size()); + assertEquals(0.19990D, tailB_skg.get("relatedness")); + assertEquals(0.00334D, tailB_skg.get("foreground_popularity")); + assertEquals(0.00334D, tailB_skg.get("background_popularity")); + //assertEquals(12L, tailB_skg.get("foreground_count")); + //assertEquals(82L, tailB_skg.get("foreground_size")); + //assertEquals(12L, tailB_skg.get("background_count")); + //assertEquals(3591L, tailB_skg.get("background_size")); } } diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java index 1e62cb2bd11..687adde8e87 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacetRefinement.java @@ -407,6 +407,75 @@ public class TestJsonFacetRefinement extends SolrTestCaseHS { "}" ); + // test that SKG stat reflects merged refinement + client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]", + "json.facet", "{" + + " cat0:{ ${terms} type:terms, field: ${cat_s}, " + + " sort:'count desc', limit:1, overrequest:0, refine:true, " + + " facet:{ s:'relatedness($fore,$back)'} } }") + , "facets=={ count:8, cat0:{ buckets:[ " + + " { val:A, count:4, " + + " s : { relatedness: 0.00496, " + //+ " foreground_count: 3, " + //+ " foreground_size: 5, " + //+ " background_count: 2, " + //+ " background_size: 4, " + + " foreground_popularity: 0.75, " + + " background_popularity: 0.5, " + + " } } ] }" + + "}" + ); + + // SKG under nested facet where some terms only exist on one shard + { + // sub-bucket order should change as sort direction changes + final String jsonFacet = "" + + "{ processEmpty:true, " + + " cat0:{ ${terms} type:terms, field: ${cat_s}, " + + " sort:'count desc', limit:1, overrequest:0, refine:true, " + + " facet:{ processEmpty:true, " + + " qw1: { ${terms} type:terms, field: ${qw_s}, mincount:0, " + + " sort:'${skg_sort}', limit:100, overrequest:0, refine:true, " + + " facet:{ processEmpty:true, skg:'relatedness($fore,$back)' } } } } }"; + final String bucketQ = "" + + " { val:Q, count:1, " + + " skg : { relatedness: 1.0, " + + " foreground_popularity: 0.25, " + + " background_popularity: 0.0, " + // + " foreground_count: 1, " + // + " foreground_size: 3, " + // + " background_count: 0, " + // + " background_size: 4, " + + " } },"; + final String bucketW = "" + + " { val:W, count:1, " + + " skg : { relatedness: 0.0037, " + + " foreground_popularity: 0.25, " + + " background_popularity: 0.25, " + // + " foreground_count: 1, " + // + " foreground_size: 3, " + // + " background_count: 1, " + // + " background_size: 4, " + + " } },"; + + client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]", + "skg_sort", "skg desc", "json.facet", jsonFacet) + , "facets=={ count:8, cat0:{ buckets:[ " + + " { val:A, count:4, " + + " qw1 : { buckets:[" + + bucketQ + + bucketW + + " ] } } ] } }"); + client.testJQ(params(p, "rows", "0", "q", "*:*", "fore", "${xy_s}:X", "back", "${num_d}:[0 TO 100]", + "skg_sort", "skg asc", "json.facet", jsonFacet) + , "facets=={ count:8, cat0:{ buckets:[ " + + " { val:A, count:4, " + + " qw1 : { buckets:[" + + bucketW + + bucketQ + + " ] } } ] } }"); + } + // test partial buckets (field facet within field facet) client.testJQ(params(p, "q", "*:*", "json.facet", "{" + diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java index b6afdb8bc62..4402d78a50c 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java @@ -212,6 +212,245 @@ public class TestJsonFacets extends SolrTestCaseHS { client.commit(); } + @Test + public void testExplicitQueryDomain() throws Exception { + Client client = Client.localClient(); + indexSimple(client); + + { // simple 'query' domain + + // the facet buckets for all of the requests below should be identical + // only the numFound & top level facet count should differ + final String expectedFacets + = "facets/w=={ buckets:[" + + " { val:'NJ', count:2}, " + + " { val:'NY', count:1} ] }"; + + assertJQ(req("rows", "0", "q", "cat_s:B", "json.facet", + "{w: {type:terms, field:'where_s'}}"), + "response/numFound==3", + "facets/count==3", + expectedFacets); + assertJQ(req("rows", "0", "q", "id:3", "json.facet", + "{w: {type:terms, field:'where_s', domain: { query:'cat_s:B' }}}"), + "response/numFound==1", + "facets/count==1", + expectedFacets); + assertJQ(req("rows", "0", "q", "*:*", "fq", "-*:*", "json.facet", + "{w: {type:terms, field:'where_s', domain: { query:'cat_s:B' }}}"), + "response/numFound==0", + "facets/count==0", + expectedFacets); + assertJQ(req("rows", "0", "q", "*:*", "fq", "-*:*", "domain_q", "cat_s:B", "json.facet", + "{w: {type:terms, field:'where_s', domain: { query:{param:domain_q} }}}"), + "response/numFound==0", + "facets/count==0", + expectedFacets); + } + + { // a nested explicit query domain + + // for all of the "top" buckets, the subfacet should have identical sub-buckets + final String expectedSubBuckets = "{ buckets:[ { val:'B', count:3}, { val:'A', count:2} ] }"; + assertJQ(req("rows", "0", "q", "num_i:[0 TO *]", "json.facet", + "{w: {type:terms, field:'where_s', " + + " facet: { c: { type:terms, field:'cat_s', domain: { query:'*:*' }}}}}") + , "facets/w=={ buckets:[" + + " { val:'NJ', count:2, c: " + expectedSubBuckets + "}, " + + " { val:'NY', count:1, c: " + expectedSubBuckets + "} " + + "] }" + ); + } + + { // an (effectively) empty query should produce an error + ignoreException("'query' domain can not be null"); + ignoreException("'query' domain must not evaluate to an empty list"); + for (String raw : Arrays.asList("null", "[ ]", "{param:bogus}")) { + expectThrows(SolrException.class, () -> { + assertJQ(req("rows", "0", "q", "num_i:[0 TO *]", "json.facet", + "{w: {type:terms, field:'where_s', " + + " facet: { c: { type:terms, field:'cat_s', domain: { query: "+raw+" }}}}}")); + }); + } + } + } + + + @Test + public void testSimpleSKG() throws Exception { + Client client = Client.localClient(); + indexSimple(client); + + // using relatedness() as a top level stat, not nested under any facet + // (not particularly useful, but shouldn't error either) + assertJQ(req("q", "cat_s:[* TO *]", "rows", "0", + "fore", "where_s:NY", "back", "*:*", + "json.facet", " { skg: 'relatedness($fore,$back)' }") + , "facets=={" + + " count:5, " + + " skg : { relatedness: 0.00699," + + " foreground_popularity: 0.33333," + + " background_popularity: 0.83333," + + " } }" + ); + + // simple single level facet w/skg stat & sorting + for (String sort : Arrays.asList("index asc", "skg desc")) { + // the relatedness score of each of our cat_s values is (conviniently) also alphabetical order + // so both of these sort options should produce identical output + // and testinging "index" sort allows the randomized use of "stream" processor as default to be tested + assertJQ(req("q", "cat_s:[* TO *]", "rows", "0", + "fore", "where_s:NY", "back", "*:*", + "json.facet", "" + + "{x: { type: terms, field: 'cat_s', sort: '"+sort+"', " + + " facet: { skg: 'relatedness($fore,$back)' } } }") + , "facets=={count:5, x:{ buckets:[" + + " { val:'A', count:2, " + + " skg : { relatedness: 0.00554, " + //+ " foreground_count: 1, " + //+ " foreground_size: 2, " + //+ " background_count: 2, " + //+ " background_size: 6," + + " foreground_popularity: 0.16667," + + " background_popularity: 0.33333, }," + + " }, " + + " { val:'B', count:3, " + + " skg : { relatedness: 0.0, " // perfectly average and uncorrolated + //+ " foreground_count: 1, " + //+ " foreground_size: 2, " + //+ " background_count: 3, " + //+ " background_size: 6," + + " foreground_popularity: 0.16667," + + " background_popularity: 0.5 }," + + " } ] } } " + ); + } + + // SKG used in multiple nested facets + // + // we'll re-use these params in 2 requests, one will simulate a shard request + final SolrParams nestedSKG = params + ("q", "cat_s:[* TO *]", "rows", "0", "fore", "num_i:[-1000 TO 0]", "back", "*:*", "json.facet" + , "{x: { type: terms, field: 'cat_s', sort: 'skg desc', " + + " facet: { skg: 'relatedness($fore,$back)', " + + " y: { type: terms, field: 'where_s', sort: 'skg desc', " + + " facet: { skg: 'relatedness($fore,$back)' } } } } }"); + + // plain old request + assertJQ(req(nestedSKG) + , "facets=={count:5, x:{ buckets:[" + + " { val:'B', count:3, " + + " skg : { relatedness: 0.01539, " + //+ " foreground_count: 2, " + //+ " foreground_size: 2, " + //+ " background_count: 3, " + //+ " background_size: 6, " + + " foreground_popularity: 0.33333," + + " background_popularity: 0.5 }," + + " y : { buckets:[" + + " { val:'NY', count: 1, " + + " skg : { relatedness: 0.00554, " + //+ " foreground_count: 1, " + //+ " foreground_size: 2, " + //+ " background_count: 2, " + //+ " background_size: 6, " + + " foreground_popularity: 0.16667, " + + " background_popularity: 0.33333, " + + " } }, " + + " { val:'NJ', count: 2, " + + " skg : { relatedness: 0.0, " // perfectly average and uncorrolated + //+ " foreground_count: 1, " + //+ " foreground_size: 2, " + //+ " background_count: 3, " + //+ " background_size: 6, " + + " foreground_popularity: 0.16667, " + + " background_popularity: 0.5, " + + " } }, " + + " ] } " + + " }, " + + " { val:'A', count:2, " + + " skg : { relatedness:-0.01097, " + //+ " foreground_count: 0, " + //+ " foreground_size: 2, " + //+ " background_count: 2, " + //+ " background_size: 6," + + " foreground_popularity: 0.0," + + " background_popularity: 0.33333 }," + + " y : { buckets:[" + + " { val:'NJ', count: 1, " + + " skg : { relatedness: 0.0, " // perfectly average and uncorrolated + //+ " foreground_count: 0, " + //+ " foreground_size: 0, " + //+ " background_count: 3, " + //+ " background_size: 6, " + + " foreground_popularity: 0.0, " + + " background_popularity: 0.5, " + + " } }, " + + " { val:'NY', count: 1, " + + " skg : { relatedness: 0.0, " // perfectly average and uncorrolated + //+ " foreground_count: 0, " + //+ " foreground_size: 0, " + //+ " background_count: 2, " + //+ " background_size: 6, " + + " foreground_popularity: 0.0, " + + " background_popularity: 0.33333, " + + " } }, " + + " ] } } ] } } "); + + // same request, but with whitebox params testing isShard + // to verify the raw counts/sizes + assertJQ(req(nestedSKG, + // fake an initial shard request + "distrib", "false", "isShard", "true", "_facet_", "{}", "shards.purpose", "2097216") + , "facets=={count:5, x:{ buckets:[" + + " { val:'B', count:3, " + + " skg : { " + + " foreground_count: 2, " + + " foreground_size: 2, " + + " background_count: 3, " + + " background_size: 6 }, " + + " y : { buckets:[" + + " { val:'NY', count: 1, " + + " skg : { " + + " foreground_count: 1, " + + " foreground_size: 2, " + + " background_count: 2, " + + " background_size: 6, " + + " } }, " + + " { val:'NJ', count: 2, " + + " skg : { " + + " foreground_count: 1, " + + " foreground_size: 2, " + + " background_count: 3, " + + " background_size: 6, " + + " } }, " + + " ] } " + + " }, " + + " { val:'A', count:2, " + + " skg : { " + + " foreground_count: 0, " + + " foreground_size: 2, " + + " background_count: 2, " + + " background_size: 6 }," + + " y : { buckets:[" + + " { val:'NJ', count: 1, " + + " skg : { " + + " foreground_count: 0, " + + " foreground_size: 0, " + + " background_count: 3, " + + " background_size: 6, " + + " } }, " + + " { val:'NY', count: 1, " + + " skg : { " + + " foreground_count: 0, " + + " foreground_size: 0, " + + " background_count: 2, " + + " background_size: 6, " + + " } }, " + + " ] } } ] } } "); + + } + @Test public void testRepeatedNumerics() throws Exception { Client client = Client.localClient(); diff --git a/solr/solr-ref-guide/src/json-facet-api.adoc b/solr/solr-ref-guide/src/json-facet-api.adoc index 4f0ec7d3e4b..82501178571 100644 --- a/solr/solr-ref-guide/src/json-facet-api.adoc +++ b/solr/solr-ref-guide/src/json-facet-api.adoc @@ -1,4 +1,5 @@ = JSON Facet API +:page-tocclass: right [[JSONFacetAPI]] == Facet & Analytics Module @@ -338,17 +339,18 @@ Aggregation functions, also called *facet functions, analytic functions,* or **m [width="100%",cols="10%,30%,60%",options="header",] |=== |Aggregation |Example |Description -|sum |sum(sales) |summation of numeric values -|avg |avg(popularity) |average of numeric values -|min |min(salary) |minimum value -|max |max(mul(price,popularity)) |maximum value -|unique |unique(author) |number of unique values of the given field. Beyond 100 values it yields not exact estimate -|uniqueBlock |uniqueBlock(\_root_) |same as above with smaller footprint strictly requires <>. The given field is expected to be unique across blocks, now only singlevalued string fields are supported, docValues are recommended. -|hll |hll(author) |distributed cardinality estimate via hyper-log-log algorithm -|percentile |percentile(salary,50,75,99,99.9) |Percentile estimates via t-digest algorithm. When sorting by this metric, the first percentile listed is used as the sort value. -|sumsq |sumsq(rent) |sum of squares of field or function -|variance |variance(rent) |variance of numeric field or function -|stddev |stddev(rent) |standard deviation of field or function +|sum |`sum(sales)` |summation of numeric values +|avg |`avg(popularity)` |average of numeric values +|min |`min(salary)` |minimum value +|max |`max(mul(price,popularity))` |maximum value +|unique |`unique(author)` |number of unique values of the given field. Beyond 100 values it yields not exact estimate +|uniqueBlock |`uniqueBlock(\_root_)` |same as above with smaller footprint strictly requires <>. The given field is expected to be unique across blocks, now only singlevalued string fields are supported, docValues are recommended. +|hll |`hll(author)` |distributed cardinality estimate via hyper-log-log algorithm +|percentile |`percentile(salary,50,75,99,99.9)` |Percentile estimates via t-digest algorithm. When sorting by this metric, the first percentile listed is used as the sort value. +|sumsq |`sumsq(rent)` |sum of squares of field or function +|variance |`variance(rent)` |variance of numeric field or function +|stddev |`stddev(rent)` |standard deviation of field or function +|relatedness |`relatedness('popularity:[100 TO \*]','inStock:true')`|A function for computing a relatedness score of the documents in the domain to a Foreground set, relative to a Background set (both defined as queries). This is primarily for use when building <>. |=== Numeric aggregation functions such as `avg` can be on any numeric field, or on another function of multiple numeric fields such as `avg(mul(price,popularity))`. @@ -514,6 +516,126 @@ Aggregation `uniqueBlock(\_root_)` is functionally equivalent to `unique(\_root_ It's recommended to define `limit: -1` for `uniqueBlock` calculation, like in above example, since default value of `limit` parameter is `10`, while `uniqueBlock` is supposed to be much faster with `-1`. +== Semantic Knowledge Graphs + +The `relatedness(...)` aggregation functions allows for sets of documents to be scored relative to Foreground and Background sets of documents, for the purposes of finding ad-hoc relationships that make up a "Semantic Knowledge Graph": + +[quote, Grainger et al., 'https://arxiv.org/abs/1609.00464[The Semantic Knowledge Graph]'] +____ +At its heart, the Semantic Knowledge Graph leverages an inverted index, along with a complementary uninverted index, to represent nodes (terms) and edges (the documents within intersecting postings lists for multiple terms/nodes). This provides a layer of indirection between each pair of nodes and their corresponding edge, enabling edges to materialize dynamically from underlying corpus statistics. As a result, any combination of nodes can have edges to any other nodes materialize and be scored to reveal latent relationships between the nodes. +____ + +The `relatedness(...)` function is used to "score" these relationships, relative to "Foreground" and "Background" sets of documents, specified in the function params as queries. + +Unlike most aggregation functions, the `relatedness(...)` function is aware of if/how it's used in <>. It evaluates the query defining the current bucket _independently_ from it's parent/ancestor buckets, and intersects those documents with a "Foreground Set" defined by the foreground query _combined with the ancestor buckets_. The result is then compared to a similar intersection done against the "Background Set" (defined exclusively by background query) to see if there is a positive, or negative, correlation between the current bucket and the Foreground Set, relative to the Background Set. + +=== Semantic Knowledge Graph Example + + +.Sample Documents +[source,bash,subs="verbatim,callouts"] +---- +curl -sS -X POST 'http://localhost:8983/solr/gettingstarted/update?commit=true' -d '[ +{"id":"01",age:15,"state":"AZ","hobbies":["soccer","painting","cycling"]}, +{"id":"02",age:22,"state":"AZ","hobbies":["swimming","darts","cycling"]}, +{"id":"03",age:27,"state":"AZ","hobbies":["swimming","frisbee","painting"]}, +{"id":"04",age:33,"state":"AZ","hobbies":["darts"]}, +{"id":"05",age:42,"state":"AZ","hobbies":["swimming","golf","painting"]}, +{"id":"06",age:54,"state":"AZ","hobbies":["swimming","golf"]}, +{"id":"07",age:67,"state":"AZ","hobbies":["golf","painting"]}, +{"id":"08",age:71,"state":"AZ","hobbies":["painting"]}, +{"id":"09",age:14,"state":"CO","hobbies":["soccer","frisbee","skiing","swimming","skating"]}, +{"id":"10",age:23,"state":"CO","hobbies":["skiing","darts","cycling","swimming"]}, +{"id":"11",age:26,"state":"CO","hobbies":["skiing","golf"]}, +{"id":"12",age:35,"state":"CO","hobbies":["golf","frisbee","painting","skiing"]}, +{"id":"13",age:47,"state":"CO","hobbies":["skiing","darts","painting","skating"]}, +{"id":"14",age:51,"state":"CO","hobbies":["skiing","golf"]}, +{"id":"15",age:64,"state":"CO","hobbies":["skating","cycling"]}, +{"id":"16",age:73,"state":"CO","hobbies":["painting"]}, +]' +---- + +.Example Query +[source,bash,subs="verbatim,callouts"] +---- +curl -sS -X POST http://localhost:8983/solr/gettingstarted/query -d 'rows=0&q=*:* +&back=*:* # <1> +&fore=age:[35 TO *] # <2> +&json.facet={ + hobby : { + type : terms, + field : hobbies, + limit : 5, + sort : { r1: desc }, # <3> + facet : { + r1 : "relatedness($fore,$back)", # <4> + location : { + type : terms, + field : state, + limit : 2, + sort : { r2: desc }, # <3> + facet : { + r2 : "relatedness($fore,$back)" # <4> + } + } + } + } +}' +---- +<1> Use the entire collection as our "Background Set" +<2> Use a query for "age >= 35" to define our (initial) "Foreground Set" +<3> For both the top level `hobbies` facet & the sub-facet on `state` we will be sorting on the `relatedness(...)` values +<4> In both calls to the `relatedness(...)` function, we use <> to refer to the previously defined `fore` and `back` queries. + +.The Facet Response +[source,javascript,subs="verbatim,callouts"] +---- +"facets":{ + "count":16, + "hobby":{ + "buckets":[{ + "val":"golf", + "count":6, // <1> + "r1":{ + "relatedness":0.01225, + "foreground_popularity":0.3125, // <2> + "background_popularity":0.375}, // <3> + "location":{ + "buckets":[{ + "val":"az", + "count":3, + "r2":{ + "relatedness":0.00496, // <4> + "foreground_popularity":0.1875, // <6> + "background_popularity":0.5}}, // <7> + { + "val":"co", + "count":3, + "r2":{ + "relatedness":-0.00496, // <5> + "foreground_popularity":0.125, + "background_popularity":0.5}}]}}, + { + "val":"painting", + "count":8, // <1> + "r1":{ + "relatedness":0.01097, + "foreground_popularity":0.375, + "background_popularity":0.5}, + "location":{ + "buckets":[{ + ... +---- +<1> Even though `hobbies:golf` has a lower total facet `count` then `hobbies:painting`, it has a higher `relatedness` score, indicating that relative to the Background Set (the entire collection) Golf has a stronger correlation to our Foreground Set (people age 35+) then Painting. +<2> The number of documents matching `age:[35 TO *]` _and_ `hobbies:golf` is 31.25% of the total number of documents in the Background Set +<3> 37.5% of the documents in the Background Set match `hobbies:golf` +<4> The state of Arizona (AZ) has a _positive_ relatedness correlation with the _nested_ Foreground Set (people ages 35+ who play Golf) compared to the Background Set -- ie: "People in Arizona are statistically more likely to be '35+ year old Golfers' then the country as a whole." +<5> The state of Colorado (CO) has a _negative_ correlation with the nested Foreground Set -- ie: "People in Colorado are statistically less likely to be '35+ year old Golfers' then the country as a whole." +<6> The number documents matching `age:[35 TO *]` _and_ `hobbies:golf` _and_ `state:AZ` is 18.75% of the total number of documents in the Background Set +<7> 50% of the documents in the Background Set match `state:AZ` + +NOTE: While it's very common to define the Background Set as `\*:*`, or some other super-set of the Foreground Query, it is not strictly required. The `relatedness(...)` function can be used to compare the statistical relatedness of sets of documents to orthogonal foreground/background queries. + [[References]] == References