From 40e2122b5a5b89f446e51692ef0d72e48c7b71e5 Mon Sep 17 00:00:00 2001 From: Michael Gibney Date: Thu, 9 Jul 2020 18:42:37 -0700 Subject: [PATCH] SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()" This adds a lot of "under the covers" improvements to how JSON Faceting FacetField processors work, to enable "sweeping" support when the SlotAcc used for sorting support it (currently just "relatedness()") This is a squash commit of all changes on https://github.com/magibney/lucene-solr/tree/SOLR-13132 Up to and including ca7a8e0b39840d00af9022c048346a7d84bf280d. Co-authored-by: Chris Hostetter Co-authored-by: Michael Gibney --- solr/CHANGES.txt | 3 + .../search/facet/FacetFieldProcessor.java | 79 +++++- .../facet/FacetFieldProcessorByArray.java | 34 +++ .../facet/FacetFieldProcessorByArrayDV.java | 179 ++++++++---- .../solr/search/facet/FacetProcessor.java | 1 - .../search/facet/ReadOnlyCountSlotAcc.java | 31 ++ .../solr/search/facet/RelatednessAgg.java | 174 +++++++++++- .../solr/search/facet/SingletonDISI.java | 48 ++++ .../search/facet/SingletonDocIterator.java | 52 ++++ .../org/apache/solr/search/facet/SlotAcc.java | 218 +++++++++++++- .../solr/search/facet/SweepCountAware.java | 187 ++++++++++++ .../apache/solr/search/facet/SweepDISI.java | 85 ++++++ .../solr/search/facet/SweepDocIterator.java | 87 ++++++ .../solr/search/facet/UnInvertedField.java | 71 +++-- .../apache/solr/search/facet/UnionDISI.java | 100 +++++++ .../solr/search/facet/UnionDocIterator.java | 107 +++++++ .../facet/TestCloudJSONFacetSKGEquiv.java | 268 ++++++++++++++++-- .../solr/search/facet/TestJsonFacets.java | 48 ++++ solr/solr-ref-guide/src/json-facet-api.adoc | 4 + 19 files changed, 1663 insertions(+), 113 deletions(-) create mode 100644 solr/core/src/java/org/apache/solr/search/facet/ReadOnlyCountSlotAcc.java create mode 100644 solr/core/src/java/org/apache/solr/search/facet/SingletonDISI.java create mode 100644 
solr/core/src/java/org/apache/solr/search/facet/SingletonDocIterator.java create mode 100644 solr/core/src/java/org/apache/solr/search/facet/SweepCountAware.java create mode 100644 solr/core/src/java/org/apache/solr/search/facet/SweepDISI.java create mode 100644 solr/core/src/java/org/apache/solr/search/facet/SweepDocIterator.java create mode 100644 solr/core/src/java/org/apache/solr/search/facet/UnionDISI.java create mode 100644 solr/core/src/java/org/apache/solr/search/facet/UnionDocIterator.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 68880e95dfe..6a0d48da302 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -129,6 +129,9 @@ Optimizations * SOLR-14610: ReflectMapWriter to use MethodHandle instead of old reflection (noble) +* SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()" + (hossman, Michael Gibney) + Bug Fixes --------------------- (No changes) diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java index e3af5b37a58..c7b31e16540 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java @@ -33,6 +33,7 @@ import java.util.function.IntFunction; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.Query; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.PriorityQueue; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.SimpleOrderedMap; @@ -40,6 +41,8 @@ import org.apache.solr.schema.FieldType; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.DocSet; import org.apache.solr.search.facet.SlotAcc.SlotContext; +import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc; +import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc; import static 
org.apache.solr.search.facet.FacetContext.SKIP_FACET; @@ -116,7 +119,6 @@ abstract class FacetFieldProcessor extends FacetProcessor { // allow a custom count acc to be used if (countAcc == null) { countAcc = new SlotAcc.CountSlotArrAcc(fcontext, slotCount); - countAcc.key = "count"; } if (accs != null) { @@ -509,12 +511,12 @@ abstract class FacetFieldProcessor extends FacetProcessor { /** Helper method used solely when looping over buckets to be returned in findTopSlots */ private void fillBucketFromSlot(SimpleOrderedMap target, Slot slot, SlotAcc resortAcc) throws IOException { - final long count = countAcc.getCount(slot.slot); - target.add("count", count); - if (count <= 0 && !freq.processEmpty) return; + final int slotOrd = slot.slot; + countAcc.setValues(target, slotOrd); + if (countAcc.getCount(slotOrd) <= 0 && !freq.processEmpty) return; - if (collectAcc != null && slot.slot >= 0) { - collectAcc.setValues(target, slot.slot); + if (slotOrd >= 0 && collectAcc != null) { + collectAcc.setValues(target, slotOrd); } if (otherAccs == null && freq.subFacets.isEmpty()) return; @@ -689,7 +691,7 @@ abstract class FacetFieldProcessor extends FacetProcessor { } } - static class MultiAcc extends SlotAcc { + static class MultiAcc extends SlotAcc implements SweepableSlotAcc { final SlotAcc[] subAccs; MultiAcc(FacetContext fcontext, SlotAcc[] subAccs) { @@ -741,6 +743,65 @@ abstract class FacetFieldProcessor extends FacetProcessor { acc.setValues(bucket, slotNum); } } + + @Override + public SlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) { + final FacetFieldProcessor p = (FacetFieldProcessor) fcontext.processor; + int j = 0; + for (int i = 0; i < subAccs.length; i++) { + final SlotAcc acc = subAccs[i]; + if (acc instanceof SweepableSlotAcc) { + SlotAcc replacement = ((SweepableSlotAcc)acc).registerSweepingAccs(baseSweepingAcc); + if (replacement == null) { + // drop acc, do not increment j + continue; + } else if (replacement != acc || j < i) { + subAccs[j] 
= replacement; + } + } else if (j < i) { + subAccs[j] = acc; + } + j++; + } + switch (j) { + case 0: + return null; + case 1: + return subAccs[0]; + default: + if (j == subAccs.length) { + return this; + } else { + // must resize final field subAccs + return new MultiAcc(fcontext, ArrayUtil.copyOfSubArray(subAccs, 0, j)); + } + } + } + } + + /** + * Helper method that subclasses can use to indicate they with to use sweeping. + * If {@link #countAcc} and {@link #collectAcc} support sweeping, then this method will: + *
    + *
  • replace {@link #collectAcc} with it's sweeping equivalent
  • + *
  • update {@link #allBucketsAcc}'s reference to {@link #collectAcc} (if it exists)
  • + *
+ * + * @return true if the above actions were taken + * @see SweepableSlotAcc + * @see SweepingCountSlotAcc + */ + protected boolean registerSweepingAccIfSupportedByCollectAcc() { + if (countAcc instanceof SweepingCountSlotAcc && collectAcc instanceof SweepableSlotAcc) { + final SweepingCountSlotAcc sweepingCountAcc = (SweepingCountSlotAcc)countAcc; + collectAcc = ((SweepableSlotAcc)collectAcc).registerSweepingAccs(sweepingCountAcc); + if (allBucketsAcc != null) { + allBucketsAcc.collectAcc = collectAcc; + allBucketsAcc.sweepingCountAcc = sweepingCountAcc; + } + return true; + } + return false; } private static final SlotContext ALL_BUCKETS_SLOT_CONTEXT = new SlotContext(null) { @@ -766,6 +827,7 @@ abstract class FacetFieldProcessor extends FacetProcessor { int collectAccSlot; int otherAccsSlot; long count; + SweepingCountSlotAcc sweepingCountAcc; // null unless/until sweeping is initialized SpecialSlotAcc(FacetContext fcontext, SlotAcc collectAcc, int collectAccSlot, SlotAcc[] otherAccs, int otherAccsSlot) { super(fcontext); @@ -822,6 +884,9 @@ abstract class FacetFieldProcessor extends FacetProcessor { @Override public void setValues(SimpleOrderedMap bucket, int slotNum) throws IOException { + if (sweepingCountAcc != null) { + sweepingCountAcc.setSweepValues(bucket, collectAccSlot); + } if (collectAcc != null) { collectAcc.setValues(bucket, collectAccSlot); } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java index dff72b47492..18cf46dc029 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArray.java @@ -27,6 +27,7 @@ import org.apache.lucene.search.Query; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.facet.SlotAcc.SlotContext; +import 
org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc; import static org.apache.solr.search.facet.FacetContext.SKIP_FACET; @@ -34,6 +35,9 @@ import static org.apache.solr.search.facet.FacetContext.SKIP_FACET; * Base class for DV/UIF accumulating counts into an array by ordinal. It's * for {@link org.apache.lucene.index.SortedDocValues} and {@link org.apache.lucene.index.SortedSetDocValues} only. * It can handle terms (strings), not numbers directly but those encoded as terms, and is multi-valued capable. + * By default, this class assumes subclasses can support sweeping collection unless subclasses initialize countAcc directly in their constructors. + * + * @see SweepingCountSlotAcc */ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor { BytesRefBuilder prefixRef; @@ -56,6 +60,34 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor { /** this BytesRef may be shared across calls and should be deep-cloned if necessary */ abstract protected BytesRef lookupOrd(int ord) throws IOException; + + /** + * {@inheritDoc} + * + * This impl first initializes countAcc as a {@link SweepingCountSlotAcc} if null. + */ + @Override + protected void createAccs(long docCount, int slotCount) throws IOException { + if (countAcc == null) { + countAcc = new SweepingCountSlotAcc(slotCount, this); + } + super.createAccs(docCount, slotCount); + } + + /** + * {@inheritDoc} + * + * This impl first initializes countAcc as a {@link SweepingCountSlotAcc} if null. 
+ */ + @Override + void createCollectAcc(int numDocs, int numSlots) throws IOException { + if (countAcc == null) { + countAcc = new SweepingCountSlotAcc(numSlots, this); + } + super.createCollectAcc(numDocs, numSlots); + registerSweepingAccIfSupportedByCollectAcc(); + } + @Override public void process() throws IOException { super.process(); @@ -87,8 +119,10 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor { if (freq.allBuckets) { // count is irrelevant, but hardcoded in collect(...), so intercept/mask normal counts. // Set here to prevent createAccs(...) from creating a 1-slot countAcc that will fail with AIOOBE + // NOTE: because collectAcc will be null, it is fine/irrelevant to set a countAcc that doesn't support sweeping countAcc = SlotAcc.DEV_NULL_SLOT_ACC; createAccs(nDocs, 1); + assert collectAcc == null; otherAccs = accs; // accs is created above and set on allBucketsAcc; but during collection, setNextReader is called on otherAccs. allBucketsAcc = new SpecialSlotAcc(fcontext, null, -1, accs, 0); collectDocs(); diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java index dfd1bc18e18..a98ffd059b0 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessorByArrayDV.java @@ -26,14 +26,17 @@ import org.apache.lucene.index.MultiDocValues; import org.apache.lucene.index.OrdinalMap; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LongValues; import org.apache.lucene.util.UnicodeUtil; import org.apache.solr.common.SolrException; import org.apache.solr.schema.SchemaField; -import 
org.apache.solr.search.Filter; +import org.apache.solr.search.facet.SlotAcc.CountSlotAcc; +import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct; +import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc; +import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal; +import org.apache.solr.search.facet.SweepCountAware.SegCountPerSeg; import org.apache.solr.uninverting.FieldCacheImpl; /** @@ -94,6 +97,10 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { return; } + final SweepCountAccStruct base = SweepingCountSlotAcc.baseStructOf(this); + final List others = SweepingCountSlotAcc.otherStructsOf(this); + assert null != base; + // TODO: refactor some of this logic into a base class boolean countOnly = collectAcc==null && allBucketsAcc==null; boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount(); @@ -118,16 +125,21 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { if (freq.perSeg != null) accumSeg = canDoPerSeg && freq.perSeg; // internal - override perSeg heuristic + final int maxSize = others.size() + 1; // others + base final List leaves = fcontext.searcher.getIndexReader().leaves(); - Filter filter = fcontext.base.getTopFilter(); + final DocIdSetIterator[] subIterators = new DocIdSetIterator[maxSize]; + final CountSlotAcc[] activeCountAccs = new CountSlotAcc[maxSize]; for (int subIdx = 0; subIdx < leaves.size(); subIdx++) { LeafReaderContext subCtx = leaves.get(subIdx); setNextReaderFirstPhase(subCtx); - DocIdSet dis = filter.getDocIdSet(subCtx, null); // solr docsets already exclude any deleted docs - DocIdSetIterator disi = dis.iterator(); + final SweepDISI disi = SweepDISI.newInstance(base, others, subIterators, activeCountAccs, subCtx); + if (disi == null) { + continue; + } + LongValues toGlobal = ordinalMap == null ? 
null : ordinalMap.getGlobalOrds(subIdx); SortedDocValues singleDv = null; SortedSetDocValues multiDv = null; @@ -135,7 +147,13 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { // TODO: get sub from multi? multiDv = subCtx.reader().getSortedSetDocValues(sf.getName()); if (multiDv == null) { - multiDv = DocValues.emptySortedSet(); + if (countOnly) { + continue; + } else { + multiDv = DocValues.emptySortedSet(); + } + } else if (countOnly && multiDv.getValueCount() < 1){ + continue; } // some codecs may optimize SortedSet storage for single-valued fields // this will be null if this is not a wrapped single valued docvalues. @@ -145,12 +163,16 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { } else { singleDv = subCtx.reader().getSortedDocValues(sf.getName()); if (singleDv == null) { - singleDv = DocValues.emptySorted(); + if (countOnly) { + continue; + } else { + singleDv = DocValues.emptySorted(); + } + } else if (countOnly && singleDv.getValueCount() < 1) { + continue; } } - LongValues toGlobal = ordinalMap == null ? 
null : ordinalMap.getGlobalOrds(subIdx); - if (singleDv != null) { if (accumSeg) { collectPerSeg(singleDv, disi, toGlobal); @@ -174,7 +196,7 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { } } - reuse = null; // better GC + Arrays.fill(reuse, null); // better GC } @Override @@ -182,9 +204,9 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { return si.lookupOrd(ord); } - private void collectPerSeg(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException { - int segMax = singleDv.getValueCount() + 1; - final int[] counts = getCountArr( segMax ); + private void collectPerSeg(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException { + int segMax = singleDv.getValueCount(); + final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax); /** alternate trial implementations // ord @@ -202,73 +224,110 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) { FieldCacheImpl.SortedDocValuesImpl.Iter fc = (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv; while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - counts[fc.getOrd(doc) + 1]++; + final int segOrd = fc.getOrd(doc); + if (segOrd >= 0) { + final int maxIdx = disi.registerCounts(segCounter); + segCounter.incrementCount(segOrd, 1, maxIdx); + } } } else { while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (singleDv.advanceExact(doc)) { - counts[singleDv.ordValue() + 1]++; + final int segOrd = singleDv.ordValue(); + if (segOrd >= 0) { + final int maxIdx = disi.registerCounts(segCounter); + segCounter.incrementCount(segOrd, 1, maxIdx); + } } } } // convert segment-local counts to global counts - for (int i=1; i 0) { - int slot = toGlobal == null ? 
(i - 1) : (int) toGlobal.get(i - 1); - countAcc.incrementCount(slot, segCount); - } - } + segCounter.register(disi.countAccs, toGlobal, segMax - 1); } - private void collectPerSeg(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException { + private SegCountPerSeg getSegCountPerSeg(SweepDISI disi, int segMax) { + final int size = disi.size; + return new SegCountPerSeg(getSegmentCountArrays(segMax, size), getBoolArr(segMax), segMax, size); + } + + private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedDocValues dv) { + return new SegCountGlobal(disi.countAccs); + } + + private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedSetDocValues dv) { + return new SegCountGlobal(disi.countAccs); + } + + private void collectPerSeg(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException { int segMax = (int)multiDv.getValueCount(); - final int[] counts = getCountArr( segMax ); + final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax); int doc; while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (multiDv.advanceExact(doc)) { - for(;;) { + final int maxIdx = disi.registerCounts(segCounter); + for (;;) { int segOrd = (int)multiDv.nextOrd(); if (segOrd < 0) break; - counts[segOrd]++; + segCounter.incrementCount(segOrd, 1, maxIdx); } } } - for (int i=0; i 0) { - int slot = toGlobal == null ? (i) : (int) toGlobal.get(i); - countAcc.incrementCount(slot, segCount); - } - } + segCounter.register(disi.countAccs, toGlobal, segMax - 1); } - private int[] reuse; - private int[] getCountArr(int maxNeeded) { - if (reuse == null) { + private boolean[] reuseBool; + private boolean[] getBoolArr(int maxNeeded) { + if (reuseBool == null) { // make the count array large enough for any segment // FUTURE: (optionally) directly use the array of the CountAcc for an optimized index.. 
- reuse = new int[(int) si.getValueCount() + 1]; + reuseBool = new boolean[(int) si.getValueCount() + 1]; } else { - Arrays.fill(reuse, 0, maxNeeded, 0); + Arrays.fill(reuseBool, 0, maxNeeded, false); } - return reuse; + return reuseBool; } - private void collectDocs(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException { + private int[][] reuse = new int[12][]; + private int[] getCountArr(int maxNeeded, int idx) { + if (idx >= reuse.length) { + reuse = Arrays.copyOf(reuse, idx + 1); + } + if (reuse[idx] == null) { + // make the count array large enough for any segment + // FUTURE: (optionally) directly use the array of the CountAcc for an optimized index.. + reuse[idx] = new int[(int) si.getValueCount() + 1]; + } else { + Arrays.fill(reuse[idx], 0, maxNeeded, 0); + } + return reuse[idx]; + } + + private int[][] getSegmentCountArrays(int segMax, int size) { + int[][] ret = new int[size][]; + int i = size - 1; + do { + ret[i] = getCountArr(segMax, i); + } while (i-- > 0); + return ret; + } + + private void collectDocs(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException { int doc; + final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv); while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (singleDv.advanceExact(doc)) { + final int maxIdx = disi.registerCounts(segCounter); int segOrd = singleDv.ordValue(); - collect(doc, segOrd, toGlobal); + collect(doc, segOrd, toGlobal, segCounter, maxIdx, disi.collectBase()); } } } - private void collectCounts(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException { + private void collectCounts(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException { + final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv); int doc; if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) { @@ -277,7 +336,8 @@ class FacetFieldProcessorByArrayDV extends 
FacetFieldProcessorByArray { int segOrd = fc.getOrd(doc); if (segOrd < 0) continue; int ord = (int)toGlobal.get(segOrd); - countAcc.incrementCount(ord, 1); + int maxIdx = disi.registerCounts(segCounter); + segCounter.incrementCount(ord, 1, maxIdx); } } else { @@ -286,53 +346,60 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray { if (singleDv.advanceExact(doc)) { int segOrd = singleDv.ordValue(); int ord = (int) toGlobal.get(segOrd); - countAcc.incrementCount(ord, 1); + int maxIdx = disi.registerCounts(segCounter); + segCounter.incrementCount(ord, 1, maxIdx); } } - } } - private void collectDocs(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException { + private void collectDocs(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException { + final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv); int doc; while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (multiDv.advanceExact(doc)) { + final int maxIdx = disi.registerCounts(segCounter); + final boolean collectBase = disi.collectBase(); for(;;) { int segOrd = (int)multiDv.nextOrd(); if (segOrd < 0) break; - collect(doc, segOrd, toGlobal); + collect(doc, segOrd, toGlobal, segCounter, maxIdx, collectBase); } } } } - private void collectCounts(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException { + private void collectCounts(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException { + final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv); int doc; while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (multiDv.advanceExact(doc)) { + final int maxIdx = disi.registerCounts(segCounter); for(;;) { int segOrd = (int)multiDv.nextOrd(); if (segOrd < 0) break; int ord = (int)toGlobal.get(segOrd); - countAcc.incrementCount(ord, 1); + segCounter.incrementCount(ord, 1, maxIdx); } } } } - private void collect(int doc, int segOrd, 
LongValues toGlobal) throws IOException { + private void collect(int doc, int segOrd, LongValues toGlobal, SegCountGlobal segCounter, int maxIdx, boolean collectBase) throws IOException { int ord = (toGlobal != null && segOrd >= 0) ? (int)toGlobal.get(segOrd) : segOrd; int arrIdx = ord - startTermIndex; // This code handles faceting prefixes, which narrows the range of ords we want to collect. // It’s not an error for an ord to fall outside this range… we simply want to skip it. if (arrIdx >= 0 && arrIdx < nTerms) { - countAcc.incrementCount(arrIdx, 1); - if (collectAcc != null) { - collectAcc.collect(doc, arrIdx, slotContext); - } - if (allBucketsAcc != null) { - allBucketsAcc.collect(doc, arrIdx, slotContext); + segCounter.incrementCount(arrIdx, 1, maxIdx); + if (collectBase) { + if (collectAcc != null) { + collectAcc.collect(doc, arrIdx, slotContext); + } + if (allBucketsAcc != null) { + allBucketsAcc.collect(doc, arrIdx, slotContext); + } } } } diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java index c3d84eb2e3d..9170e2488f5 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java @@ -310,7 +310,6 @@ public abstract class FacetProcessor { // allow a custom count acc to be used if (countAcc == null) { countAcc = new SlotAcc.CountSlotArrAcc(fcontext, slotCount); - countAcc.key = "count"; } for (Map.Entry entry : freq.getFacetStats().entrySet()) { diff --git a/solr/core/src/java/org/apache/solr/search/facet/ReadOnlyCountSlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/ReadOnlyCountSlotAcc.java new file mode 100644 index 00000000000..8569324028a --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/ReadOnlyCountSlotAcc.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.search.facet; + +import java.io.IOException; + +/** + * To be implemented by CountSlotAccs that wish to expose a read-only interface + */ +interface ReadOnlyCountSlotAcc { + + public long getCount(int slot); + + public int compare(int slotA, int slotB); + + public Object getValue(int slotNum) throws IOException; +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java b/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java index 10146db4dd9..df4d11f5abc 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java +++ b/solr/core/src/java/org/apache/solr/search/facet/RelatednessAgg.java @@ -37,6 +37,7 @@ import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.search.DocSet; import org.apache.solr.search.QParser; +import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,6 +57,7 @@ public class RelatednessAgg extends AggValueSource { private static final String RELATEDNESS = "relatedness"; private static final String FG_POP = "foreground_popularity"; private static final String BG_POP = "background_popularity"; + public static final String 
SWEEP_COLLECTION = "sweep_collection"; // needed for distrib calculation private static final String FG_SIZE = "foreground_size"; @@ -66,8 +68,11 @@ public class RelatednessAgg extends AggValueSource { final protected Query fgQ; final protected Query bgQ; protected double min_pop = 0.0D; + private Boolean useSweep; public static final String NAME = RELATEDNESS; + private static final boolean DEFAULT_SWEEP_COLLECTION = true; + public RelatednessAgg(Query fgQ, Query bgQ) { super(NAME); // NOTE: ideally we don't want to assume any defaults *yet* if fgQ/bgQ are null @@ -87,7 +92,10 @@ public class RelatednessAgg extends AggValueSource { public void setOpts(QParser parser) { final boolean isShard = parser.getReq().getParams().getBool(ShardParams.IS_SHARD, false); SolrParams opts = parser.getLocalParams(); - if (null != opts) { + if (null == opts) { + this.useSweep = DEFAULT_SWEEP_COLLECTION; + } else { + this.useSweep = opts.getBool(SWEEP_COLLECTION, DEFAULT_SWEEP_COLLECTION); if (!isShard) { // ignore min_pop if this is a shard request this.min_pop = opts.getDouble("min_popularity", 0.0D); } @@ -97,7 +105,7 @@ public class RelatednessAgg extends AggValueSource { @Override public String description() { // TODO: need better output processing when we start supporting null fgQ/bgQ in constructor - return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop+")"; + return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop + ",useSweep="+useSweep+")"; } @Override @@ -163,9 +171,127 @@ public class RelatednessAgg extends AggValueSource { return new Merger(this); } + private static final class SweepSKGSlotAcc extends SlotAcc { + + private final int minCount; // pre-calculate for a given min_popularity + private final long fgSize; + private final long bgSize; + private final ReadOnlyCountSlotAcc fgCount; + private final ReadOnlyCountSlotAcc bgCount; + private double[] relatedness; + + private static final int NO_ALL_BUCKETS = -2; + private static final int 
ALL_BUCKETS_UNINITIALIZED = -1; + + // we can't get the allBuckets info from the slotContext in collect(), b/c the whole point of + // sweep collection is that the "collect" methods aren't called. + // So this is the compromise: note in construction either that we're using a processor w/NO_ALL_BUCKETS + // or that we don't know the bucket yet (ALL_BUCKETS_UNINITIALIZED) and fill it in in getValues + // where we can check against the processor + private int allBucketsSlot; + + public SweepSKGSlotAcc(double minPopularity, FacetContext fcontext, int numSlots, long fgSize, long bgSize, ReadOnlyCountSlotAcc fgCount, ReadOnlyCountSlotAcc bgCount) { + super(fcontext); + this.minCount = (int) Math.ceil(minPopularity * bgSize); + this.fgSize = fgSize; + this.bgSize = bgSize; + this.fgCount = fgCount; + this.bgCount = bgCount; + relatedness = new double[numSlots]; + Arrays.fill(relatedness, 0, numSlots, Double.NaN); + + // any processor that can (currently) result in the use of SweepSKGSlotAcc *should* be a + // FacetFieldProcessor -- but don't assume that will always be true... 
+ this.allBucketsSlot = NO_ALL_BUCKETS; + if (fcontext.processor instanceof FacetFieldProcessor + // NOTE: if this instanceof/cast changes, getValues needs updated as well + && ((FacetFieldProcessor)fcontext.processor).freq.allBuckets) { + this.allBucketsSlot = ALL_BUCKETS_UNINITIALIZED; + } + } + + @Override + public void collect(int perSegDocId, int slot, IntFunction slotContext) throws IOException { + throw new UnsupportedOperationException("collect() not supported, this SlotAcc impl only usable for sweeping"); + } + + @Override + public int collect(DocSet docs, int slot, IntFunction slotContext) throws IOException { + throw new UnsupportedOperationException("collect() not supported, this SlotAcc impl only usable for sweeping"); + } + + private double getRelatedness(int slot) { + final double cachedRelatedness = relatedness[slot]; + if (Double.isNaN(cachedRelatedness)) { + final long fg_count = fgCount.getCount(slot); + final long bg_count = bgCount.getCount(slot); + if (minCount > 0) { + // if min_pop is configured, and either (fg|bg) popularity is lower then that value + // then "this.relatedness=-Infinity" so it sorts below any "valid" relatedness scores + if (fg_count < minCount || bg_count < minCount) { + return relatedness[slot] = Double.NEGATIVE_INFINITY; + } + } + return relatedness[slot] = computeRelatedness(fg_count, fgSize, bg_count, bgSize); + } else { + return cachedRelatedness; + } + } + + public int compare(int slotA, int slotB) { + int r = Double.compare(getRelatedness(slotA), getRelatedness(slotB)); + if (0 == r) { + r = Long.compare(fgCount.getCount(slotA), fgCount.getCount(slotB)); + } + if (0 == r) { + r = Long.compare(bgCount.getCount(slotA), bgCount.getCount(slotB)); + } + return r; + } + + @Override + public Object getValue(int slotNum) { + final BucketData slotVal; + if (NO_ALL_BUCKETS != allBucketsSlot) { + // there's no reason why a processor should be resizing SlotAccs in the middle of getValue, + // but we're going to be vigilent 
against that possibility just in case... + if (ALL_BUCKETS_UNINITIALIZED == allBucketsSlot + || allBucketsSlot == slotNum) { + assert fcontext.processor instanceof FacetFieldProcessor + : "code changed, non FacetFieldProcessor sweeping w/allBuckets?!?"; + allBucketsSlot = ((FacetFieldProcessor)fcontext.processor).allBucketsAcc.collectAccSlot; + } + } + if (slotNum == allBucketsSlot) { + slotVal = new BucketData(null); + } else { + slotVal = new BucketData(fgCount.getCount(slotNum), fgSize, bgCount.getCount(slotNum), bgSize, getRelatedness(slotNum)); + } + return slotVal.externalize(fcontext.isShard()); + } + + @Override + public void reset() throws IOException { + Arrays.fill(relatedness, Double.NaN); + if (allBucketsSlot != NO_ALL_BUCKETS) { + allBucketsSlot = ALL_BUCKETS_UNINITIALIZED; + } + } + + @Override + public void resize(Resizer resizer) { + relatedness = resizer.resize(relatedness, Double.NaN); + } + + @Override + public void close() throws IOException { + relatedness = null; + } + } + private static final String IMPLIED_KEY = "implied"; - private static final class SKGSlotAcc extends SlotAcc { + private static final class SKGSlotAcc extends SlotAcc implements SweepableSlotAcc { private final RelatednessAgg agg; private BucketData[] slotvalues; private final DocSet fgSet; @@ -181,9 +307,30 @@ public class RelatednessAgg extends AggValueSource { // cache the set sizes for frequent re-use on every slot this.fgSize = fgSet.size(); this.bgSize = bgSet.size(); - this.slotvalues = new BucketData[numSlots]; + this.slotvalues = new BucketData[numSlots]; //TODO: avoid initializing array until we know we're not doing sweep collection? 
reset(); } + + /** + * If called, may register SweepingAccs for fg and bg set based on whether + * user indicated sweeping should be used (default) + * + * @returns null if any SweepingAccs were registered since no other collection is needed for relatedness + */ + @Override + public SKGSlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) { + if (!this.agg.useSweep) { + return this; + } else { + final ReadOnlyCountSlotAcc fgCount = baseSweepingAcc.add(key + "!fg", fgSet, slotvalues.length); + final ReadOnlyCountSlotAcc bgCount = baseSweepingAcc.add(key + "!bg", bgSet, slotvalues.length); + SweepSKGSlotAcc readOnlyReplacement = new SweepSKGSlotAcc(agg.min_pop, fcontext, slotvalues.length, fgSize, bgSize, fgCount, bgCount); + readOnlyReplacement.key = key; + baseSweepingAcc.registerMapping(this, readOnlyReplacement); + return null; + } + } + private void processSlot(int slot, IntFunction slotContext) throws IOException { assert null != slotContext; @@ -213,13 +360,18 @@ public class RelatednessAgg extends AggValueSource { assert null == fcontext.filter; } // ...and in which case we should just use the current base - final DocSet slotSet = null == slotQ ? 
fcontext.base : fcontext.searcher.getDocSet(slotQ); + final DocSet slotSet; + if (null == slotQ) { + slotSet = fcontext.base; + } else { + slotSet = fcontext.searcher.getDocSet(slotQ); + } slotVal.incSizes(fgSize, bgSize); slotVal.incCounts(fgSet.intersectionSize(slotSet), bgSet.intersectionSize(slotSet)); } - + @Override public void collect(int perSegDocId, int slot, IntFunction slotContext) throws IOException { // NOTE: we don't actaully care about the individual docs being collected @@ -334,6 +486,16 @@ public class RelatednessAgg extends AggValueSource { this.implied = true; } + public BucketData(long fg_count, long fg_size, long bg_count, long bg_size, double relatedness) { + this.fg_count = fg_count; + this.fg_size = fg_size; + this.fg_pop = roundTo5Digits((double) fg_count / bg_size); // yes, BACKGROUND size is intentional + this.bg_count = bg_count; + this.bg_size = bg_size; + this.bg_pop = roundTo5Digits((double) bg_count / bg_size); + this.relatedness = relatedness; + } + /** * Increment both the foreground & background counts for the current bucket, reseting any * derived values that may be cached diff --git a/solr/core/src/java/org/apache/solr/search/facet/SingletonDISI.java b/solr/core/src/java/org/apache/solr/search/facet/SingletonDISI.java new file mode 100644 index 00000000000..08be64b1783 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/SingletonDISI.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.search.facet; + +import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.solr.search.facet.SlotAcc.CountSlotAcc; + +final class SingletonDISI extends SweepDISI { + + private final DocIdSetIterator backing; + private final boolean isBase; + + SingletonDISI(DocIdSetIterator backing, CountSlotAcc[] countAccs, boolean isBase) { + super(1, countAccs); + this.backing = backing; + this.isBase = isBase; + } + + @Override + public int nextDoc() throws IOException { + return backing.nextDoc(); + } + + @Override + public boolean collectBase() { + return isBase; + } + + @Override + public int registerCounts(SegCounter segCounter) { + return 0; + } +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/SingletonDocIterator.java b/solr/core/src/java/org/apache/solr/search/facet/SingletonDocIterator.java new file mode 100644 index 00000000000..17311a6262e --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/SingletonDocIterator.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.search.facet; + +import org.apache.solr.search.DocIterator; + +final class SingletonDocIterator extends SweepDocIterator { + + private final DocIterator backing; + private final boolean isBase; + + SingletonDocIterator(DocIterator backing, boolean isBase) { + super(1); + this.backing = backing; + this.isBase = isBase; + } + + @Override + public boolean hasNext() { + return backing.hasNext(); + } + + @Override + public int nextDoc() { + return backing.nextDoc(); + } + + @Override + public boolean collectBase() { + return isBase; + } + + @Override + public int registerCounts(SegCounter segCounts) { + return 0; + } + +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java index e8555523ab8..d7d6e35c02a 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java +++ b/solr/core/src/java/org/apache/solr/search/facet/SlotAcc.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.function.IntFunction; @@ -52,6 +53,8 @@ public abstract class SlotAcc implements Closeable { this.fcontext = fcontext; } + @Override public String toString() { return key; } + /** * NOTE: this currently detects when it is being reused and calls resetIterators by comparing reader ords * with previous calls to setNextReader. 
For this reason, current users must call setNextReader @@ -597,9 +600,222 @@ public abstract class SlotAcc implements Closeable { } } - abstract static class CountSlotAcc extends SlotAcc { + /** + * Implemented by some SlotAccs if they are capable of being used for + * sweep collecting in compatible facet processors + * @see FacetFieldProcessor#registerSweepingAccIfSupportedByCollectAcc() + */ + static interface SweepableSlotAcc { + /** + * Called by processors if they support sweeping. Implementations will often + * return self or null (the latter indicating that all necessary collection will + * be covered by the "sweeping" data structures registered with the specified + * baseSweepingAcc as a result of the call to this method). + * + * If an implementing instance chooses to replace itself with another {@link SlotAcc}, it must + * call {@link SweepingCountSlotAcc#registerMapping(SlotAcc, SlotAcc)} on the specified + * baseSweepingAcc to notify it of the mapping from original SlotAcc to the SlotAcc that should + * be used for purposes of read access. It is the responsibility of the specified {@link SweepingCountSlotAcc} + * to ensure proper placement/accessibility of the SlotAcc to be used for read access. + * + * The replacement SlotAcc registered via {@link SweepingCountSlotAcc#registerMapping(SlotAcc, SlotAcc)} + * will be responsible for output via its {@link SlotAcc#setValues(SimpleOrderedMap, int)} method. + * An implementer of this method may register such a replacement, and also return a non-null + * SlotAcc to be used for normal collection (via {@link FacetFieldProcessor#collectAcc}). In this case, + * the implementer should take care that the returned {@link SlotAcc} is different from the {@link SlotAcc} + * registered for the purpose of output -- with the former overriding {@link SlotAcc#setValues(SimpleOrderedMap, int)} + * as a no-op, to prevent setting duplicate values. 
+ * + * @param baseSweepingAcc - never null, where the SlotAcc may register domains for sweep collection, + * and must register mappings of new read-access SlotAccs that result from this call. + * @return SlotAcc to be used for purpose of collection. If null then collect methods will + * never be called on this SlotAcc. + */ + public T registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc); + } + + /** + * A simple data structure to {@link DocSet} domains with an associated {@link CountSlotAcc}. This may be used + * to support sweep count accumulation over different {@link DocSet} domains, but the concept is perfectly applicable + * to encapsulating the relevant state for simple "non-sweep" collection as well (in which case {@link SweepCountAccStruct#docSet} + * would be {@link FacetContext#base}, {@link SweepCountAccStruct#countAcc} would be {@link FacetProcessor#countAcc}, and + * {@link SweepCountAccStruct#isBase} would trivially be "true"). + */ + static final class SweepCountAccStruct { + final DocSet docSet; + final boolean isBase; + final CountSlotAcc countAcc; + public SweepCountAccStruct(DocSet docSet, boolean isBase, CountSlotAcc countAcc) { + this.docSet = docSet; + this.isBase = isBase; + this.countAcc = countAcc; + } + public SweepCountAccStruct(SweepCountAccStruct t, DocSet replaceDocSet) { + this.docSet = replaceDocSet; + this.isBase = t.isBase; + this.countAcc = t.countAcc; + } + /** + * Because sweep collection offloads "collect" methods to count accumulation code, + * it is helpful to provide a read-only view over the backing {@link CountSlotAcc} + * + * @return - a read-only view over {@link #countAcc} + */ + public ReadOnlyCountSlotAcc roCountAcc() { + return countAcc; + } + @Override public String toString() { + return this.countAcc.toString(); + } + } + + /** + * Special CountSlotAcc used by processors that support sweeping to decide what to sweep over and how to "collect" + * when doing the sweep. 
+ * + * This class may be used by instances of {@link SweepableSlotAcc} to register DocSet domains (via {@link SweepingCountSlotAcc#add}) + * over which to sweep-collect facet counts. + * + * @see SweepableSlotAcc#registerSweepingAccs + */ + static class SweepingCountSlotAcc extends CountSlotArrAcc { + + static final String SWEEP_COLLECTION_DEBUG_KEY = "sweep_collection"; + private final SimpleOrderedMap debug; + private final FacetFieldProcessor p; + final SweepCountAccStruct base; + final List others = new ArrayList<>(); + private final List output = new ArrayList<>(); + + SweepingCountSlotAcc(int numSlots, FacetFieldProcessor p) { + super(p.fcontext, numSlots); + this.p = p; + this.base = new SweepCountAccStruct(fcontext.base, true, this); + final FacetDebugInfo fdebug = fcontext.getDebugInfo(); + this.debug = null != fdebug ? new SimpleOrderedMap<>() : null; + if (null != this.debug) { + fdebug.putInfoItem(SWEEP_COLLECTION_DEBUG_KEY, debug); + debug.add("base", key); + debug.add("accs", new ArrayList()); + debug.add("mapped", new ArrayList()); + } + } + + /** + * Called by SweepableSlotAccs to register new DocSet domains for sweep collection + * + * @param key + * assigned to the returned SlotAcc, and used for debugging + * @param docs + * the domain over which to sweep + * @param numSlots + * the number of slots + * @return a read-only representation of the count acc which is guaranteed to be populated after sweep count + * collection + */ + public ReadOnlyCountSlotAcc add(String key, DocSet docs, int numSlots) { + final CountSlotAcc count = new CountSlotArrAcc(fcontext, numSlots); + count.key = key; + final SweepCountAccStruct ret = new SweepCountAccStruct(docs, false, count); + if (null != debug) { + @SuppressWarnings("unchecked") + List accsDebug = (List) debug.get("accs"); + accsDebug.add(ret.toString()); + } + others.add(ret); + return ret.roCountAcc(); + } + + /** + * When a {@link SweepableSlotAcc} replaces itself (for the purpose of collection) with a 
different {@link SlotAcc} + * instance, it must register that replacement by calling this method with itself as the fromAcc param, and with the + * new replacement {@link SlotAcc} as the toAcc param. The two SlotAccs must have the same {@link SlotAcc#key}. + * + * It is the responsibility of this method to insure that {@link FacetFieldProcessor} references to fromAcc (other than + * those within {@link FacetFieldProcessor#collectAcc}, which are set directly by the return value of + * {@link SweepableSlotAcc#registerSweepingAccs(SweepingCountSlotAcc)}) are replaced + * by references to toAcc. Such references would include, e.g., {@link FacetFieldProcessor#sortAcc}. + * + * It is also this method's responsibility to insure that read access to toAcc (via toAcc's {@link SlotAcc#setValues(SimpleOrderedMap, int)} + * method) is provided via this instance's {@link #setValues(SimpleOrderedMap, int)} method. + * + * @param fromAcc - the {@link SlotAcc} to be replaced (this will normally be the caller of this method). + * @param toAcc - the replacement {@link SlotAcc} + * + * @see SweepableSlotAcc#registerSweepingAccs(SweepingCountSlotAcc) + */ + public void registerMapping(SlotAcc fromAcc, SlotAcc toAcc) { + assert fromAcc.key.equals(toAcc.key); + output.add(toAcc); + if (p.sortAcc == fromAcc) { + p.sortAcc = toAcc; + } + if (null != debug) { + @SuppressWarnings("unchecked") + List mappedDebug = (List) debug.get("mapped"); + mappedDebug.add(fromAcc.toString()); + } + } + + /** + * Always populates the bucket with the current count for that slot. If the count is positive, or if + * processEmpty==true, then this method also populates the values from mapped "output" accumulators. 
+ * + * @see #setSweepValues + */ + @Override + public void setValues(SimpleOrderedMap bucket, int slotNum) throws IOException { + super.setValues(bucket, slotNum); + if (0 < getCount(slotNum) || fcontext.processor.freq.processEmpty) { + setSweepValues(bucket, slotNum); + } + } + + /** + * Populates the bucket with the values from all mapped "output" accumulators for the specified slot. + * + * This method exists because there are some contexts (namely SpecialSlotAcc, for allBuckets, etc.) in which "base" + * count is tracked differently, via getSpecialCount(). For such cases, we need a method that allows the caller to + * directly coordinate calling {@link SlotAcc#setValues} on the sweeping output accs, while avoiding the inclusion + * of {@link CountSlotAcc#setValues CountSlotAcc.setValues} + */ + public void setSweepValues(SimpleOrderedMap bucket, int slotNum) throws IOException { + for (SlotAcc acc : output) { + acc.setValues(bucket, slotNum); + } + } + + /** + * Helper method for code that wants to operating in a sweeping manner even if the current processor + * is not using sweeping. 
+ * + * @returns struct that wraps the {@link FacetContext#base} unless the {@link FacetProcessor#countAcc} is a {@link SweepingCountSlotAcc} + */ + public static SweepCountAccStruct baseStructOf(FacetProcessor processor) { + if (processor.countAcc instanceof SweepingCountSlotAcc) { + return ((SweepingCountSlotAcc) processor.countAcc).base; + } + return new SweepCountAccStruct(processor.fcontext.base, true, processor.countAcc); + } + /** + * Helper method for code that wants to operating in a sweeping manner even if the current processor + * is not using sweeping + * + * @returns empty list unless the {@link FacetProcessor#countAcc} is a {@link SweepingCountSlotAcc} + */ + public static List otherStructsOf(FacetProcessor processor) { + if (processor.countAcc instanceof SweepingCountSlotAcc) { + return ((SweepingCountSlotAcc) processor.countAcc).others; + } + return Collections.emptyList(); + } + } + + abstract static class CountSlotAcc extends SlotAcc implements ReadOnlyCountSlotAcc { public CountSlotAcc(FacetContext fcontext) { super(fcontext); + // assume we are the 'count' by default unless/untill our creator overrides this + this.key = "count"; } public abstract void incrementCount(int slot, long count); diff --git a/solr/core/src/java/org/apache/solr/search/facet/SweepCountAware.java b/solr/core/src/java/org/apache/solr/search/facet/SweepCountAware.java new file mode 100644 index 00000000000..3dd83767020 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/SweepCountAware.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.search.facet; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.LongValues; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.facet.SlotAcc.CountSlotAcc; + +/** + * Implemented by extensions of doc iterators (i.e., {@link DocIdSetIterator}, {@link DocIterator} over one or + * more domain, to support facet count accumulation corresponding to each domain (and via {@link #collectBase()} + * to inform the necessity of "collection" for a single optional backing "base" set). + */ +interface SweepCountAware { + + /** + * Returns true if one of the domains underlying this iterator is the "base" domain, and if that base domain + * contains the doc on which this iterator is currently positioned. If "true", then "collection" may be necessary + * for the current doc. + * + * For each iterator position (each doc), {@link #registerCounts(SegCounter)} must be called before this method. + */ + boolean collectBase(); + + /** + * Called on a positioned doc iterator to register array index mappings for domains that contain the current + * doc. Implementations register these mappings by calling {@link SegCounter#map(int, int)} on the specified + * segCounts param. 
+ * + * For each iterator position, this method must be called before {@link #collectBase()} + * + * @param segCounts - to register mappings of array indices for domains that contain this doc + * @return - the max index of an array representing the domains that contain the current doc. If "n" domains + * contain the current doc, the return value would be "n - 1" + * @throws IOException - if thrown by advancing an underlying doc iterator + */ + int registerCounts(SegCounter segCounts) throws IOException; + + /** + * Used to coordinate multiple count accumulations over multiple domains. Implementers will have "n" backing term-ord-indexed + * counts -- one for each domain over which count accumulation is to be performed. For each doc, count accumulation + * takes place in two phases, invoked by a "driver" (e.g., {@link FacetFieldProcessor}) that manages iteration over the + * union of doc domains: + * + * First, the driver passes this object as the param to {@link SweepCountAware#registerCounts(SegCounter)}, which + * calls {@link #map(int, int)} on "this" to map the static "allIdx" (allIdx < n) for each active backing domain to + * a transient "activeIdx" for counts corresponding to active domains (activeIdx < count(allIdx) <= n). (The return value + * of {@link SweepCountAware#registerCounts(SegCounter)} indicates to the "driver" the max "active counts" index (for + * domains that contain the current doc). + * + * The driver then calls {@link #incrementCount(int, int, int)}, passing the term ord, increment amount (usually "1"), + * and the max "active counts" index returned from {@link SweepCountAware#registerCounts(SegCounter)} in the first + * phase. The "max active counts index" param is used as the limit (inclusive) to iterate count accumulation over each + * of the "active" domains for the current doc. 
+ * + * @see SweepCountAware#registerCounts(SegCounter) + */ + static interface SegCounter { + /** + * Mark/map a given domain/CountSlotAcc as active (eligible for count accumulation) for the current doc. + * + * @param allIdx - the static index of the domain/CountSlotAcc to be "activated" for the current doc + * @param activeIdx - the transient "active index" (for the purpose of actual count accumulation) to which to map + * the domain/CountSlotAcc indicated by "allIdx". + */ + void map(int allIdx, int activeIdx); + + /** + * Increments counts for active domains/CountSlotAccs. + * + * @param ord - the term ord (either global ord per-seg) for which to increment counts + * @param inc - the amount by which to increment the count for the specified term ord + * @param maxIdx - the max index (inclusive) of active domains/CountSlotAccs to be incremented for the current doc + */ + void incrementCount(int ord, int inc, int maxIdx); + } + + /** + * This class is designed to count over global term ords ({@link SegCountPerSeg} provides equivalent functionality for + * per-segment term ords). + * + * @see SegCountPerSeg + */ + static class SegCountGlobal implements SegCounter { + private final CountSlotAcc[] allCounts; + private final CountSlotAcc[] activeCounts; + + public SegCountGlobal(CountSlotAcc[] allCounts) { + this.allCounts = allCounts; + this.activeCounts = Arrays.copyOf(allCounts, allCounts.length); + } + + @Override + public void map(int allIdx, int activeIdx) { + activeCounts[activeIdx] = allCounts[allIdx]; + } + + @Override + public final void incrementCount(int globalOrd, int inc, int maxIdx) { + int i = maxIdx; + do { + activeCounts[i].incrementCount(globalOrd, inc); + } while (i-- > 0); + } + } + + /** + * This class is designed to count over per-segment term ords ({@link SegCountGlobal} provides equivalent functionality for + * global term ords). 
+ * + * @see SegCountGlobal + */ + static class SegCountPerSeg implements SegCounter { + protected final int[][] allSegCounts; + private final int[][] activeSegCounts; + private final boolean[] seen; + + public SegCountPerSeg(int[][] allSegCounts, boolean[] seen, int segMax, int size) { + this.allSegCounts = allSegCounts; + this.activeSegCounts = Arrays.copyOf(this.allSegCounts, size); + this.seen = seen; + } + + @Override + public final void map(int allIdx, int activeIdx) { + activeSegCounts[activeIdx] = allSegCounts[allIdx]; + } + + @Override + public final void incrementCount(int segOrd, int inc, int maxIdx) { + seen[segOrd] = true; + int i = maxIdx; + do { + activeSegCounts[i][segOrd] += inc; + } while (i-- > 0); + } + + /** + * Maps accumulated per-segment term ords to global term ords and increments global slots on the specified countAccs + * accordingly. The index of each CountSlotAcc in the specified countAccs array must correspond to the + * the static index of its associated count accumulation doc domain and per-seg count array. + * + * @param countAccs - global-scope CountSlotAccs (one for each domain) to be incremented for the most recently accumulated + * segment + * @param toGlobal - mapping of per-segment term ords to global term ords for the most recently accumulated segment + * @param maxSegOrd - the max per-seg term ord for the most recently accumulated segment + */ + public void register(CountSlotAcc[] countAccs, LongValues toGlobal, int maxSegOrd) { + int segOrd = maxSegOrd; + final int maxIdx = countAccs.length - 1; + for (;;) { + if (seen[segOrd]) { + int i = maxIdx; + int slot = toGlobal == null ? 
segOrd : (int)toGlobal.get(segOrd); + do { + final int inc = allSegCounts[i][segOrd]; + if (inc > 0) { + countAccs[i].incrementCount(slot, inc); + } + } while (i-- > 0); + } + if (--segOrd < 0) { + break; + } + } + } + } + +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/SweepDISI.java b/solr/core/src/java/org/apache/solr/search/facet/SweepDISI.java new file mode 100644 index 00000000000..94c4261e197 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/SweepDISI.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.search.facet; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.solr.search.facet.SlotAcc.CountSlotAcc; +import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct; + +public abstract class SweepDISI extends DocIdSetIterator implements SweepCountAware { + + public final int size; + final CountSlotAcc[] countAccs; + + public SweepDISI(int size, CountSlotAcc[] countAccs) { + this.size = size; + this.countAccs = countAccs; + } + + private static boolean addAcc(SweepCountAccStruct entry, DocIdSetIterator[] subIterators, CountSlotAcc[] activeCountAccs, LeafReaderContext subCtx, int idx) throws IOException { + final DocIdSet docIdSet = entry.docSet.getTopFilter().getDocIdSet(subCtx, null); + if (docIdSet == null || (subIterators[idx] = docIdSet.iterator()) == null) { + return false; + } + activeCountAccs[idx] = entry.countAcc; + return true; + } + + static SweepDISI newInstance(SweepCountAccStruct base, List others, DocIdSetIterator[] subIterators, CountSlotAcc[] activeCountAccs, LeafReaderContext subCtx) throws IOException { + int activeCt = 0; + final int baseIdx; + if (base == null || !addAcc(base, subIterators, activeCountAccs, subCtx, activeCt)) { + baseIdx = -1; + } else { + baseIdx = activeCt++; + } + for (SweepCountAccStruct entry : others) { + if (addAcc(entry, subIterators, activeCountAccs, subCtx, activeCt)) { + activeCt++; + } + } + switch (activeCt) { + case 0: + return null; + case 1: + return new SingletonDISI(subIterators[0], activeCountAccs, baseIdx >= 0); // solr docsets already exclude any deleted docs + default: + return new UnionDISI(subIterators, activeCountAccs, activeCt, baseIdx); + } + } + + @Override + public int docID() { + throw new UnsupportedOperationException("Not supported."); + } + + @Override + public int advance(int target) throws 
IOException { + throw new UnsupportedOperationException("Not supported."); + } + + @Override + public long cost() { + throw new UnsupportedOperationException("Not supported."); + } + +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/SweepDocIterator.java b/solr/core/src/java/org/apache/solr/search/facet/SweepDocIterator.java new file mode 100644 index 00000000000..7478d10d24c --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/SweepDocIterator.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.search.facet; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.facet.SlotAcc.CountSlotAcc; +import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct; + +abstract class SweepDocIterator implements DocIterator, SweepCountAware { + + public final int size; + + public SweepDocIterator(int size) { + this.size = size; + } + + static class SweepIteratorAndCounts { + final SweepDocIterator iter; + final CountSlotAcc[] countAccs; + public SweepIteratorAndCounts(SweepDocIterator iter, CountSlotAcc[] countAccs) { + this.iter = iter; + this.countAccs = countAccs; + } + } + + static SweepIteratorAndCounts newInstance(SweepCountAccStruct base, List others) throws IOException { + final int activeCt; + SweepCountAccStruct entry; + if (base == null) { + activeCt = others.size(); + entry = others.get(0); + } else { + activeCt = others.size() + 1; + entry = base; + } + if (activeCt == 1) { + final CountSlotAcc[] countAccs = new CountSlotAcc[] {entry.countAcc}; + return new SweepIteratorAndCounts(new SingletonDocIterator(entry.docSet.iterator(), base != null), countAccs); + } else { + final DocIterator[] subIterators = new DocIterator[activeCt]; + final CountSlotAcc[] countAccs = new CountSlotAcc[activeCt]; + Iterator othersIter = others.iterator(); + int i = 0; + for (;;) { + subIterators[i] = entry.docSet.iterator(); + countAccs[i] = entry.countAcc; + if (++i == activeCt) { + break; + } + entry = othersIter.next(); + } + return new SweepIteratorAndCounts(new UnionDocIterator(subIterators, base == null ? 
-1 : 0), countAccs); + } + } + + @Override + public float score() { + throw new UnsupportedOperationException("Not supported."); + } + + @Override + public Integer next() { + throw new UnsupportedOperationException("Not supported."); + } + + @Override + public abstract int registerCounts(SegCounter segCounts); // override to not throw IOException + +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java index 04f88f967d1..ee86fe04745 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java +++ b/solr/core/src/java/org/apache/solr/search/facet/UnInvertedField.java @@ -41,11 +41,15 @@ import org.apache.solr.index.SlowCompositeReaderWrapper; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.TrieField; import org.apache.solr.search.BitDocSet; -import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocSet; import org.apache.solr.search.SolrCache; import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal; +import org.apache.solr.search.facet.SlotAcc.CountSlotAcc; import org.apache.solr.search.facet.SlotAcc.SlotContext; +import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct; +import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc; +import org.apache.solr.search.facet.SweepDocIterator.SweepIteratorAndCounts; import org.apache.solr.uninverting.DocTermOrds; import org.apache.solr.util.TestInjection; import org.slf4j.Logger; @@ -315,7 +319,7 @@ public class UnInvertedField extends DocTermOrds { - private void getCounts(FacetFieldProcessorByArrayUIF processor, SlotAcc.CountSlotAcc counts) throws IOException { + private void getCounts(FacetFieldProcessorByArrayUIF processor) throws IOException { DocSet docs = processor.fcontext.base; int baseSize = docs.size(); int maxDoc = searcher.maxDoc(); @@ -325,9 +329,12 @@ public class 
UnInvertedField extends DocTermOrds { return; } + SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor); + final List others = SweepingCountSlotAcc.otherStructsOf(processor); + final int[] index = this.index; - boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet; + boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet && baseCountAccStruct != null; if (doNegative) { FixedBitSet bs = ((BitDocSet) docs).getBits().clone(); @@ -337,21 +344,34 @@ public class UnInvertedField extends DocTermOrds { docs = new BitDocSet(bs, maxDoc - baseSize); // simply negating will mean that we have deleted docs in the set. // that should be OK, as their entries in our table should be empty. + baseCountAccStruct = new SweepCountAccStruct(baseCountAccStruct, docs); } // For the biggest terms, do straight set intersections for (TopTerm tt : bigTerms.values()) { // TODO: counts could be deferred if sorting by index order - counts.incrementCount(tt.termNum, searcher.numDocs(tt.termQuery, docs)); + final int termOrd = tt.termNum; + Iterator othersIter = others.iterator(); + SweepCountAccStruct entry = baseCountAccStruct != null ? 
baseCountAccStruct : othersIter.next(); + for (;;) { + entry.countAcc.incrementCount(termOrd, searcher.numDocs(tt.termQuery, entry.docSet)); + if (!othersIter.hasNext()) { + break; + } + entry = othersIter.next(); + } } // TODO: we could short-circuit counting altogether for sorted faceting // where we already have enough terms from the bigTerms if (termInstances > 0) { - DocIterator iter = docs.iterator(); + final SweepIteratorAndCounts iterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others); + final SweepDocIterator iter = iterAndCounts.iter; + final SegCountGlobal counts = new SegCountGlobal(iterAndCounts.countAccs); while (iter.hasNext()) { int doc = iter.nextDoc(); + int maxIdx = iter.registerCounts(counts); int code = index[doc]; if ((code & 0x80000000)!=0) { @@ -368,7 +388,7 @@ public class UnInvertedField extends DocTermOrds { } if (delta == 0) break; tnum += delta - TNUM_OFFSET; - counts.incrementCount(tnum,1); + counts.incrementCount(tnum, 1, maxIdx); } } else { int tnum = 0; @@ -378,7 +398,7 @@ public class UnInvertedField extends DocTermOrds { if ((code & 0x80) == 0) { if (delta == 0) break; tnum += delta - TNUM_OFFSET; - counts.incrementCount(tnum,1); + counts.incrementCount(tnum, 1, maxIdx); delta = 0; } code >>>= 8; @@ -388,9 +408,10 @@ public class UnInvertedField extends DocTermOrds { } if (doNegative) { + final CountSlotAcc baseCounts = processor.countAcc; for (int i=0; i= numTermsInField) { - getCounts(processor, processor.countAcc); + getCounts(processor); return; } @@ -427,15 +448,22 @@ public class UnInvertedField extends DocTermOrds { DocSet docs = processor.fcontext.base; int uniqueTerms = 0; - final SlotAcc.CountSlotAcc countAcc = processor.countAcc; + final CountSlotAcc countAcc = processor.countAcc; + final SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor); + final List others = SweepingCountSlotAcc.otherStructsOf(processor); for (TopTerm tt : bigTerms.values()) { if (tt.termNum >= 
startTermIndex && tt.termNum < endTermIndex) { // handle the biggest terms - DocSet intersection = searcher.getDocSet(tt.termQuery, docs); + DocSet termSet = searcher.getDocSet(tt.termQuery); + DocSet intersection = termSet.intersection(docs); int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex, slotNum -> { return new SlotContext(tt.termQuery); }); - countAcc.incrementCount(tt.termNum - startTermIndex, collected); + final int termOrd = tt.termNum - startTermIndex; + countAcc.incrementCount(termOrd, collected); + for (SweepCountAccStruct entry : others) { + entry.countAcc.incrementCount(termOrd, termSet.intersectionSize(entry.docSet)); + } if (collected > 0) { uniqueTerms++; } @@ -455,9 +483,14 @@ public class UnInvertedField extends DocTermOrds { // TODO: handle facet.prefix here!!! - DocIterator iter = docs.iterator(); + SweepIteratorAndCounts sweepIterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others); + final SweepDocIterator iter = sweepIterAndCounts.iter; + final CountSlotAcc[] countAccs = sweepIterAndCounts.countAccs; + final SegCountGlobal counts = new SegCountGlobal(countAccs); while (iter.hasNext()) { int doc = iter.nextDoc(); + int maxIdx = iter.registerCounts(counts); + boolean collectBase = iter.collectBase(); if (doc >= adjustedMax) { do { @@ -495,8 +528,10 @@ public class UnInvertedField extends DocTermOrds { int arrIdx = tnum - startTermIndex; if (arrIdx < 0) continue; if (arrIdx >= nTerms) break; - countAcc.incrementCount(arrIdx, 1); - processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext); + counts.incrementCount(arrIdx, 1, maxIdx); + if (collectBase) { + processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext); + } } } else { int tnum = 0; @@ -509,8 +544,10 @@ public class UnInvertedField extends DocTermOrds { int arrIdx = tnum - startTermIndex; if (arrIdx >= 0) { if (arrIdx >= nTerms) break; - countAcc.incrementCount(arrIdx, 1); - processor.collectFirstPhase(segDoc, 
arrIdx, processor.slotContext); + counts.incrementCount(arrIdx, 1, maxIdx); + if (collectBase) { + processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext); + } } delta = 0; } diff --git a/solr/core/src/java/org/apache/solr/search/facet/UnionDISI.java b/solr/core/src/java/org/apache/solr/search/facet/UnionDISI.java new file mode 100644 index 00000000000..8bd196880d4 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/UnionDISI.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.search.facet; + +import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.PriorityQueue; +import org.apache.solr.search.facet.SlotAcc.CountSlotAcc; + +final class UnionDISI extends SweepDISI { + + final int maxIdx; + private final SubIterStruct baseSub; + private boolean collectBase; + private final PriorityQueue queue; + private SubIterStruct top; + private int docId = -1; + + private static final class SubIterStruct { + private final DocIdSetIterator sub; + private final int index; + private int docId; + public SubIterStruct(DocIdSetIterator sub, int index) throws IOException { + this.sub = sub; + this.index = index; + nextDoc(); + } + public void nextDoc() throws IOException { + docId = sub.nextDoc(); + } + } + UnionDISI(DocIdSetIterator[] subIterators, CountSlotAcc[] countAccs, int size, int baseIdx) throws IOException { + super(size, countAccs); + this.maxIdx = size - 1; + queue = new PriorityQueue(size) { + @Override + protected boolean lessThan(SubIterStruct a, SubIterStruct b) { + return a.docId < b.docId; + } + }; + int i = maxIdx; + SubIterStruct tmpBaseSub = null; + do { + final SubIterStruct subIterStruct = new SubIterStruct(subIterators[i], i); + queue.add(subIterStruct); + if (i == baseIdx) { + tmpBaseSub = subIterStruct; + } + } while (i-- > 0); + baseSub = tmpBaseSub; + top = queue.top(); + } + + @Override + public int nextDoc() throws IOException { + if (top.docId == docId) { + do { + top.nextDoc(); + } while ((top = queue.updateTop()).docId == docId); + } + if (baseSub != null) { + collectBase = false; + } + return docId = top.docId; + } + + @Override + public boolean collectBase() { + assert top.docId != docId : "must call registerCounts() before collectBase()"; + return collectBase; + } + + @Override + public int registerCounts(SegCounter segCounter) throws IOException { + int i = -1; + do { + if (!collectBase && top == baseSub) { + collectBase = true; + } + 
segCounter.map(top.index, ++i); + top.nextDoc(); + } while ((top = queue.updateTop()).docId == docId); + return i; + } + +} diff --git a/solr/core/src/java/org/apache/solr/search/facet/UnionDocIterator.java b/solr/core/src/java/org/apache/solr/search/facet/UnionDocIterator.java new file mode 100644 index 00000000000..448a49ce9d2 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/search/facet/UnionDocIterator.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.search.facet; + +import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.solr.search.DocIterator; +import org.apache.lucene.util.PriorityQueue; + +final class UnionDocIterator extends SweepDocIterator { + + private final int maxIdx; + private final SubIterStruct baseSub; + private boolean collectBase; + private final PriorityQueue queue; + private SubIterStruct top; + private int docId = -1; + + private static final class SubIterStruct { + private final DocIterator sub; + private final int index; + private int docId; + public SubIterStruct(DocIterator sub, int index) throws IOException { + this.sub = sub; + this.index = index; + nextDoc(); + } + public void nextDoc() { + docId = sub.hasNext() ? sub.nextDoc() : DocIdSetIterator.NO_MORE_DOCS; + } + } + UnionDocIterator(DocIterator[] subIterators, int baseIdx) throws IOException { + super(subIterators.length); + this.maxIdx = size - 1; + queue = new PriorityQueue(size) { + @Override + protected boolean lessThan(SubIterStruct a, SubIterStruct b) { + return a.docId < b.docId; + } + }; + SubIterStruct tmpBase = null; + int i = maxIdx; + do { + SubIterStruct subIterStruct = new SubIterStruct(subIterators[i], i); + queue.add(subIterStruct); + if (i == baseIdx) { + tmpBase = subIterStruct; + } + } while (i-- > 0); + this.baseSub = tmpBase; + top = queue.top(); + } + + @Override + public int nextDoc() { + if (top.docId == docId) { + do { + top.nextDoc(); + } while ((top = queue.updateTop()).docId == docId); + } + collectBase = false; + return docId = top.docId; + } + + @Override + public boolean hasNext() { + if (top.docId == docId) { + do { + top.nextDoc(); + } while ((top = queue.updateTop()).docId == docId); + } + return top.docId != DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public boolean collectBase() { + assert top.docId != docId : "must call registerCounts() before collectBase()"; + return collectBase; + } + + @Override + public int 
registerCounts(SegCounter segCounts) { + int i = -1; + do { + if (!collectBase && top == baseSub) { + collectBase = true; + } + segCounts.map(top.index, ++i); + top.nextDoc(); + } while ((top = queue.updateTop()).docId == docId); + return i; + } +} diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestCloudJSONFacetSKGEquiv.java b/solr/core/src/test/org/apache/solr/search/facet/TestCloudJSONFacetSKGEquiv.java index 276dcb9f7b0..eb6866286c3 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestCloudJSONFacetSKGEquiv.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestCloudJSONFacetSKGEquiv.java @@ -48,6 +48,7 @@ import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import static org.apache.solr.search.facet.FacetField.FacetMethod; +import static org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc.SWEEP_COLLECTION_DEBUG_KEY; import org.noggit.JSONUtil; import org.noggit.JSONWriter; @@ -58,12 +59,11 @@ import org.junit.BeforeClass; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - /** *

* A randomized test of nested facets using the relatedness() function, that asserts the * results are consistent and equivilent regardless of what method (ie: FacetFieldProcessor) - * is requested. + * and/or {@value RelatednessAgg#SWEEP_COLLECTION} option is requested. *

*

* This test is based on {@link TestCloudJSONFacetSKG} but does not @@ -280,6 +280,212 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase { assertEquals(FacetFieldProcessorByHashDV.class.getSimpleName(), debug.get("processor")); } } + + /** + * Sanity check that our method of varying the {@value RelatednessAgg#SWEEP_COLLECTION} in conjunction with the + * method params works and can be verified by inspecting the debug output of basic requests. + */ + public void testWhiteboxSanitySweepDebug() throws Exception { + // NOTE: json.facet debugging output can be wonky, particularly when dealing with cloud + // so for these queries we keep it simple: + // - only one "top" facet per request + // - no refinement + // even with those constraints in place, a single facet can (may/sometimes?) produce multiple debug + // blocks - aparently due to shard merging? So... + // - only inspect the "first" debug NamedList in the results + // + + final SolrParams baseParams = params("rows","0", + "debug","true", // SOLR-14451 + // *:* is the only "safe" query for this test, + // to ensure we always have at least one bucket for every facet + // so we can be confident in getting the debug we expect... + "q", "*:*", + "fore", multiStrField(7)+":11", + "back", "*:*"); + + // simple individual facet that sorts on an skg stat... + // + // all results we test should be the same even if there is another 'skg_extra' stat, + // it shouldn't be involved in the sweeping at all. 
+ for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) { + // choose a single value string so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified + final TermFacet f = new TermFacet(soloStrField(9), 10, 0, "skg desc", null); + if (null != extra) { + f.subFacets.put("skg_extra", extra); + } + final Map facets = new LinkedHashMap<>(); + facets.put("str", f); + + final SolrParams facetParams + = SolrParams.wrapDefaults(params("method_val", "dv", + "json.facet", Facet.toJSONFacetParamValue(facets)), + baseParams); + + // both default sweep option and explicit sweep should give same results... + for (SolrParams sweepParams : Arrays.asList(params(), + params("sweep_key", RelatednessAgg.SWEEP_COLLECTION, + "sweep_val", "true"))) { + final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams); + + final NamedList debug = getFacetDebug(params); + assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor")); + @SuppressWarnings("unchecked") + final NamedList sweep_debug = (NamedList) debug.get(SWEEP_COLLECTION_DEBUG_KEY); + assertNotNull(sweep_debug); + assertEquals("count", sweep_debug.get("base")); + assertEquals(Arrays.asList("skg!fg","skg!bg"), sweep_debug.get("accs")); + assertEquals(Arrays.asList("skg"), sweep_debug.get("mapped")); + } + { // 'dv' will always *try* to sweep, but disabling on stat should mean debug is mostly empty... 
+ final SolrParams params = SolrParams.wrapDefaults(params("sweep_key", RelatednessAgg.SWEEP_COLLECTION, + "sweep_val", "false"), + facetParams); + final NamedList debug = getFacetDebug(params); + assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor")); + @SuppressWarnings("unchecked") + final NamedList sweep_debug = (NamedList) debug.get(SWEEP_COLLECTION_DEBUG_KEY); + assertNotNull(sweep_debug); + assertEquals("count", sweep_debug.get("base")); + assertEquals(Collections.emptyList(), sweep_debug.get("accs")); + assertEquals(Collections.emptyList(), sweep_debug.get("mapped")); + } + { // if we override 'dv' with 'hashdv' which doesn't sweep, our sweep debug should be empty, + // even if the skg stat does ask for sweeping explicitly... + final SolrParams params = SolrParams.wrapDefaults(params("method_val", "dvhash", + "sweep_key", RelatednessAgg.SWEEP_COLLECTION, + "sweep_val", "true"), + facetParams); + final NamedList debug = getFacetDebug(params); + assertEquals(FacetFieldProcessorByHashDV.class.getSimpleName(), debug.get("processor")); + assertNull(debug.get(SWEEP_COLLECTION_DEBUG_KEY)); + } + } + + // simple facet that sorts on an skg stat but uses prelim_sort on count + // + // all results we test should be the same even if there is another 'skg_extra' stat, + // neither skg should be involved in the sweeping at all. 
+ for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) { + // choose a single value string so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified + final TermFacet f = new TermFacet(soloStrField(9), map("limit", 3, "overrequest", 0, + "sort", "skg desc", + "prelim_sort", "count asc")); + if (null != extra) { + f.subFacets.put("skg_extra", extra); + } + final Map facets = new LinkedHashMap<>(); + facets.put("str", f); + + final SolrParams facetParams + = SolrParams.wrapDefaults(params("method_val", "dv", + "json.facet", Facet.toJSONFacetParamValue(facets)), + baseParams); + + // default sweep as well as any explicit sweep=true/false values should give same results: no sweeping + for (SolrParams sweepParams : Arrays.asList(params(), + params("sweep_key", RelatednessAgg.SWEEP_COLLECTION, + "sweep_val", "false"), + params("sweep_key", RelatednessAgg.SWEEP_COLLECTION, + "sweep_val", "true"))) { + final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams); + + final NamedList debug = getFacetDebug(params); + assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor")); + @SuppressWarnings("unchecked") + final NamedList sweep_debug = (NamedList) debug.get(SWEEP_COLLECTION_DEBUG_KEY); + assertNotNull(sweep_debug); + assertEquals("count", sweep_debug.get("base")); + assertEquals(Collections.emptyList(), sweep_debug.get("accs")); + assertEquals(Collections.emptyList(), sweep_debug.get("mapped")); + } + } + + { // single facet with infinite limit + multiple skgs... + // this should trigger MultiAcc collection, causing sweeping on both skg functions + // + // all results we test should be the same even if there is another 'min' stat, + // in each term facet. it shouldn't affect the sweeping/MultiAcc at all. 
+ for (Facet extra : Arrays.asList(null, new SumFacet(multiIntField(2)))) { + final Map facets = new LinkedHashMap<>(); + final TermFacet facet = new TermFacet(soloStrField(9), -1, 0, "skg2 desc", null); + facet.subFacets.put("skg2", new RelatednessFacet(multiStrField(2)+":9", null)); + if (null != extra) { + facet.subFacets.put("sum", extra); + } + facets.put("str", facet); + final SolrParams facetParams + = SolrParams.wrapDefaults(params("method_val", "dv", + "json.facet", Facet.toJSONFacetParamValue(facets)), + baseParams); + + // both default sweep option and explicit sweep should give same results... + for (SolrParams sweepParams : Arrays.asList(params(), + params("sweep_key", RelatednessAgg.SWEEP_COLLECTION, + "sweep_val", "true"))) { + final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams); + + final NamedList debug = getFacetDebug(params); + assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor")); + @SuppressWarnings("unchecked") + final NamedList sweep_debug = (NamedList) debug.get(SWEEP_COLLECTION_DEBUG_KEY); + assertNotNull(sweep_debug); + assertEquals("count", sweep_debug.get("base")); + assertEquals(Arrays.asList("skg!fg","skg!bg","skg2!fg","skg2!bg"), sweep_debug.get("accs")); + assertEquals(Arrays.asList("skg","skg2"), sweep_debug.get("mapped")); + } + } + } + + // nested facets that both sort on an skg stat + // (set limit + overrequest tiny to keep multishard response managable) + // + // all results we test should be the same even if there is another 'skg_extra' stat, + // in each term facet. they shouldn't be involved in the sweeping at all. 
+ for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) { + // choose single value strings so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified + // choose 'id' for the parent facet so we are garunteed some child facets + final TermFacet parent = new TermFacet("id", 1, 0, "skg desc", false); + final TermFacet child = new TermFacet(soloStrField(7), 1, 0, "skg desc", false); + parent.subFacets.put("child", child); + if (null != extra) { + parent.subFacets.put("skg_extra", extra); + child.subFacets.put("skg_extra", extra); + } + final Map facets = new LinkedHashMap<>(); + facets.put("parent", parent); + + final SolrParams facetParams + = SolrParams.wrapDefaults(params("method_val", "dv", + "json.facet", Facet.toJSONFacetParamValue(facets)), + baseParams); + // both default sweep option and explicit sweep should give same results... + for (SolrParams sweepParams : Arrays.asList(params(), + params("sweep_key", RelatednessAgg.SWEEP_COLLECTION, + "sweep_val", "true"))) { + final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams); + + final NamedList parentDebug = getFacetDebug(params); + assertEquals("id", parentDebug.get("field")); + assertNotNull(parentDebug.get("sub-facet")); + // may be multiples from diff shards, just use first one + @SuppressWarnings("unchecked") + final NamedList childDebug = ((List>)parentDebug.get("sub-facet")).get(0); + assertEquals(soloStrField(7), childDebug.get("field")); + + // these should all be true for both the parent and the child debug.. 
+ for (NamedList debug : Arrays.asList(parentDebug, childDebug)) { + assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor")); + @SuppressWarnings("unchecked") + final NamedList sweep_debug = (NamedList) debug.get(SWEEP_COLLECTION_DEBUG_KEY); + assertNotNull(sweep_debug); + assertEquals("count", sweep_debug.get("base")); + assertEquals(Arrays.asList("skg!fg","skg!bg"), sweep_debug.get("accs")); + assertEquals(Arrays.asList("skg"), sweep_debug.get("mapped")); + } + } + } + } /** * returns the FIRST NamedList (under the implicit 'null' FacetQuery) in the "facet-trace" output @@ -358,7 +564,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase { } } - { // multi-valued facet field w/infinite limit and an extra (non-SKG) stat + { // multi-valued facet field w/infinite limit and an extra (non-SKG / non-sweeping) stat final TermFacet xxx = new TermFacet(multiStrField(12), -1, 0, "count asc", false); xxx.subFacets.put("sum", new SumFacet(multiIntField(4))); final Map facets = new LinkedHashMap<>(); @@ -414,7 +620,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase { for (int limit : Arrays.asList(10, -1)) { for (String sort : Arrays.asList("count desc", "skg desc", "index asc")) { for (Boolean refine : Arrays.asList(false, true)) { - { // 1 additional (non-SKG) stat + { // 1 additional (non-SKG / non-sweeping) stat final TermFacet xxx = new TermFacet(facetFieldName, map("limit", limit, "overrequest", 0, "sort", sort, @@ -440,7 +646,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase { multiStrField(0) + ":46"), multiStrField(5)+":9", "*:*"); } - { // multiple SKGs and a multiple non-SKG stats + { // multiple SKGs and a multiple non-SKG / non-sweeping stats final TermFacet xxx = new TermFacet(facetFieldName, map("limit", limit, "overrequest", 0, "sort", sort, @@ -504,6 +710,8 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase { /** * Given a set of term facets, and top 
level query strings, asserts that * the results of these queries are identical even when varying the method_val param + * and when varying the {@value RelatednessAgg#SWEEP_COLLECTION} param; either by explicitly setting to + * true or false or by changing the param key to not set it at all. */ private void assertFacetSKGsAreConsistent(final Map facets, final String query, @@ -520,27 +728,33 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase { @SuppressWarnings({"rawtypes"}) final NamedList expected = getFacetResponse(basicParams); - // now loop over all processors and compare them to the "default"... + // now loop over all permutations of processors and sweep values and and compare them to the "default"... for (FacetMethod method : EnumSet.allOf(FacetMethod.class)) { - ModifiableSolrParams options = params("method_val", method.toString().toLowerCase(Locale.ROOT)); + for (Boolean sweep : Arrays.asList(true, false, null)) { + final ModifiableSolrParams options = params("method_val", method.toString().toLowerCase(Locale.ROOT)); + if (null != sweep) { + options.add("sweep_key", RelatednessAgg.SWEEP_COLLECTION); + options.add("sweep_val", sweep.toString()); + } - @SuppressWarnings({"rawtypes"}) - final NamedList actual = getFacetResponse(SolrParams.wrapAppended(options, basicParams)); - - // we can't rely on a trivial assertEquals() comparison... - // - // the order of the sub-facet keys can change between - // processors. (notably: method:enum vs method:smart when sort:"index asc") - // - // NOTE: this doesn't ignore the order of the buckets, - // it ignores the order of the keys in each bucket... 
- final String pathToMismatch = BaseDistributedSearchTestCase.compare - (expected, actual, 0, - Collections.singletonMap("buckets", BaseDistributedSearchTestCase.UNORDERED)); - if (null != pathToMismatch) { - log.error("{}: expected = {}", options, expected); - log.error("{}: actual = {}", options, actual); - fail("Mismatch: " + pathToMismatch + " using " + options); + @SuppressWarnings({"rawtypes"}) + final NamedList actual = getFacetResponse(SolrParams.wrapAppended(options, basicParams)); + + // we can't rely on a trivial assertEquals() comparison... + // + // the order of the sub-facet keys can change between + // processors. (notably: method:enum vs method:smart when sort:"index asc") + // + // NOTE: this doesn't ignore the order of the buckets, + // it ignores the order of the keys in each bucket... + final String pathToMismatch = BaseDistributedSearchTestCase.compare + (expected, actual, 0, + Collections.singletonMap("buckets", BaseDistributedSearchTestCase.UNORDERED)); + if (null != pathToMismatch) { + log.error("{}: expected = {}", options, expected); + log.error("{}: actual = {}", options, actual); + fail("Mismatch: " + pathToMismatch + " using " + options); + } } } } catch (AssertionError e) { @@ -617,6 +831,10 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase { * unless they are 'null' in which case $fore and $back refs will be used * in their place, and must be set as request params (this allows "random" facets to still easily * trigger the "nested facets re-using the same fore/back set for SKG situation) + * + * The JSON for all of these facets includes a ${sweep_key:xxx} (which will be ignored + * by default) and ${sweep_val:yyy} which may be set as params on each request to override the + * implicit default sweeping behavior of the underlying SKGAcc. 
*/ private static final class RelatednessFacet implements Facet, Writable { public final Map jsonData = new LinkedHashMap<>(); @@ -641,7 +859,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase { // we don't allow these to be overridden by options, so set them now... jsonData.put("type", "func"); jsonData.put("func", "relatedness("+f+","+b+")"); - + jsonData.put("${sweep_key:xxx}","${sweep_val:yyy}"); } @Override public void write(JSONWriter writer) { diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java index d12c27fc7c5..8a851d05ccc 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java @@ -763,6 +763,54 @@ public class TestJsonFacets extends SolrTestCaseHS { + " } } ] } }"); } + @Test + public void testSKGSweepMultiAcc() throws Exception { + Client client = Client.localClient(); + indexSimple(client); + + // simple single level facet w/skg & trivial non-sweeping stat using various sorts & (re)sorting + for (String sort : Arrays.asList("sort:'index asc'", + "sort:'y desc'", + "sort:'z desc'", + "sort:'skg desc'", + "prelim_sort:'count desc', sort:'index asc'", + "prelim_sort:'count desc', sort:'y desc'", + "prelim_sort:'count desc', sort:'z desc'", + "prelim_sort:'count desc', sort:'skg desc'")) { + // the relatedness score of each of our cat_s values is (conviniently) also alphabetical order, + // (and the same order as 'sum(num_i) desc' & 'min(num_i) desc') + // + // So all of these re/sort options should produce identical output + // - Testing "index" sort allows the randomized use of "stream" processor as default to be tested. + // - Testing (re)sorts on other stats sanity checks code paths where relatedness() is a "defered" Agg + + for (String sweep : Arrays.asList("true", "false")) { + // results should be the same even if we disable sweeping... 
+ assertJQ(req("q", "cat_s:[* TO *]", "rows", "0", + "fore", "where_s:NY", "back", "*:*", + "json.facet", "" + + "{x: { type: terms, field: 'cat_s', "+sort+", limit:-1, " + + " facet: { skg: { type: 'func', func:'relatedness($fore,$back)', " + +" "+RelatednessAgg.SWEEP_COLLECTION+": "+sweep+" }," + + " y:'sum(num_i)', " + +" z:'min(num_i)' } } }") + , "facets=={count:5, x:{ buckets:[" + + " { val:'A', count:2, y:5.0, z:2, " + + " skg : { relatedness: 0.00554, " + + " foreground_popularity: 0.16667," + + " background_popularity: 0.33333, }," + + " }, " + + " { val:'B', count:3, y:-3.0, z:-5, " + + " skg : { relatedness: 0.0, " // perfectly average and uncorrolated + + " foreground_popularity: 0.16667," + + " background_popularity: 0.5 }," + + " } ] } } " + ); + } + } + } + + @Test public void testRepeatedNumerics() throws Exception { Client client = Client.localClient(); diff --git a/solr/solr-ref-guide/src/json-facet-api.adoc b/solr/solr-ref-guide/src/json-facet-api.adoc index 5f636adc929..bc336c3de21 100644 --- a/solr/solr-ref-guide/src/json-facet-api.adoc +++ b/solr/solr-ref-guide/src/json-facet-api.adoc @@ -919,6 +919,10 @@ NOTE: While it's very common to define the Background Set as `\*:*`, or some oth When using the extended `type:func` syntax for specifying a `relatedness()` aggregation, an optional `min_popularity` (float) option can be used to specify a lower bound on the `foreground_popularity` and `background_popularity` values, that must be met in order for the `relatedness` score to be valid -- If this `min_popularity` is not met, then the `relatedness` score will be `-Infinity`. +The default implementation for calculating `relatedness()` domain correlation depends on the type of facet being calculated. Generic domain correlation is calculated per-term, by selectively retrieving a DocSet for each bucket-associated query (consulting the `filterCache`) and calculating DocSet intersections with "foreground" and "background" sets. 
For term facets (especially over high-cardinality fields) this approach can lead to `filterCache` thrashing; accordingly, `relatedness()` over term facets defaults where possible to an approach that collects facet counts directly over multiple domains in a single sweep (never touching the `filterCache`). It is possible to explicitly control this "single sweep" collection by setting the extended `type:func` syntax `sweep_collection` option to `true` (the default) or `false` (to disable sweep collection). + +NOTE: Disabling sweep collection for `relatedness()` stats over low-cardinality fields may yield a performance benefit, provided the `filterCache` is sufficiently large to accommodate an entry for each value in the associated field without inducing thrashing for anticipated use patterns. A reasonable heuristic is that fields of cardinality less than 1,000 _may_ benefit from disabling sweep. This heuristic is _not_ used to determine default behavior, particularly because non-sweep collection can so easily induce `filterCache` thrashing, with system-wide detrimental effects. + [source,json] ---- { "type": "func",