SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()"

This adds a lot of "under the covers" improvements to how JSON Faceting FacetField processors work, to enable
"sweeping" support when the SlotAcc used for sorting support it (currently just "relatedness()")

This is a squash commit of all changes on https://github.com/magibney/lucene-solr/tree/SOLR-13132
Up to and including ca7a8e0b39840d00af9022c048346a7d84bf280d.

Co-authored-by: Chris Hostetter <hossman@apache.org>
Co-authored-by: Michael Gibney <michael@michaelgibney.net>
This commit is contained in:
Michael Gibney 2020-07-09 18:42:37 -07:00 committed by Chris Hostetter
parent 5a422db60e
commit 40e2122b5a
19 changed files with 1663 additions and 113 deletions

View File

@ -129,6 +129,9 @@ Optimizations
* SOLR-14610: ReflectMapWriter to use MethodHandle instead of old reflection (noble)
* SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()"
(hossman, Michael Gibney)
Bug Fixes
---------------------
(No changes)

View File

@ -33,6 +33,7 @@ import java.util.function.IntFunction;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
@ -40,6 +41,8 @@ import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
@ -116,7 +119,6 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
// allow a custom count acc to be used
if (countAcc == null) {
countAcc = new SlotAcc.CountSlotArrAcc(fcontext, slotCount);
countAcc.key = "count";
}
if (accs != null) {
@ -509,12 +511,12 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
/** Helper method used solely when looping over buckets to be returned in findTopSlots */
private void fillBucketFromSlot(SimpleOrderedMap<Object> target, Slot slot,
SlotAcc resortAcc) throws IOException {
final long count = countAcc.getCount(slot.slot);
target.add("count", count);
if (count <= 0 && !freq.processEmpty) return;
final int slotOrd = slot.slot;
countAcc.setValues(target, slotOrd);
if (countAcc.getCount(slotOrd) <= 0 && !freq.processEmpty) return;
if (collectAcc != null && slot.slot >= 0) {
collectAcc.setValues(target, slot.slot);
if (slotOrd >= 0 && collectAcc != null) {
collectAcc.setValues(target, slotOrd);
}
if (otherAccs == null && freq.subFacets.isEmpty()) return;
@ -689,7 +691,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
}
}
static class MultiAcc extends SlotAcc {
static class MultiAcc extends SlotAcc implements SweepableSlotAcc<SlotAcc> {
final SlotAcc[] subAccs;
MultiAcc(FacetContext fcontext, SlotAcc[] subAccs) {
@ -741,6 +743,65 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
acc.setValues(bucket, slotNum);
}
}
@Override
public SlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) {
final FacetFieldProcessor p = (FacetFieldProcessor) fcontext.processor;
int j = 0;
for (int i = 0; i < subAccs.length; i++) {
final SlotAcc acc = subAccs[i];
if (acc instanceof SweepableSlotAcc) {
SlotAcc replacement = ((SweepableSlotAcc<?>)acc).registerSweepingAccs(baseSweepingAcc);
if (replacement == null) {
// drop acc, do not increment j
continue;
} else if (replacement != acc || j < i) {
subAccs[j] = replacement;
}
} else if (j < i) {
subAccs[j] = acc;
}
j++;
}
switch (j) {
case 0:
return null;
case 1:
return subAccs[0];
default:
if (j == subAccs.length) {
return this;
} else {
// must resize final field subAccs
return new MultiAcc(fcontext, ArrayUtil.copyOfSubArray(subAccs, 0, j));
}
}
}
}
/**
* Helper method that subclasses can use to indicate they with to use sweeping.
* If {@link #countAcc} and {@link #collectAcc} support sweeping, then this method will:
* <ul>
* <li>replace {@link #collectAcc} with it's sweeping equivalent</li>
* <li>update {@link #allBucketsAcc}'s reference to {@link #collectAcc} (if it exists)</li>
* </ul>
*
* @return true if the above actions were taken
* @see SweepableSlotAcc
* @see SweepingCountSlotAcc
*/
protected boolean registerSweepingAccIfSupportedByCollectAcc() {
if (countAcc instanceof SweepingCountSlotAcc && collectAcc instanceof SweepableSlotAcc) {
final SweepingCountSlotAcc sweepingCountAcc = (SweepingCountSlotAcc)countAcc;
collectAcc = ((SweepableSlotAcc<?>)collectAcc).registerSweepingAccs(sweepingCountAcc);
if (allBucketsAcc != null) {
allBucketsAcc.collectAcc = collectAcc;
allBucketsAcc.sweepingCountAcc = sweepingCountAcc;
}
return true;
}
return false;
}
private static final SlotContext ALL_BUCKETS_SLOT_CONTEXT = new SlotContext(null) {
@ -766,6 +827,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
int collectAccSlot;
int otherAccsSlot;
long count;
SweepingCountSlotAcc sweepingCountAcc; // null unless/until sweeping is initialized
SpecialSlotAcc(FacetContext fcontext, SlotAcc collectAcc, int collectAccSlot, SlotAcc[] otherAccs, int otherAccsSlot) {
super(fcontext);
@ -822,6 +884,9 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
@Override
public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
if (sweepingCountAcc != null) {
sweepingCountAcc.setSweepValues(bucket, collectAccSlot);
}
if (collectAcc != null) {
collectAcc.setValues(bucket, collectAccSlot);
}

View File

@ -27,6 +27,7 @@ import org.apache.lucene.search.Query;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
@ -34,6 +35,9 @@ import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
* Base class for DV/UIF accumulating counts into an array by ordinal. It's
* for {@link org.apache.lucene.index.SortedDocValues} and {@link org.apache.lucene.index.SortedSetDocValues} only.
* It can handle terms (strings), not numbers directly but those encoded as terms, and is multi-valued capable.
* By default, this class assumes subclasses can support sweeping collection unless subclasses initialize <code>countAcc</code> directly in their constructors.
*
* @see SweepingCountSlotAcc
*/
abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
BytesRefBuilder prefixRef;
@ -56,6 +60,34 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
/** this BytesRef may be shared across calls and should be deep-cloned if necessary */
abstract protected BytesRef lookupOrd(int ord) throws IOException;
/**
* {@inheritDoc}
*
* This impl first initializes <code>countAcc</code> as a {@link SweepingCountSlotAcc} if null.
*/
@Override
protected void createAccs(long docCount, int slotCount) throws IOException {
if (countAcc == null) {
countAcc = new SweepingCountSlotAcc(slotCount, this);
}
super.createAccs(docCount, slotCount);
}
/**
* {@inheritDoc}
*
* This impl first initializes <code>countAcc</code> as a {@link SweepingCountSlotAcc} if null.
*/
@Override
void createCollectAcc(int numDocs, int numSlots) throws IOException {
if (countAcc == null) {
countAcc = new SweepingCountSlotAcc(numSlots, this);
}
super.createCollectAcc(numDocs, numSlots);
registerSweepingAccIfSupportedByCollectAcc();
}
@Override
public void process() throws IOException {
super.process();
@ -87,8 +119,10 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
if (freq.allBuckets) {
// count is irrelevant, but hardcoded in collect(...), so intercept/mask normal counts.
// Set here to prevent createAccs(...) from creating a 1-slot countAcc that will fail with AIOOBE
// NOTE: because collectAcc will be null, it is fine/irrelevant to set a countAcc that doesn't support sweeping
countAcc = SlotAcc.DEV_NULL_SLOT_ACC;
createAccs(nDocs, 1);
assert collectAcc == null;
otherAccs = accs; // accs is created above and set on allBucketsAcc; but during collection, setNextReader is called on otherAccs.
allBucketsAcc = new SpecialSlotAcc(fcontext, null, -1, accs, 0);
collectDocs();

View File

@ -26,14 +26,17 @@ import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.Filter;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
import org.apache.solr.search.facet.SweepCountAware.SegCountPerSeg;
import org.apache.solr.uninverting.FieldCacheImpl;
/**
@ -94,6 +97,10 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
return;
}
final SweepCountAccStruct base = SweepingCountSlotAcc.baseStructOf(this);
final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(this);
assert null != base;
// TODO: refactor some of this logic into a base class
boolean countOnly = collectAcc==null && allBucketsAcc==null;
boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();
@ -118,16 +125,21 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
if (freq.perSeg != null) accumSeg = canDoPerSeg && freq.perSeg; // internal - override perSeg heuristic
final int maxSize = others.size() + 1; // others + base
final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
Filter filter = fcontext.base.getTopFilter();
final DocIdSetIterator[] subIterators = new DocIdSetIterator[maxSize];
final CountSlotAcc[] activeCountAccs = new CountSlotAcc[maxSize];
for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
LeafReaderContext subCtx = leaves.get(subIdx);
setNextReaderFirstPhase(subCtx);
DocIdSet dis = filter.getDocIdSet(subCtx, null); // solr docsets already exclude any deleted docs
DocIdSetIterator disi = dis.iterator();
final SweepDISI disi = SweepDISI.newInstance(base, others, subIterators, activeCountAccs, subCtx);
if (disi == null) {
continue;
}
LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
SortedDocValues singleDv = null;
SortedSetDocValues multiDv = null;
@ -135,7 +147,13 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
// TODO: get sub from multi?
multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
if (multiDv == null) {
multiDv = DocValues.emptySortedSet();
if (countOnly) {
continue;
} else {
multiDv = DocValues.emptySortedSet();
}
} else if (countOnly && multiDv.getValueCount() < 1){
continue;
}
// some codecs may optimize SortedSet storage for single-valued fields
// this will be null if this is not a wrapped single valued docvalues.
@ -145,12 +163,16 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
} else {
singleDv = subCtx.reader().getSortedDocValues(sf.getName());
if (singleDv == null) {
singleDv = DocValues.emptySorted();
if (countOnly) {
continue;
} else {
singleDv = DocValues.emptySorted();
}
} else if (countOnly && singleDv.getValueCount() < 1) {
continue;
}
}
LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
if (singleDv != null) {
if (accumSeg) {
collectPerSeg(singleDv, disi, toGlobal);
@ -174,7 +196,7 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
}
}
reuse = null; // better GC
Arrays.fill(reuse, null); // better GC
}
@Override
@ -182,9 +204,9 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
return si.lookupOrd(ord);
}
private void collectPerSeg(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
int segMax = singleDv.getValueCount() + 1;
final int[] counts = getCountArr( segMax );
private void collectPerSeg(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
int segMax = singleDv.getValueCount();
final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax);
/** alternate trial implementations
// ord
@ -202,73 +224,110 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) {
FieldCacheImpl.SortedDocValuesImpl.Iter fc = (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
counts[fc.getOrd(doc) + 1]++;
final int segOrd = fc.getOrd(doc);
if (segOrd >= 0) {
final int maxIdx = disi.registerCounts(segCounter);
segCounter.incrementCount(segOrd, 1, maxIdx);
}
}
} else {
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (singleDv.advanceExact(doc)) {
counts[singleDv.ordValue() + 1]++;
final int segOrd = singleDv.ordValue();
if (segOrd >= 0) {
final int maxIdx = disi.registerCounts(segCounter);
segCounter.incrementCount(segOrd, 1, maxIdx);
}
}
}
}
// convert segment-local counts to global counts
for (int i=1; i<segMax; i++) {
int segCount = counts[i];
if (segCount > 0) {
int slot = toGlobal == null ? (i - 1) : (int) toGlobal.get(i - 1);
countAcc.incrementCount(slot, segCount);
}
}
segCounter.register(disi.countAccs, toGlobal, segMax - 1);
}
private void collectPerSeg(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
private SegCountPerSeg getSegCountPerSeg(SweepDISI disi, int segMax) {
final int size = disi.size;
return new SegCountPerSeg(getSegmentCountArrays(segMax, size), getBoolArr(segMax), segMax, size);
}
private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedDocValues dv) {
return new SegCountGlobal(disi.countAccs);
}
private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedSetDocValues dv) {
return new SegCountGlobal(disi.countAccs);
}
private void collectPerSeg(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
int segMax = (int)multiDv.getValueCount();
final int[] counts = getCountArr( segMax );
final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax);
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (multiDv.advanceExact(doc)) {
for(;;) {
final int maxIdx = disi.registerCounts(segCounter);
for (;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
counts[segOrd]++;
segCounter.incrementCount(segOrd, 1, maxIdx);
}
}
}
for (int i=0; i<segMax; i++) {
int segCount = counts[i];
if (segCount > 0) {
int slot = toGlobal == null ? (i) : (int) toGlobal.get(i);
countAcc.incrementCount(slot, segCount);
}
}
segCounter.register(disi.countAccs, toGlobal, segMax - 1);
}
private int[] reuse;
private int[] getCountArr(int maxNeeded) {
if (reuse == null) {
private boolean[] reuseBool;
private boolean[] getBoolArr(int maxNeeded) {
if (reuseBool == null) {
// make the count array large enough for any segment
// FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
reuse = new int[(int) si.getValueCount() + 1];
reuseBool = new boolean[(int) si.getValueCount() + 1];
} else {
Arrays.fill(reuse, 0, maxNeeded, 0);
Arrays.fill(reuseBool, 0, maxNeeded, false);
}
return reuse;
return reuseBool;
}
private void collectDocs(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
private int[][] reuse = new int[12][];
private int[] getCountArr(int maxNeeded, int idx) {
if (idx >= reuse.length) {
reuse = Arrays.copyOf(reuse, idx + 1);
}
if (reuse[idx] == null) {
// make the count array large enough for any segment
// FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
reuse[idx] = new int[(int) si.getValueCount() + 1];
} else {
Arrays.fill(reuse[idx], 0, maxNeeded, 0);
}
return reuse[idx];
}
private int[][] getSegmentCountArrays(int segMax, int size) {
int[][] ret = new int[size][];
int i = size - 1;
do {
ret[i] = getCountArr(segMax, i);
} while (i-- > 0);
return ret;
}
private void collectDocs(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
int doc;
final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv);
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (singleDv.advanceExact(doc)) {
final int maxIdx = disi.registerCounts(segCounter);
int segOrd = singleDv.ordValue();
collect(doc, segOrd, toGlobal);
collect(doc, segOrd, toGlobal, segCounter, maxIdx, disi.collectBase());
}
}
}
private void collectCounts(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
private void collectCounts(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv);
int doc;
if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) {
@ -277,7 +336,8 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
int segOrd = fc.getOrd(doc);
if (segOrd < 0) continue;
int ord = (int)toGlobal.get(segOrd);
countAcc.incrementCount(ord, 1);
int maxIdx = disi.registerCounts(segCounter);
segCounter.incrementCount(ord, 1, maxIdx);
}
} else {
@ -286,53 +346,60 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
if (singleDv.advanceExact(doc)) {
int segOrd = singleDv.ordValue();
int ord = (int) toGlobal.get(segOrd);
countAcc.incrementCount(ord, 1);
int maxIdx = disi.registerCounts(segCounter);
segCounter.incrementCount(ord, 1, maxIdx);
}
}
}
}
private void collectDocs(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
private void collectDocs(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv);
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (multiDv.advanceExact(doc)) {
final int maxIdx = disi.registerCounts(segCounter);
final boolean collectBase = disi.collectBase();
for(;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
collect(doc, segOrd, toGlobal);
collect(doc, segOrd, toGlobal, segCounter, maxIdx, collectBase);
}
}
}
}
private void collectCounts(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
private void collectCounts(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv);
int doc;
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (multiDv.advanceExact(doc)) {
final int maxIdx = disi.registerCounts(segCounter);
for(;;) {
int segOrd = (int)multiDv.nextOrd();
if (segOrd < 0) break;
int ord = (int)toGlobal.get(segOrd);
countAcc.incrementCount(ord, 1);
segCounter.incrementCount(ord, 1, maxIdx);
}
}
}
}
private void collect(int doc, int segOrd, LongValues toGlobal) throws IOException {
private void collect(int doc, int segOrd, LongValues toGlobal, SegCountGlobal segCounter, int maxIdx, boolean collectBase) throws IOException {
int ord = (toGlobal != null && segOrd >= 0) ? (int)toGlobal.get(segOrd) : segOrd;
int arrIdx = ord - startTermIndex;
// This code handles faceting prefixes, which narrows the range of ords we want to collect.
// Its not an error for an ord to fall outside this range we simply want to skip it.
if (arrIdx >= 0 && arrIdx < nTerms) {
countAcc.incrementCount(arrIdx, 1);
if (collectAcc != null) {
collectAcc.collect(doc, arrIdx, slotContext);
}
if (allBucketsAcc != null) {
allBucketsAcc.collect(doc, arrIdx, slotContext);
segCounter.incrementCount(arrIdx, 1, maxIdx);
if (collectBase) {
if (collectAcc != null) {
collectAcc.collect(doc, arrIdx, slotContext);
}
if (allBucketsAcc != null) {
allBucketsAcc.collect(doc, arrIdx, slotContext);
}
}
}
}

View File

@ -310,7 +310,6 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
// allow a custom count acc to be used
if (countAcc == null) {
countAcc = new SlotAcc.CountSlotArrAcc(fcontext, slotCount);
countAcc.key = "count";
}
for (Map.Entry<String,AggValueSource> entry : freq.getFacetStats().entrySet()) {

View File

@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
/**
* To be implemented by CountSlotAccs that wish to expose a read-only interface
*/
interface ReadOnlyCountSlotAcc {
public long getCount(int slot);
public int compare(int slotA, int slotB);
public Object getValue(int slotNum) throws IOException;
}

View File

@ -37,6 +37,7 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -56,6 +57,7 @@ public class RelatednessAgg extends AggValueSource {
private static final String RELATEDNESS = "relatedness";
private static final String FG_POP = "foreground_popularity";
private static final String BG_POP = "background_popularity";
public static final String SWEEP_COLLECTION = "sweep_collection";
// needed for distrib calculation
private static final String FG_SIZE = "foreground_size";
@ -66,8 +68,11 @@ public class RelatednessAgg extends AggValueSource {
final protected Query fgQ;
final protected Query bgQ;
protected double min_pop = 0.0D;
private Boolean useSweep;
public static final String NAME = RELATEDNESS;
private static final boolean DEFAULT_SWEEP_COLLECTION = true;
public RelatednessAgg(Query fgQ, Query bgQ) {
super(NAME);
// NOTE: ideally we don't want to assume any defaults *yet* if fgQ/bgQ are null
@ -87,7 +92,10 @@ public class RelatednessAgg extends AggValueSource {
public void setOpts(QParser parser) {
final boolean isShard = parser.getReq().getParams().getBool(ShardParams.IS_SHARD, false);
SolrParams opts = parser.getLocalParams();
if (null != opts) {
if (null == opts) {
this.useSweep = DEFAULT_SWEEP_COLLECTION;
} else {
this.useSweep = opts.getBool(SWEEP_COLLECTION, DEFAULT_SWEEP_COLLECTION);
if (!isShard) { // ignore min_pop if this is a shard request
this.min_pop = opts.getDouble("min_popularity", 0.0D);
}
@ -97,7 +105,7 @@ public class RelatednessAgg extends AggValueSource {
@Override
public String description() {
// TODO: need better output processing when we start supporting null fgQ/bgQ in constructor
return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop+")";
return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop + ",useSweep="+useSweep+")";
}
@Override
@ -163,9 +171,127 @@ public class RelatednessAgg extends AggValueSource {
return new Merger(this);
}
private static final class SweepSKGSlotAcc extends SlotAcc {
private final int minCount; // pre-calculate for a given min_popularity
private final long fgSize;
private final long bgSize;
private final ReadOnlyCountSlotAcc fgCount;
private final ReadOnlyCountSlotAcc bgCount;
private double[] relatedness;
private static final int NO_ALL_BUCKETS = -2;
private static final int ALL_BUCKETS_UNINITIALIZED = -1;
// we can't get the allBuckets info from the slotContext in collect(), b/c the whole point of
// sweep collection is that the "collect" methods aren't called.
// So this is the compromise: note in construction either that we're using a processor w/NO_ALL_BUCKETS
// or that we don't know the bucket yet (ALL_BUCKETS_UNINITIALIZED) and fill it in in getValues
// where we can check against the processor
private int allBucketsSlot;
public SweepSKGSlotAcc(double minPopularity, FacetContext fcontext, int numSlots, long fgSize, long bgSize, ReadOnlyCountSlotAcc fgCount, ReadOnlyCountSlotAcc bgCount) {
super(fcontext);
this.minCount = (int) Math.ceil(minPopularity * bgSize);
this.fgSize = fgSize;
this.bgSize = bgSize;
this.fgCount = fgCount;
this.bgCount = bgCount;
relatedness = new double[numSlots];
Arrays.fill(relatedness, 0, numSlots, Double.NaN);
// any processor that can (currently) result in the use of SweepSKGSlotAcc *should* be a
// FacetFieldProcessor -- but don't assume that will always be true...
this.allBucketsSlot = NO_ALL_BUCKETS;
if (fcontext.processor instanceof FacetFieldProcessor
// NOTE: if this instanceof/cast changes, getValues needs updated as well
&& ((FacetFieldProcessor)fcontext.processor).freq.allBuckets) {
this.allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
}
}
@Override
public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext) throws IOException {
throw new UnsupportedOperationException("collect() not supported, this SlotAcc impl only usable for sweeping");
}
@Override
public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
throw new UnsupportedOperationException("collect() not supported, this SlotAcc impl only usable for sweeping");
}
private double getRelatedness(int slot) {
final double cachedRelatedness = relatedness[slot];
if (Double.isNaN(cachedRelatedness)) {
final long fg_count = fgCount.getCount(slot);
final long bg_count = bgCount.getCount(slot);
if (minCount > 0) {
// if min_pop is configured, and either (fg|bg) popularity is lower then that value
// then "this.relatedness=-Infinity" so it sorts below any "valid" relatedness scores
if (fg_count < minCount || bg_count < minCount) {
return relatedness[slot] = Double.NEGATIVE_INFINITY;
}
}
return relatedness[slot] = computeRelatedness(fg_count, fgSize, bg_count, bgSize);
} else {
return cachedRelatedness;
}
}
public int compare(int slotA, int slotB) {
int r = Double.compare(getRelatedness(slotA), getRelatedness(slotB));
if (0 == r) {
r = Long.compare(fgCount.getCount(slotA), fgCount.getCount(slotB));
}
if (0 == r) {
r = Long.compare(bgCount.getCount(slotA), bgCount.getCount(slotB));
}
return r;
}
@Override
public Object getValue(int slotNum) {
final BucketData slotVal;
if (NO_ALL_BUCKETS != allBucketsSlot) {
// there's no reason why a processor should be resizing SlotAccs in the middle of getValue,
// but we're going to be vigilent against that possibility just in case...
if (ALL_BUCKETS_UNINITIALIZED == allBucketsSlot
|| allBucketsSlot == slotNum) {
assert fcontext.processor instanceof FacetFieldProcessor
: "code changed, non FacetFieldProcessor sweeping w/allBuckets?!?";
allBucketsSlot = ((FacetFieldProcessor)fcontext.processor).allBucketsAcc.collectAccSlot;
}
}
if (slotNum == allBucketsSlot) {
slotVal = new BucketData(null);
} else {
slotVal = new BucketData(fgCount.getCount(slotNum), fgSize, bgCount.getCount(slotNum), bgSize, getRelatedness(slotNum));
}
return slotVal.externalize(fcontext.isShard());
}
@Override
public void reset() throws IOException {
Arrays.fill(relatedness, Double.NaN);
if (allBucketsSlot != NO_ALL_BUCKETS) {
allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
}
}
@Override
public void resize(Resizer resizer) {
relatedness = resizer.resize(relatedness, Double.NaN);
}
@Override
public void close() throws IOException {
relatedness = null;
}
}
private static final String IMPLIED_KEY = "implied";
private static final class SKGSlotAcc extends SlotAcc {
private static final class SKGSlotAcc extends SlotAcc implements SweepableSlotAcc<SlotAcc> {
private final RelatednessAgg agg;
private BucketData[] slotvalues;
private final DocSet fgSet;
@ -181,9 +307,30 @@ public class RelatednessAgg extends AggValueSource {
// cache the set sizes for frequent re-use on every slot
this.fgSize = fgSet.size();
this.bgSize = bgSet.size();
this.slotvalues = new BucketData[numSlots];
this.slotvalues = new BucketData[numSlots]; //TODO: avoid initializing array until we know we're not doing sweep collection?
reset();
}
/**
* If called, may register SweepingAccs for fg and bg set based on whether
* user indicated sweeping should be used (default)
*
* @returns null if any SweepingAccs were registered since no other collection is needed for relatedness
*/
@Override
public SKGSlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) {
if (!this.agg.useSweep) {
return this;
} else {
final ReadOnlyCountSlotAcc fgCount = baseSweepingAcc.add(key + "!fg", fgSet, slotvalues.length);
final ReadOnlyCountSlotAcc bgCount = baseSweepingAcc.add(key + "!bg", bgSet, slotvalues.length);
SweepSKGSlotAcc readOnlyReplacement = new SweepSKGSlotAcc(agg.min_pop, fcontext, slotvalues.length, fgSize, bgSize, fgCount, bgCount);
readOnlyReplacement.key = key;
baseSweepingAcc.registerMapping(this, readOnlyReplacement);
return null;
}
}
private void processSlot(int slot, IntFunction<SlotContext> slotContext) throws IOException {
assert null != slotContext;
@ -213,7 +360,12 @@ public class RelatednessAgg extends AggValueSource {
assert null == fcontext.filter;
}
// ...and in which case we should just use the current base
final DocSet slotSet = null == slotQ ? fcontext.base : fcontext.searcher.getDocSet(slotQ);
final DocSet slotSet;
if (null == slotQ) {
slotSet = fcontext.base;
} else {
slotSet = fcontext.searcher.getDocSet(slotQ);
}
slotVal.incSizes(fgSize, bgSize);
slotVal.incCounts(fgSet.intersectionSize(slotSet),
@ -334,6 +486,16 @@ public class RelatednessAgg extends AggValueSource {
this.implied = true;
}
public BucketData(long fg_count, long fg_size, long bg_count, long bg_size, double relatedness) {
this.fg_count = fg_count;
this.fg_size = fg_size;
this.fg_pop = roundTo5Digits((double) fg_count / bg_size); // yes, BACKGROUND size is intentional
this.bg_count = bg_count;
this.bg_size = bg_size;
this.bg_pop = roundTo5Digits((double) bg_count / bg_size);
this.relatedness = relatedness;
}
/**
* Increment both the foreground &amp; background <em>counts</em> for the current bucket, reseting any
* derived values that may be cached

View File

@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
final class SingletonDISI extends SweepDISI {
private final DocIdSetIterator backing;
private final boolean isBase;
SingletonDISI(DocIdSetIterator backing, CountSlotAcc[] countAccs, boolean isBase) {
super(1, countAccs);
this.backing = backing;
this.isBase = isBase;
}
@Override
public int nextDoc() throws IOException {
return backing.nextDoc();
}
@Override
public boolean collectBase() {
return isBase;
}
@Override
public int registerCounts(SegCounter segCounter) {
return 0;
}
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import org.apache.solr.search.DocIterator;
final class SingletonDocIterator extends SweepDocIterator {
private final DocIterator backing;
private final boolean isBase;
SingletonDocIterator(DocIterator backing, boolean isBase) {
super(1);
this.backing = backing;
this.isBase = isBase;
}
@Override
public boolean hasNext() {
return backing.hasNext();
}
@Override
public int nextDoc() {
return backing.nextDoc();
}
@Override
public boolean collectBase() {
return isBase;
}
@Override
public int registerCounts(SegCounter segCounts) {
return 0;
}
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.function.IntFunction;
@ -52,6 +53,8 @@ public abstract class SlotAcc implements Closeable {
this.fcontext = fcontext;
}
@Override public String toString() { return key; }
/**
* NOTE: this currently detects when it is being reused and calls resetIterators by comparing reader ords
* with previous calls to setNextReader. For this reason, current users must call setNextReader
@ -597,9 +600,222 @@ public abstract class SlotAcc implements Closeable {
}
}
abstract static class CountSlotAcc extends SlotAcc {
/**
* Implemented by some SlotAccs if they are capable of being used for
* sweep collecting in compatible facet processors
* @see FacetFieldProcessor#registerSweepingAccIfSupportedByCollectAcc()
*/
static interface SweepableSlotAcc<T extends SlotAcc> {
/**
* Called by processors if they support sweeping. Implementations will often
* return self or null (the latter indicating that all necessary collection will
* be covered by the "sweeping" data structures registered with the specified
* baseSweepingAcc as a result of the call to this method).
*
* If an implementing instance chooses to replace itself with another {@link SlotAcc}, it must
* call {@link SweepingCountSlotAcc#registerMapping(SlotAcc, SlotAcc)} on the specified
* baseSweepingAcc to notify it of the mapping from original SlotAcc to the SlotAcc that should
* be used for purposes of read access. It is the responsibility of the specified {@link SweepingCountSlotAcc}
* to ensure proper placement/accessibility of the SlotAcc to be used for read access.
*
* The replacement SlotAcc registered via {@link SweepingCountSlotAcc#registerMapping(SlotAcc, SlotAcc)}
* will be responsible for output via its {@link SlotAcc#setValues(SimpleOrderedMap, int)} method.
* An implementer of this method may register such a replacement, and also return a non-null
* SlotAcc to be used for normal collection (via {@link FacetFieldProcessor#collectAcc}). In this case,
* the implementer should take care that the returned {@link SlotAcc} is different from the {@link SlotAcc}
* registered for the purpose of output -- with the former overriding {@link SlotAcc#setValues(SimpleOrderedMap, int)}
* as a no-op, to prevent setting duplicate values.
*
* @param baseSweepingAcc - never null, where the SlotAcc may register domains for sweep collection,
* and must register mappings of new read-access SlotAccs that result from this call.
* @return SlotAcc to be used for purpose of collection. If null then collect methods will
* never be called on this SlotAcc.
*/
public T registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc);
}
/**
* A simple data structure to {@link DocSet} domains with an associated {@link CountSlotAcc}. This may be used
* to support sweep count accumulation over different {@link DocSet} domains, but the concept is perfectly applicable
* to encapsulating the relevant state for simple "non-sweep" collection as well (in which case {@link SweepCountAccStruct#docSet}
* would be {@link FacetContext#base}, {@link SweepCountAccStruct#countAcc} would be {@link FacetProcessor#countAcc}, and
* {@link SweepCountAccStruct#isBase} would trivially be "true").
*/
static final class SweepCountAccStruct {
final DocSet docSet;
final boolean isBase;
final CountSlotAcc countAcc;
public SweepCountAccStruct(DocSet docSet, boolean isBase, CountSlotAcc countAcc) {
this.docSet = docSet;
this.isBase = isBase;
this.countAcc = countAcc;
}
public SweepCountAccStruct(SweepCountAccStruct t, DocSet replaceDocSet) {
this.docSet = replaceDocSet;
this.isBase = t.isBase;
this.countAcc = t.countAcc;
}
/**
* Because sweep collection offloads "collect" methods to count accumulation code,
* it is helpful to provide a read-only view over the backing {@link CountSlotAcc}
*
* @return - a read-only view over {@link #countAcc}
*/
public ReadOnlyCountSlotAcc roCountAcc() {
return countAcc;
}
@Override public String toString() {
return this.countAcc.toString();
}
}
/**
* Special CountSlotAcc used by processors that support sweeping to decide what to sweep over and how to "collect"
* when doing the sweep.
*
* This class may be used by instances of {@link SweepableSlotAcc} to register DocSet domains (via {@link SweepingCountSlotAcc#add})
* over which to sweep-collect facet counts.
*
* @see SweepableSlotAcc#registerSweepingAccs
*/
static class SweepingCountSlotAcc extends CountSlotArrAcc {
static final String SWEEP_COLLECTION_DEBUG_KEY = "sweep_collection";
private final SimpleOrderedMap<Object> debug;
private final FacetFieldProcessor p;
final SweepCountAccStruct base;
final List<SweepCountAccStruct> others = new ArrayList<>();
private final List<SlotAcc> output = new ArrayList<>();
SweepingCountSlotAcc(int numSlots, FacetFieldProcessor p) {
super(p.fcontext, numSlots);
this.p = p;
this.base = new SweepCountAccStruct(fcontext.base, true, this);
final FacetDebugInfo fdebug = fcontext.getDebugInfo();
this.debug = null != fdebug ? new SimpleOrderedMap<>() : null;
if (null != this.debug) {
fdebug.putInfoItem(SWEEP_COLLECTION_DEBUG_KEY, debug);
debug.add("base", key);
debug.add("accs", new ArrayList<String>());
debug.add("mapped", new ArrayList<String>());
}
}
/**
* Called by SweepableSlotAccs to register new DocSet domains for sweep collection
*
* @param key
* assigned to the returned SlotAcc, and used for debugging
* @param docs
* the domain over which to sweep
* @param numSlots
* the number of slots
* @return a read-only representation of the count acc which is guaranteed to be populated after sweep count
* collection
*/
public ReadOnlyCountSlotAcc add(String key, DocSet docs, int numSlots) {
final CountSlotAcc count = new CountSlotArrAcc(fcontext, numSlots);
count.key = key;
final SweepCountAccStruct ret = new SweepCountAccStruct(docs, false, count);
if (null != debug) {
@SuppressWarnings("unchecked")
List<String> accsDebug = (List<String>) debug.get("accs");
accsDebug.add(ret.toString());
}
others.add(ret);
return ret.roCountAcc();
}
/**
* When a {@link SweepableSlotAcc} replaces itself (for the purpose of collection) with a different {@link SlotAcc}
* instance, it must register that replacement by calling this method with itself as the fromAcc param, and with the
* new replacement {@link SlotAcc} as the toAcc param. The two SlotAccs must have the same {@link SlotAcc#key}.
*
* It is the responsibility of this method to insure that {@link FacetFieldProcessor} references to fromAcc (other than
* those within {@link FacetFieldProcessor#collectAcc}, which are set directly by the return value of
* {@link SweepableSlotAcc#registerSweepingAccs(SweepingCountSlotAcc)}) are replaced
* by references to toAcc. Such references would include, e.g., {@link FacetFieldProcessor#sortAcc}.
*
* It is also this method's responsibility to insure that read access to toAcc (via toAcc's {@link SlotAcc#setValues(SimpleOrderedMap, int)}
* method) is provided via this instance's {@link #setValues(SimpleOrderedMap, int)} method.
*
* @param fromAcc - the {@link SlotAcc} to be replaced (this will normally be the caller of this method).
* @param toAcc - the replacement {@link SlotAcc}
*
* @see SweepableSlotAcc#registerSweepingAccs(SweepingCountSlotAcc)
*/
public void registerMapping(SlotAcc fromAcc, SlotAcc toAcc) {
assert fromAcc.key.equals(toAcc.key);
output.add(toAcc);
if (p.sortAcc == fromAcc) {
p.sortAcc = toAcc;
}
if (null != debug) {
@SuppressWarnings("unchecked")
List<String> mappedDebug = (List<String>) debug.get("mapped");
mappedDebug.add(fromAcc.toString());
}
}
/**
* Always populates the bucket with the current count for that slot. If the count is positive, or if
* <code>processEmpty==true</code>, then this method also populates the values from mapped "output" accumulators.
*
* @see #setSweepValues
*/
@Override
public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
super.setValues(bucket, slotNum);
if (0 < getCount(slotNum) || fcontext.processor.freq.processEmpty) {
setSweepValues(bucket, slotNum);
}
}
/**
* Populates the bucket with the values from all mapped "output" accumulators for the specified slot.
*
* This method exists because there are some contexts (namely SpecialSlotAcc, for allBuckets, etc.) in which "base"
* count is tracked differently, via getSpecialCount(). For such cases, we need a method that allows the caller to
* directly coordinate calling {@link SlotAcc#setValues} on the sweeping output accs, while avoiding the inclusion
* of {@link CountSlotAcc#setValues CountSlotAcc.setValues}
*/
public void setSweepValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
for (SlotAcc acc : output) {
acc.setValues(bucket, slotNum);
}
}
/**
* Helper method for code that wants to operating in a sweeping manner even if the current processor
* is not using sweeping.
*
* @returns struct that wraps the {@link FacetContext#base} unless the {@link FacetProcessor#countAcc} is a {@link SweepingCountSlotAcc}
*/
public static SweepCountAccStruct baseStructOf(FacetProcessor<?> processor) {
if (processor.countAcc instanceof SweepingCountSlotAcc) {
return ((SweepingCountSlotAcc) processor.countAcc).base;
}
return new SweepCountAccStruct(processor.fcontext.base, true, processor.countAcc);
}
/**
* Helper method for code that wants to operating in a sweeping manner even if the current processor
* is not using sweeping
*
* @returns empty list unless the {@link FacetProcessor#countAcc} is a {@link SweepingCountSlotAcc}
*/
public static List<SweepCountAccStruct> otherStructsOf(FacetProcessor<?> processor) {
if (processor.countAcc instanceof SweepingCountSlotAcc) {
return ((SweepingCountSlotAcc) processor.countAcc).others;
}
return Collections.emptyList();
}
}
abstract static class CountSlotAcc extends SlotAcc implements ReadOnlyCountSlotAcc {
public CountSlotAcc(FacetContext fcontext) {
super(fcontext);
// assume we are the 'count' by default unless/untill our creator overrides this
this.key = "count";
}
public abstract void incrementCount(int slot, long count);

View File

@ -0,0 +1,187 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.LongValues;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
/**
* Implemented by extensions of doc iterators (i.e., {@link DocIdSetIterator}, {@link DocIterator} over one or
* more domain, to support facet count accumulation corresponding to each domain (and via {@link #collectBase()}
* to inform the necessity of "collection" for a single optional backing "base" set).
*/
interface SweepCountAware {
/**
* Returns true if one of the domains underlying this iterator is the "base" domain, and if that base domain
* contains the doc on which this iterator is currently positioned. If "true", then "collection" may be necessary
* for the current doc.
*
* For each iterator position (each doc), {@link #registerCounts(SegCounter)} must be called before this method.
*/
boolean collectBase();
/**
* Called on a positioned doc iterator to register array index mappings for domains that contain the current
* doc. Implementations register these mappings by calling {@link SegCounter#map(int, int)} on the specified
* segCounts param.
*
* For each iterator position, this method must be called before {@link #collectBase()}
*
* @param segCounts - to register mappings of array indices for domains that contain this doc
* @return - the max index of an array representing the domains that contain the current doc. If "n" domains
* contain the current doc, the return value would be "n - 1"
* @throws IOException - if thrown by advancing an underlying doc iterator
*/
int registerCounts(SegCounter segCounts) throws IOException;
/**
* Used to coordinate multiple count accumulations over multiple domains. Implementers will have "n" backing term-ord-indexed
* counts -- one for each domain over which count accumulation is to be performed. For each doc, count accumulation
* takes place in two phases, invoked by a "driver" (e.g., {@link FacetFieldProcessor}) that manages iteration over the
* union of doc domains:
*
* First, the driver passes this object as the param to {@link SweepCountAware#registerCounts(SegCounter)}, which
* calls {@link #map(int, int)} on "this" to map the static "allIdx" (allIdx < n) for each active backing domain to
* a transient "activeIdx" for counts corresponding to active domains (activeIdx < count(allIdx) <= n). (The return value
* of {@link SweepCountAware#registerCounts(SegCounter)} indicates to the "driver" the max "active counts" index (for
* domains that contain the current doc).
*
* The driver then calls {@link #incrementCount(int, int, int)}, passing the term ord, increment amount (usually "1"),
* and the max "active counts" index returned from {@link SweepCountAware#registerCounts(SegCounter)} in the first
* phase. The "max active counts index" param is used as the limit (inclusive) to iterate count accumulation over each
* of the "active" domains for the current doc.
*
* @see SweepCountAware#registerCounts(SegCounter)
*/
static interface SegCounter {
/**
* Mark/map a given domain/CountSlotAcc as active (eligible for count accumulation) for the current doc.
*
* @param allIdx - the static index of the domain/CountSlotAcc to be "activated" for the current doc
* @param activeIdx - the transient "active index" (for the purpose of actual count accumulation) to which to map
* the domain/CountSlotAcc indicated by "allIdx".
*/
void map(int allIdx, int activeIdx);
/**
* Increments counts for active domains/CountSlotAccs.
*
* @param ord - the term ord (either global ord per-seg) for which to increment counts
* @param inc - the amount by which to increment the count for the specified term ord
* @param maxIdx - the max index (inclusive) of active domains/CountSlotAccs to be incremented for the current doc
*/
void incrementCount(int ord, int inc, int maxIdx);
}
/**
* This class is designed to count over global term ords ({@link SegCountPerSeg} provides equivalent functionality for
* per-segment term ords).
*
* @see SegCountPerSeg
*/
static class SegCountGlobal implements SegCounter {
private final CountSlotAcc[] allCounts;
private final CountSlotAcc[] activeCounts;
public SegCountGlobal(CountSlotAcc[] allCounts) {
this.allCounts = allCounts;
this.activeCounts = Arrays.copyOf(allCounts, allCounts.length);
}
@Override
public void map(int allIdx, int activeIdx) {
activeCounts[activeIdx] = allCounts[allIdx];
}
@Override
public final void incrementCount(int globalOrd, int inc, int maxIdx) {
int i = maxIdx;
do {
activeCounts[i].incrementCount(globalOrd, inc);
} while (i-- > 0);
}
}
/**
* This class is designed to count over per-segment term ords ({@link SegCountGlobal} provides equivalent functionality for
* global term ords).
*
* @see SegCountGlobal
*/
static class SegCountPerSeg implements SegCounter {
protected final int[][] allSegCounts;
private final int[][] activeSegCounts;
private final boolean[] seen;
public SegCountPerSeg(int[][] allSegCounts, boolean[] seen, int segMax, int size) {
this.allSegCounts = allSegCounts;
this.activeSegCounts = Arrays.copyOf(this.allSegCounts, size);
this.seen = seen;
}
@Override
public final void map(int allIdx, int activeIdx) {
activeSegCounts[activeIdx] = allSegCounts[allIdx];
}
@Override
public final void incrementCount(int segOrd, int inc, int maxIdx) {
seen[segOrd] = true;
int i = maxIdx;
do {
activeSegCounts[i][segOrd] += inc;
} while (i-- > 0);
}
/**
* Maps accumulated per-segment term ords to global term ords and increments global slots on the specified countAccs
* accordingly. The index of each CountSlotAcc in the specified countAccs array must correspond to the
* the static index of its associated count accumulation doc domain and per-seg count array.
*
* @param countAccs - global-scope CountSlotAccs (one for each domain) to be incremented for the most recently accumulated
* segment
* @param toGlobal - mapping of per-segment term ords to global term ords for the most recently accumulated segment
* @param maxSegOrd - the max per-seg term ord for the most recently accumulated segment
*/
public void register(CountSlotAcc[] countAccs, LongValues toGlobal, int maxSegOrd) {
int segOrd = maxSegOrd;
final int maxIdx = countAccs.length - 1;
for (;;) {
if (seen[segOrd]) {
int i = maxIdx;
int slot = toGlobal == null ? segOrd : (int)toGlobal.get(segOrd);
do {
final int inc = allSegCounts[i][segOrd];
if (inc > 0) {
countAccs[i].incrementCount(slot, inc);
}
} while (i-- > 0);
}
if (--segOrd < 0) {
break;
}
}
}
}
}

View File

@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
public abstract class SweepDISI extends DocIdSetIterator implements SweepCountAware {
public final int size;
final CountSlotAcc[] countAccs;
public SweepDISI(int size, CountSlotAcc[] countAccs) {
this.size = size;
this.countAccs = countAccs;
}
private static boolean addAcc(SweepCountAccStruct entry, DocIdSetIterator[] subIterators, CountSlotAcc[] activeCountAccs, LeafReaderContext subCtx, int idx) throws IOException {
final DocIdSet docIdSet = entry.docSet.getTopFilter().getDocIdSet(subCtx, null);
if (docIdSet == null || (subIterators[idx] = docIdSet.iterator()) == null) {
return false;
}
activeCountAccs[idx] = entry.countAcc;
return true;
}
static SweepDISI newInstance(SweepCountAccStruct base, List<SweepCountAccStruct> others, DocIdSetIterator[] subIterators, CountSlotAcc[] activeCountAccs, LeafReaderContext subCtx) throws IOException {
int activeCt = 0;
final int baseIdx;
if (base == null || !addAcc(base, subIterators, activeCountAccs, subCtx, activeCt)) {
baseIdx = -1;
} else {
baseIdx = activeCt++;
}
for (SweepCountAccStruct entry : others) {
if (addAcc(entry, subIterators, activeCountAccs, subCtx, activeCt)) {
activeCt++;
}
}
switch (activeCt) {
case 0:
return null;
case 1:
return new SingletonDISI(subIterators[0], activeCountAccs, baseIdx >= 0); // solr docsets already exclude any deleted docs
default:
return new UnionDISI(subIterators, activeCountAccs, activeCt, baseIdx);
}
}
@Override
public int docID() {
throw new UnsupportedOperationException("Not supported.");
}
@Override
public int advance(int target) throws IOException {
throw new UnsupportedOperationException("Not supported.");
}
@Override
public long cost() {
throw new UnsupportedOperationException("Not supported.");
}
}

View File

@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
abstract class SweepDocIterator implements DocIterator, SweepCountAware {
public final int size;
public SweepDocIterator(int size) {
this.size = size;
}
static class SweepIteratorAndCounts {
final SweepDocIterator iter;
final CountSlotAcc[] countAccs;
public SweepIteratorAndCounts(SweepDocIterator iter, CountSlotAcc[] countAccs) {
this.iter = iter;
this.countAccs = countAccs;
}
}
static SweepIteratorAndCounts newInstance(SweepCountAccStruct base, List<SweepCountAccStruct> others) throws IOException {
final int activeCt;
SweepCountAccStruct entry;
if (base == null) {
activeCt = others.size();
entry = others.get(0);
} else {
activeCt = others.size() + 1;
entry = base;
}
if (activeCt == 1) {
final CountSlotAcc[] countAccs = new CountSlotAcc[] {entry.countAcc};
return new SweepIteratorAndCounts(new SingletonDocIterator(entry.docSet.iterator(), base != null), countAccs);
} else {
final DocIterator[] subIterators = new DocIterator[activeCt];
final CountSlotAcc[] countAccs = new CountSlotAcc[activeCt];
Iterator<SweepCountAccStruct> othersIter = others.iterator();
int i = 0;
for (;;) {
subIterators[i] = entry.docSet.iterator();
countAccs[i] = entry.countAcc;
if (++i == activeCt) {
break;
}
entry = othersIter.next();
}
return new SweepIteratorAndCounts(new UnionDocIterator(subIterators, base == null ? -1 : 0), countAccs);
}
}
@Override
public float score() {
throw new UnsupportedOperationException("Not supported.");
}
@Override
public Integer next() {
throw new UnsupportedOperationException("Not supported.");
}
@Override
public abstract int registerCounts(SegCounter segCounts); // override to not throw IOException
}

View File

@ -41,11 +41,15 @@ import org.apache.solr.index.SlowCompositeReaderWrapper;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import org.apache.solr.search.facet.SweepDocIterator.SweepIteratorAndCounts;
import org.apache.solr.uninverting.DocTermOrds;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;
@ -315,7 +319,7 @@ public class UnInvertedField extends DocTermOrds {
private void getCounts(FacetFieldProcessorByArrayUIF processor, SlotAcc.CountSlotAcc counts) throws IOException {
private void getCounts(FacetFieldProcessorByArrayUIF processor) throws IOException {
DocSet docs = processor.fcontext.base;
int baseSize = docs.size();
int maxDoc = searcher.maxDoc();
@ -325,9 +329,12 @@ public class UnInvertedField extends DocTermOrds {
return;
}
SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);
final int[] index = this.index;
boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet;
boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet && baseCountAccStruct != null;
if (doNegative) {
FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
@ -337,21 +344,34 @@ public class UnInvertedField extends DocTermOrds {
docs = new BitDocSet(bs, maxDoc - baseSize);
// simply negating will mean that we have deleted docs in the set.
// that should be OK, as their entries in our table should be empty.
baseCountAccStruct = new SweepCountAccStruct(baseCountAccStruct, docs);
}
// For the biggest terms, do straight set intersections
for (TopTerm tt : bigTerms.values()) {
// TODO: counts could be deferred if sorting by index order
counts.incrementCount(tt.termNum, searcher.numDocs(tt.termQuery, docs));
final int termOrd = tt.termNum;
Iterator<SweepCountAccStruct> othersIter = others.iterator();
SweepCountAccStruct entry = baseCountAccStruct != null ? baseCountAccStruct : othersIter.next();
for (;;) {
entry.countAcc.incrementCount(termOrd, searcher.numDocs(tt.termQuery, entry.docSet));
if (!othersIter.hasNext()) {
break;
}
entry = othersIter.next();
}
}
// TODO: we could short-circuit counting altogether for sorted faceting
// where we already have enough terms from the bigTerms
if (termInstances > 0) {
DocIterator iter = docs.iterator();
final SweepIteratorAndCounts iterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
final SweepDocIterator iter = iterAndCounts.iter;
final SegCountGlobal counts = new SegCountGlobal(iterAndCounts.countAccs);
while (iter.hasNext()) {
int doc = iter.nextDoc();
int maxIdx = iter.registerCounts(counts);
int code = index[doc];
if ((code & 0x80000000)!=0) {
@ -368,7 +388,7 @@ public class UnInvertedField extends DocTermOrds {
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
counts.incrementCount(tnum,1);
counts.incrementCount(tnum, 1, maxIdx);
}
} else {
int tnum = 0;
@ -378,7 +398,7 @@ public class UnInvertedField extends DocTermOrds {
if ((code & 0x80) == 0) {
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
counts.incrementCount(tnum,1);
counts.incrementCount(tnum, 1, maxIdx);
delta = 0;
}
code >>>= 8;
@ -388,9 +408,10 @@ public class UnInvertedField extends DocTermOrds {
}
if (doNegative) {
final CountSlotAcc baseCounts = processor.countAcc;
for (int i=0; i<numTermsInField; i++) {
// counts[i] = maxTermCounts[i] - counts[i];
counts.incrementCount(i, maxTermCounts[i] - (int) counts.getCount(i)*2);
baseCounts.incrementCount(i, maxTermCounts[i] - (int) baseCounts.getCount(i)*2);
}
}
@ -409,7 +430,7 @@ public class UnInvertedField extends DocTermOrds {
public void collectDocs(FacetFieldProcessorByArrayUIF processor) throws IOException {
if (processor.collectAcc==null && processor.allBucketsAcc == null && processor.startTermIndex == 0 && processor.endTermIndex >= numTermsInField) {
getCounts(processor, processor.countAcc);
getCounts(processor);
return;
}
@ -427,15 +448,22 @@ public class UnInvertedField extends DocTermOrds {
DocSet docs = processor.fcontext.base;
int uniqueTerms = 0;
final SlotAcc.CountSlotAcc countAcc = processor.countAcc;
final CountSlotAcc countAcc = processor.countAcc;
final SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);
for (TopTerm tt : bigTerms.values()) {
if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
// handle the biggest terms
DocSet intersection = searcher.getDocSet(tt.termQuery, docs);
DocSet termSet = searcher.getDocSet(tt.termQuery);
DocSet intersection = termSet.intersection(docs);
int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex,
slotNum -> { return new SlotContext(tt.termQuery); });
countAcc.incrementCount(tt.termNum - startTermIndex, collected);
final int termOrd = tt.termNum - startTermIndex;
countAcc.incrementCount(termOrd, collected);
for (SweepCountAccStruct entry : others) {
entry.countAcc.incrementCount(termOrd, termSet.intersectionSize(entry.docSet));
}
if (collected > 0) {
uniqueTerms++;
}
@ -455,9 +483,14 @@ public class UnInvertedField extends DocTermOrds {
// TODO: handle facet.prefix here!!!
DocIterator iter = docs.iterator();
SweepIteratorAndCounts sweepIterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
final SweepDocIterator iter = sweepIterAndCounts.iter;
final CountSlotAcc[] countAccs = sweepIterAndCounts.countAccs;
final SegCountGlobal counts = new SegCountGlobal(countAccs);
while (iter.hasNext()) {
int doc = iter.nextDoc();
int maxIdx = iter.registerCounts(counts);
boolean collectBase = iter.collectBase();
if (doc >= adjustedMax) {
do {
@ -495,8 +528,10 @@ public class UnInvertedField extends DocTermOrds {
int arrIdx = tnum - startTermIndex;
if (arrIdx < 0) continue;
if (arrIdx >= nTerms) break;
countAcc.incrementCount(arrIdx, 1);
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
counts.incrementCount(arrIdx, 1, maxIdx);
if (collectBase) {
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
}
}
} else {
int tnum = 0;
@ -509,8 +544,10 @@ public class UnInvertedField extends DocTermOrds {
int arrIdx = tnum - startTermIndex;
if (arrIdx >= 0) {
if (arrIdx >= nTerms) break;
countAcc.incrementCount(arrIdx, 1);
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
counts.incrementCount(arrIdx, 1, maxIdx);
if (collectBase) {
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
}
}
delta = 0;
}

View File

@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.PriorityQueue;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
final class UnionDISI extends SweepDISI {
final int maxIdx;
private final SubIterStruct baseSub;
private boolean collectBase;
private final PriorityQueue<SubIterStruct> queue;
private SubIterStruct top;
private int docId = -1;
private static final class SubIterStruct {
private final DocIdSetIterator sub;
private final int index;
private int docId;
public SubIterStruct(DocIdSetIterator sub, int index) throws IOException {
this.sub = sub;
this.index = index;
nextDoc();
}
public void nextDoc() throws IOException {
docId = sub.nextDoc();
}
}
UnionDISI(DocIdSetIterator[] subIterators, CountSlotAcc[] countAccs, int size, int baseIdx) throws IOException {
super(size, countAccs);
this.maxIdx = size - 1;
queue = new PriorityQueue<SubIterStruct>(size) {
@Override
protected boolean lessThan(SubIterStruct a, SubIterStruct b) {
return a.docId < b.docId;
}
};
int i = maxIdx;
SubIterStruct tmpBaseSub = null;
do {
final SubIterStruct subIterStruct = new SubIterStruct(subIterators[i], i);
queue.add(subIterStruct);
if (i == baseIdx) {
tmpBaseSub = subIterStruct;
}
} while (i-- > 0);
baseSub = tmpBaseSub;
top = queue.top();
}
@Override
public int nextDoc() throws IOException {
if (top.docId == docId) {
do {
top.nextDoc();
} while ((top = queue.updateTop()).docId == docId);
}
if (baseSub != null) {
collectBase = false;
}
return docId = top.docId;
}
@Override
public boolean collectBase() {
assert top.docId != docId : "must call registerCounts() before collectBase()";
return collectBase;
}
@Override
public int registerCounts(SegCounter segCounter) throws IOException {
int i = -1;
do {
if (!collectBase && top == baseSub) {
collectBase = true;
}
segCounter.map(top.index, ++i);
top.nextDoc();
} while ((top = queue.updateTop()).docId == docId);
return i;
}
}

View File

@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.solr.search.DocIterator;
import org.apache.lucene.util.PriorityQueue;
final class UnionDocIterator extends SweepDocIterator {
private final int maxIdx;
private final SubIterStruct baseSub;
private boolean collectBase;
private final PriorityQueue<SubIterStruct> queue;
private SubIterStruct top;
private int docId = -1;
private static final class SubIterStruct {
private final DocIterator sub;
private final int index;
private int docId;
public SubIterStruct(DocIterator sub, int index) throws IOException {
this.sub = sub;
this.index = index;
nextDoc();
}
public void nextDoc() {
docId = sub.hasNext() ? sub.nextDoc() : DocIdSetIterator.NO_MORE_DOCS;
}
}
UnionDocIterator(DocIterator[] subIterators, int baseIdx) throws IOException {
super(subIterators.length);
this.maxIdx = size - 1;
queue = new PriorityQueue<SubIterStruct>(size) {
@Override
protected boolean lessThan(SubIterStruct a, SubIterStruct b) {
return a.docId < b.docId;
}
};
SubIterStruct tmpBase = null;
int i = maxIdx;
do {
SubIterStruct subIterStruct = new SubIterStruct(subIterators[i], i);
queue.add(subIterStruct);
if (i == baseIdx) {
tmpBase = subIterStruct;
}
} while (i-- > 0);
this.baseSub = tmpBase;
top = queue.top();
}
@Override
public int nextDoc() {
if (top.docId == docId) {
do {
top.nextDoc();
} while ((top = queue.updateTop()).docId == docId);
}
collectBase = false;
return docId = top.docId;
}
@Override
public boolean hasNext() {
if (top.docId == docId) {
do {
top.nextDoc();
} while ((top = queue.updateTop()).docId == docId);
}
return top.docId != DocIdSetIterator.NO_MORE_DOCS;
}
@Override
public boolean collectBase() {
assert top.docId != docId : "must call registerCounts() before collectBase()";
return collectBase;
}
@Override
public int registerCounts(SegCounter segCounts) {
int i = -1;
do {
if (!collectBase && top == baseSub) {
collectBase = true;
}
segCounts.map(top.index, ++i);
top.nextDoc();
} while ((top = queue.updateTop()).docId == docId);
return i;
}
}

View File

@ -48,6 +48,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import static org.apache.solr.search.facet.FacetField.FacetMethod;
import static org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc.SWEEP_COLLECTION_DEBUG_KEY;
import org.noggit.JSONUtil;
import org.noggit.JSONWriter;
@ -58,12 +59,11 @@ import org.junit.BeforeClass;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* <p>
* A randomized test of nested facets using the <code>relatedness()</code> function, that asserts the
* results are consistent and equivilent regardless of what <code>method</code> (ie: FacetFieldProcessor)
* is requested.
* and/or <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> option is requested.
* </p>
* <p>
* This test is based on {@link TestCloudJSONFacetSKG} but does <em>not</em>
@ -281,6 +281,212 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
}
}
/**
* Sanity check that our method of varying the <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> in conjunction with the
* <code>method</code> params works and can be verified by inspecting the debug output of basic requests.
*/
public void testWhiteboxSanitySweepDebug() throws Exception {
// NOTE: json.facet debugging output can be wonky, particularly when dealing with cloud
// so for these queries we keep it simple:
// - only one "top" facet per request
// - no refinement
// even with those constraints in place, a single facet can (may/sometimes?) produce multiple debug
// blocks - aparently due to shard merging? So...
// - only inspect the "first" debug NamedList in the results
//
final SolrParams baseParams = params("rows","0",
"debug","true", // SOLR-14451
// *:* is the only "safe" query for this test,
// to ensure we always have at least one bucket for every facet
// so we can be confident in getting the debug we expect...
"q", "*:*",
"fore", multiStrField(7)+":11",
"back", "*:*");
// simple individual facet that sorts on an skg stat...
//
// all results we test should be the same even if there is another 'skg_extra' stat,
// it shouldn't be involved in the sweeping at all.
for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
// choose a single value string so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
final TermFacet f = new TermFacet(soloStrField(9), 10, 0, "skg desc", null);
if (null != extra) {
f.subFacets.put("skg_extra", extra);
}
final Map<String,TermFacet> facets = new LinkedHashMap<>();
facets.put("str", f);
final SolrParams facetParams
= SolrParams.wrapDefaults(params("method_val", "dv",
"json.facet", Facet.toJSONFacetParamValue(facets)),
baseParams);
// both default sweep option and explicit sweep should give same results...
for (SolrParams sweepParams : Arrays.asList(params(),
params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
"sweep_val", "true"))) {
final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);
final NamedList<Object> debug = getFacetDebug(params);
assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
@SuppressWarnings("unchecked")
final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
assertNotNull(sweep_debug);
assertEquals("count", sweep_debug.get("base"));
assertEquals(Arrays.asList("skg!fg","skg!bg"), sweep_debug.get("accs"));
assertEquals(Arrays.asList("skg"), sweep_debug.get("mapped"));
}
{ // 'dv' will always *try* to sweep, but disabling on stat should mean debug is mostly empty...
final SolrParams params = SolrParams.wrapDefaults(params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
"sweep_val", "false"),
facetParams);
final NamedList<Object> debug = getFacetDebug(params);
assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
@SuppressWarnings("unchecked")
final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
assertNotNull(sweep_debug);
assertEquals("count", sweep_debug.get("base"));
assertEquals(Collections.emptyList(), sweep_debug.get("accs"));
assertEquals(Collections.emptyList(), sweep_debug.get("mapped"));
}
{ // if we override 'dv' with 'hashdv' which doesn't sweep, our sweep debug should be empty,
// even if the skg stat does ask for sweeping explicitly...
final SolrParams params = SolrParams.wrapDefaults(params("method_val", "dvhash",
"sweep_key", RelatednessAgg.SWEEP_COLLECTION,
"sweep_val", "true"),
facetParams);
final NamedList<Object> debug = getFacetDebug(params);
assertEquals(FacetFieldProcessorByHashDV.class.getSimpleName(), debug.get("processor"));
assertNull(debug.get(SWEEP_COLLECTION_DEBUG_KEY));
}
}
// simple facet that sorts on an skg stat but uses prelim_sort on count
//
// all results we test should be the same even if there is another 'skg_extra' stat,
// neither skg should be involved in the sweeping at all.
for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
// choose a single value string so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
final TermFacet f = new TermFacet(soloStrField(9), map("limit", 3, "overrequest", 0,
"sort", "skg desc",
"prelim_sort", "count asc"));
if (null != extra) {
f.subFacets.put("skg_extra", extra);
}
final Map<String,TermFacet> facets = new LinkedHashMap<>();
facets.put("str", f);
final SolrParams facetParams
= SolrParams.wrapDefaults(params("method_val", "dv",
"json.facet", Facet.toJSONFacetParamValue(facets)),
baseParams);
// default sweep as well as any explicit sweep=true/false values should give same results: no sweeping
for (SolrParams sweepParams : Arrays.asList(params(),
params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
"sweep_val", "false"),
params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
"sweep_val", "true"))) {
final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);
final NamedList<Object> debug = getFacetDebug(params);
assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
@SuppressWarnings("unchecked")
final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
assertNotNull(sweep_debug);
assertEquals("count", sweep_debug.get("base"));
assertEquals(Collections.emptyList(), sweep_debug.get("accs"));
assertEquals(Collections.emptyList(), sweep_debug.get("mapped"));
}
}
{ // single facet with infinite limit + multiple skgs...
// this should trigger MultiAcc collection, causing sweeping on both skg functions
//
// all results we test should be the same even if there is another 'min' stat,
// in each term facet. it shouldn't affect the sweeping/MultiAcc at all.
for (Facet extra : Arrays.asList(null, new SumFacet(multiIntField(2)))) {
final Map<String,TermFacet> facets = new LinkedHashMap<>();
final TermFacet facet = new TermFacet(soloStrField(9), -1, 0, "skg2 desc", null);
facet.subFacets.put("skg2", new RelatednessFacet(multiStrField(2)+":9", null));
if (null != extra) {
facet.subFacets.put("sum", extra);
}
facets.put("str", facet);
final SolrParams facetParams
= SolrParams.wrapDefaults(params("method_val", "dv",
"json.facet", Facet.toJSONFacetParamValue(facets)),
baseParams);
// both default sweep option and explicit sweep should give same results...
for (SolrParams sweepParams : Arrays.asList(params(),
params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
"sweep_val", "true"))) {
final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);
final NamedList<Object> debug = getFacetDebug(params);
assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
@SuppressWarnings("unchecked")
final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
assertNotNull(sweep_debug);
assertEquals("count", sweep_debug.get("base"));
assertEquals(Arrays.asList("skg!fg","skg!bg","skg2!fg","skg2!bg"), sweep_debug.get("accs"));
assertEquals(Arrays.asList("skg","skg2"), sweep_debug.get("mapped"));
}
}
}
// nested facets that both sort on an skg stat
// (set limit + overrequest tiny to keep multishard response managable)
//
// all results we test should be the same even if there is another 'skg_extra' stat,
// in each term facet. they shouldn't be involved in the sweeping at all.
for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
// choose single value strings so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
// choose 'id' for the parent facet so we are garunteed some child facets
final TermFacet parent = new TermFacet("id", 1, 0, "skg desc", false);
final TermFacet child = new TermFacet(soloStrField(7), 1, 0, "skg desc", false);
parent.subFacets.put("child", child);
if (null != extra) {
parent.subFacets.put("skg_extra", extra);
child.subFacets.put("skg_extra", extra);
}
final Map<String,TermFacet> facets = new LinkedHashMap<>();
facets.put("parent", parent);
final SolrParams facetParams
= SolrParams.wrapDefaults(params("method_val", "dv",
"json.facet", Facet.toJSONFacetParamValue(facets)),
baseParams);
// both default sweep option and explicit sweep should give same results...
for (SolrParams sweepParams : Arrays.asList(params(),
params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
"sweep_val", "true"))) {
final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);
final NamedList<Object> parentDebug = getFacetDebug(params);
assertEquals("id", parentDebug.get("field"));
assertNotNull(parentDebug.get("sub-facet"));
// may be multiples from diff shards, just use first one
@SuppressWarnings("unchecked")
final NamedList<Object> childDebug = ((List<NamedList<Object>>)parentDebug.get("sub-facet")).get(0);
assertEquals(soloStrField(7), childDebug.get("field"));
// these should all be true for both the parent and the child debug..
for (NamedList<Object> debug : Arrays.asList(parentDebug, childDebug)) {
assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
@SuppressWarnings("unchecked")
final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
assertNotNull(sweep_debug);
assertEquals("count", sweep_debug.get("base"));
assertEquals(Arrays.asList("skg!fg","skg!bg"), sweep_debug.get("accs"));
assertEquals(Arrays.asList("skg"), sweep_debug.get("mapped"));
}
}
}
}
/**
* returns the <b>FIRST</b> NamedList (under the implicit 'null' FacetQuery) in the "facet-trace" output
* of the request. Should not be used with multiple "top level" facets
@ -358,7 +564,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
}
}
{ // multi-valued facet field w/infinite limit and an extra (non-SKG) stat
{ // multi-valued facet field w/infinite limit and an extra (non-SKG / non-sweeping) stat
final TermFacet xxx = new TermFacet(multiStrField(12), -1, 0, "count asc", false);
xxx.subFacets.put("sum", new SumFacet(multiIntField(4)));
final Map<String,TermFacet> facets = new LinkedHashMap<>();
@ -414,7 +620,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
for (int limit : Arrays.asList(10, -1)) {
for (String sort : Arrays.asList("count desc", "skg desc", "index asc")) {
for (Boolean refine : Arrays.asList(false, true)) {
{ // 1 additional (non-SKG) stat
{ // 1 additional (non-SKG / non-sweeping) stat
final TermFacet xxx = new TermFacet(facetFieldName, map("limit", limit,
"overrequest", 0,
"sort", sort,
@ -440,7 +646,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
multiStrField(0) + ":46"),
multiStrField(5)+":9", "*:*");
}
{ // multiple SKGs and a multiple non-SKG stats
{ // multiple SKGs and a multiple non-SKG / non-sweeping stats
final TermFacet xxx = new TermFacet(facetFieldName, map("limit", limit,
"overrequest", 0,
"sort", sort,
@ -504,6 +710,8 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
/**
* Given a set of term facets, and top level query strings, asserts that
* the results of these queries are identical even when varying the <code>method_val</code> param
* and when varying the <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> param; either by explicitly setting to
* <code>true</code> or <code>false</code> or by changing the param key to not set it at all.
*/
private void assertFacetSKGsAreConsistent(final Map<String,TermFacet> facets,
final String query,
@ -520,27 +728,33 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
@SuppressWarnings({"rawtypes"})
final NamedList expected = getFacetResponse(basicParams);
// now loop over all processors and compare them to the "default"...
// now loop over all permutations of processors and sweep values and and compare them to the "default"...
for (FacetMethod method : EnumSet.allOf(FacetMethod.class)) {
ModifiableSolrParams options = params("method_val", method.toString().toLowerCase(Locale.ROOT));
for (Boolean sweep : Arrays.asList(true, false, null)) {
final ModifiableSolrParams options = params("method_val", method.toString().toLowerCase(Locale.ROOT));
if (null != sweep) {
options.add("sweep_key", RelatednessAgg.SWEEP_COLLECTION);
options.add("sweep_val", sweep.toString());
}
@SuppressWarnings({"rawtypes"})
final NamedList actual = getFacetResponse(SolrParams.wrapAppended(options, basicParams));
@SuppressWarnings({"rawtypes"})
final NamedList actual = getFacetResponse(SolrParams.wrapAppended(options, basicParams));
// we can't rely on a trivial assertEquals() comparison...
//
// the order of the sub-facet keys can change between
// processors. (notably: method:enum vs method:smart when sort:"index asc")
//
// NOTE: this doesn't ignore the order of the buckets,
// it ignores the order of the keys in each bucket...
final String pathToMismatch = BaseDistributedSearchTestCase.compare
(expected, actual, 0,
Collections.singletonMap("buckets", BaseDistributedSearchTestCase.UNORDERED));
if (null != pathToMismatch) {
log.error("{}: expected = {}", options, expected);
log.error("{}: actual = {}", options, actual);
fail("Mismatch: " + pathToMismatch + " using " + options);
// we can't rely on a trivial assertEquals() comparison...
//
// the order of the sub-facet keys can change between
// processors. (notably: method:enum vs method:smart when sort:"index asc")
//
// NOTE: this doesn't ignore the order of the buckets,
// it ignores the order of the keys in each bucket...
final String pathToMismatch = BaseDistributedSearchTestCase.compare
(expected, actual, 0,
Collections.singletonMap("buckets", BaseDistributedSearchTestCase.UNORDERED));
if (null != pathToMismatch) {
log.error("{}: expected = {}", options, expected);
log.error("{}: actual = {}", options, actual);
fail("Mismatch: " + pathToMismatch + " using " + options);
}
}
}
} catch (AssertionError e) {
@ -617,6 +831,10 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
* unless they are 'null' in which case <code>$fore</code> and <code>$back</code> refs will be used
* in their place, and must be set as request params (this allows "random" facets to still easily
* trigger the "nested facets re-using the same fore/back set for SKG situation)
*
* The JSON for all of these facets includes a <code>${sweep_key:xxx}</code> (which will be ignored
* by default) and <code>${sweep_val:yyy}</code> which may be set as params on each request to override the
* implicit default sweeping behavior of the underlying SKGAcc.
*/
private static final class RelatednessFacet implements Facet, Writable {
public final Map<String,Object> jsonData = new LinkedHashMap<>();
@ -641,7 +859,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
// we don't allow these to be overridden by options, so set them now...
jsonData.put("type", "func");
jsonData.put("func", "relatedness("+f+","+b+")");
jsonData.put("${sweep_key:xxx}","${sweep_val:yyy}");
}
@Override
public void write(JSONWriter writer) {

View File

@ -763,6 +763,54 @@ public class TestJsonFacets extends SolrTestCaseHS {
+ " } } ] } }");
}
@Test
public void testSKGSweepMultiAcc() throws Exception {
Client client = Client.localClient();
indexSimple(client);
// simple single level facet w/skg & trivial non-sweeping stat using various sorts & (re)sorting
for (String sort : Arrays.asList("sort:'index asc'",
"sort:'y desc'",
"sort:'z desc'",
"sort:'skg desc'",
"prelim_sort:'count desc', sort:'index asc'",
"prelim_sort:'count desc', sort:'y desc'",
"prelim_sort:'count desc', sort:'z desc'",
"prelim_sort:'count desc', sort:'skg desc'")) {
// the relatedness score of each of our cat_s values is (conviniently) also alphabetical order,
// (and the same order as 'sum(num_i) desc' & 'min(num_i) desc')
//
// So all of these re/sort options should produce identical output
// - Testing "index" sort allows the randomized use of "stream" processor as default to be tested.
// - Testing (re)sorts on other stats sanity checks code paths where relatedness() is a "defered" Agg
for (String sweep : Arrays.asList("true", "false")) {
// results should be the same even if we disable sweeping...
assertJQ(req("q", "cat_s:[* TO *]", "rows", "0",
"fore", "where_s:NY", "back", "*:*",
"json.facet", ""
+ "{x: { type: terms, field: 'cat_s', "+sort+", limit:-1, "
+ " facet: { skg: { type: 'func', func:'relatedness($fore,$back)', "
+" "+RelatednessAgg.SWEEP_COLLECTION+": "+sweep+" },"
+ " y:'sum(num_i)', "
+" z:'min(num_i)' } } }")
, "facets=={count:5, x:{ buckets:["
+ " { val:'A', count:2, y:5.0, z:2, "
+ " skg : { relatedness: 0.00554, "
+ " foreground_popularity: 0.16667,"
+ " background_popularity: 0.33333, },"
+ " }, "
+ " { val:'B', count:3, y:-3.0, z:-5, "
+ " skg : { relatedness: 0.0, " // perfectly average and uncorrolated
+ " foreground_popularity: 0.16667,"
+ " background_popularity: 0.5 },"
+ " } ] } } "
);
}
}
}
@Test
public void testRepeatedNumerics() throws Exception {
Client client = Client.localClient();

View File

@ -919,6 +919,10 @@ NOTE: While it's very common to define the Background Set as `\*:*`, or some oth
When using the extended `type:func` syntax for specifying a `relatedness()` aggregation, an optional `min_popularity` (float) option can be used to specify a lower bound on the `foreground_popularity` and `background_popularity` values, that must be met in order for the `relatedness` score to be valid -- If this `min_popularity` is not met, then the `relatedness` score will be `-Infinity`.
The default implementation for calculating `relatedness()` domain correlation depends on the type of facet being calculated. Generic domain correlation is calculated per-term, by selectively retrieving a DocSet for each bucket-associated query (consulting the `filterCache`) and calculating DocSet intersections with "foreground" and "background" sets. For term facets (especially over high-cardinality fields) this approach can lead to `filterCache` thrashing; accordingly, `relatedness()` over term facets defaults where possible to an approach that collects facet counts directly over all multiple domains in a single sweep (never touching the `filterCache`). It is possible to explicitly control this "single sweep" collection by setting the extended `type:func` syntax `sweep_collection` option to `true` (the default) or `false` (to disable sweep collection).
NOTE: Disabling sweep collection for `relatedness()` stats over low-cardinality fields may yield a performance benefit, provided the `filterCache` is sufficiently large to accommodate an entry for each value in the associated field without inducing thrashing for anticipated use patterns. A reasonable heuristic is that fields of cardinality less than 1,000 _may_ benefit from disabling sweep. This heuristic is _not_ used to determine default behavior, particularly because non-sweep collection can so easily induce `filterCache` thrashing, with system-wide detrimental effects.
[source,json]
----
{ "type": "func",