mirror of https://github.com/apache/lucene.git
SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()"
This adds a lot of "under the covers" improvements to how JSON Faceting FacetField processors work, to enable "sweeping" support when the SlotAcc used for sorting support it (currently just "relatedness()") This is a squash commit of all changes on https://github.com/magibney/lucene-solr/tree/SOLR-13132 Up to and including ca7a8e0b39840d00af9022c048346a7d84bf280d. Co-authored-by: Chris Hostetter <hossman@apache.org> Co-authored-by: Michael Gibney <michael@michaelgibney.net>
This commit is contained in:
parent
5a422db60e
commit
40e2122b5a
|
@ -129,6 +129,9 @@ Optimizations
|
|||
|
||||
* SOLR-14610: ReflectMapWriter to use MethodHandle instead of old reflection (noble)
|
||||
|
||||
* SOLR-13132: JSON Facet perf improvements to support "sweeping" collection of "relatedness()"
|
||||
(hossman, Michael Gibney)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
(No changes)
|
||||
|
|
|
@ -33,6 +33,7 @@ import java.util.function.IntFunction;
|
|||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
|
@ -40,6 +41,8 @@ import org.apache.solr.schema.FieldType;
|
|||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.search.DocSet;
|
||||
import org.apache.solr.search.facet.SlotAcc.SlotContext;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
|
||||
|
||||
import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
|
||||
|
||||
|
@ -116,7 +119,6 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
|||
// allow a custom count acc to be used
|
||||
if (countAcc == null) {
|
||||
countAcc = new SlotAcc.CountSlotArrAcc(fcontext, slotCount);
|
||||
countAcc.key = "count";
|
||||
}
|
||||
|
||||
if (accs != null) {
|
||||
|
@ -509,12 +511,12 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
|||
/** Helper method used solely when looping over buckets to be returned in findTopSlots */
|
||||
private void fillBucketFromSlot(SimpleOrderedMap<Object> target, Slot slot,
|
||||
SlotAcc resortAcc) throws IOException {
|
||||
final long count = countAcc.getCount(slot.slot);
|
||||
target.add("count", count);
|
||||
if (count <= 0 && !freq.processEmpty) return;
|
||||
final int slotOrd = slot.slot;
|
||||
countAcc.setValues(target, slotOrd);
|
||||
if (countAcc.getCount(slotOrd) <= 0 && !freq.processEmpty) return;
|
||||
|
||||
if (collectAcc != null && slot.slot >= 0) {
|
||||
collectAcc.setValues(target, slot.slot);
|
||||
if (slotOrd >= 0 && collectAcc != null) {
|
||||
collectAcc.setValues(target, slotOrd);
|
||||
}
|
||||
|
||||
if (otherAccs == null && freq.subFacets.isEmpty()) return;
|
||||
|
@ -689,7 +691,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
|||
}
|
||||
}
|
||||
|
||||
static class MultiAcc extends SlotAcc {
|
||||
static class MultiAcc extends SlotAcc implements SweepableSlotAcc<SlotAcc> {
|
||||
final SlotAcc[] subAccs;
|
||||
|
||||
MultiAcc(FacetContext fcontext, SlotAcc[] subAccs) {
|
||||
|
@ -741,6 +743,65 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
|||
acc.setValues(bucket, slotNum);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) {
|
||||
final FacetFieldProcessor p = (FacetFieldProcessor) fcontext.processor;
|
||||
int j = 0;
|
||||
for (int i = 0; i < subAccs.length; i++) {
|
||||
final SlotAcc acc = subAccs[i];
|
||||
if (acc instanceof SweepableSlotAcc) {
|
||||
SlotAcc replacement = ((SweepableSlotAcc<?>)acc).registerSweepingAccs(baseSweepingAcc);
|
||||
if (replacement == null) {
|
||||
// drop acc, do not increment j
|
||||
continue;
|
||||
} else if (replacement != acc || j < i) {
|
||||
subAccs[j] = replacement;
|
||||
}
|
||||
} else if (j < i) {
|
||||
subAccs[j] = acc;
|
||||
}
|
||||
j++;
|
||||
}
|
||||
switch (j) {
|
||||
case 0:
|
||||
return null;
|
||||
case 1:
|
||||
return subAccs[0];
|
||||
default:
|
||||
if (j == subAccs.length) {
|
||||
return this;
|
||||
} else {
|
||||
// must resize final field subAccs
|
||||
return new MultiAcc(fcontext, ArrayUtil.copyOfSubArray(subAccs, 0, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method that subclasses can use to indicate they with to use sweeping.
|
||||
* If {@link #countAcc} and {@link #collectAcc} support sweeping, then this method will:
|
||||
* <ul>
|
||||
* <li>replace {@link #collectAcc} with it's sweeping equivalent</li>
|
||||
* <li>update {@link #allBucketsAcc}'s reference to {@link #collectAcc} (if it exists)</li>
|
||||
* </ul>
|
||||
*
|
||||
* @return true if the above actions were taken
|
||||
* @see SweepableSlotAcc
|
||||
* @see SweepingCountSlotAcc
|
||||
*/
|
||||
protected boolean registerSweepingAccIfSupportedByCollectAcc() {
|
||||
if (countAcc instanceof SweepingCountSlotAcc && collectAcc instanceof SweepableSlotAcc) {
|
||||
final SweepingCountSlotAcc sweepingCountAcc = (SweepingCountSlotAcc)countAcc;
|
||||
collectAcc = ((SweepableSlotAcc<?>)collectAcc).registerSweepingAccs(sweepingCountAcc);
|
||||
if (allBucketsAcc != null) {
|
||||
allBucketsAcc.collectAcc = collectAcc;
|
||||
allBucketsAcc.sweepingCountAcc = sweepingCountAcc;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static final SlotContext ALL_BUCKETS_SLOT_CONTEXT = new SlotContext(null) {
|
||||
|
@ -766,6 +827,7 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
|||
int collectAccSlot;
|
||||
int otherAccsSlot;
|
||||
long count;
|
||||
SweepingCountSlotAcc sweepingCountAcc; // null unless/until sweeping is initialized
|
||||
|
||||
SpecialSlotAcc(FacetContext fcontext, SlotAcc collectAcc, int collectAccSlot, SlotAcc[] otherAccs, int otherAccsSlot) {
|
||||
super(fcontext);
|
||||
|
@ -822,6 +884,9 @@ abstract class FacetFieldProcessor extends FacetProcessor<FacetField> {
|
|||
|
||||
@Override
|
||||
public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
|
||||
if (sweepingCountAcc != null) {
|
||||
sweepingCountAcc.setSweepValues(bucket, collectAccSlot);
|
||||
}
|
||||
if (collectAcc != null) {
|
||||
collectAcc.setValues(bucket, collectAccSlot);
|
||||
}
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.search.Query;
|
|||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.search.facet.SlotAcc.SlotContext;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
|
||||
|
||||
import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
|
||||
|
||||
|
@ -34,6 +35,9 @@ import static org.apache.solr.search.facet.FacetContext.SKIP_FACET;
|
|||
* Base class for DV/UIF accumulating counts into an array by ordinal. It's
|
||||
* for {@link org.apache.lucene.index.SortedDocValues} and {@link org.apache.lucene.index.SortedSetDocValues} only.
|
||||
* It can handle terms (strings), not numbers directly but those encoded as terms, and is multi-valued capable.
|
||||
* By default, this class assumes subclasses can support sweeping collection unless subclasses initialize <code>countAcc</code> directly in their constructors.
|
||||
*
|
||||
* @see SweepingCountSlotAcc
|
||||
*/
|
||||
abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
|
||||
BytesRefBuilder prefixRef;
|
||||
|
@ -56,6 +60,34 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
|
|||
/** this BytesRef may be shared across calls and should be deep-cloned if necessary */
|
||||
abstract protected BytesRef lookupOrd(int ord) throws IOException;
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* This impl first initializes <code>countAcc</code> as a {@link SweepingCountSlotAcc} if null.
|
||||
*/
|
||||
@Override
|
||||
protected void createAccs(long docCount, int slotCount) throws IOException {
|
||||
if (countAcc == null) {
|
||||
countAcc = new SweepingCountSlotAcc(slotCount, this);
|
||||
}
|
||||
super.createAccs(docCount, slotCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* This impl first initializes <code>countAcc</code> as a {@link SweepingCountSlotAcc} if null.
|
||||
*/
|
||||
@Override
|
||||
void createCollectAcc(int numDocs, int numSlots) throws IOException {
|
||||
if (countAcc == null) {
|
||||
countAcc = new SweepingCountSlotAcc(numSlots, this);
|
||||
}
|
||||
super.createCollectAcc(numDocs, numSlots);
|
||||
registerSweepingAccIfSupportedByCollectAcc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process() throws IOException {
|
||||
super.process();
|
||||
|
@ -87,8 +119,10 @@ abstract class FacetFieldProcessorByArray extends FacetFieldProcessor {
|
|||
if (freq.allBuckets) {
|
||||
// count is irrelevant, but hardcoded in collect(...), so intercept/mask normal counts.
|
||||
// Set here to prevent createAccs(...) from creating a 1-slot countAcc that will fail with AIOOBE
|
||||
// NOTE: because collectAcc will be null, it is fine/irrelevant to set a countAcc that doesn't support sweeping
|
||||
countAcc = SlotAcc.DEV_NULL_SLOT_ACC;
|
||||
createAccs(nDocs, 1);
|
||||
assert collectAcc == null;
|
||||
otherAccs = accs; // accs is created above and set on allBucketsAcc; but during collection, setNextReader is called on otherAccs.
|
||||
allBucketsAcc = new SpecialSlotAcc(fcontext, null, -1, accs, 0);
|
||||
collectDocs();
|
||||
|
|
|
@ -26,14 +26,17 @@ import org.apache.lucene.index.MultiDocValues;
|
|||
import org.apache.lucene.index.OrdinalMap;
|
||||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LongValues;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.search.Filter;
|
||||
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
|
||||
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
|
||||
import org.apache.solr.search.facet.SweepCountAware.SegCountPerSeg;
|
||||
import org.apache.solr.uninverting.FieldCacheImpl;
|
||||
|
||||
/**
|
||||
|
@ -94,6 +97,10 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
return;
|
||||
}
|
||||
|
||||
final SweepCountAccStruct base = SweepingCountSlotAcc.baseStructOf(this);
|
||||
final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(this);
|
||||
assert null != base;
|
||||
|
||||
// TODO: refactor some of this logic into a base class
|
||||
boolean countOnly = collectAcc==null && allBucketsAcc==null;
|
||||
boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();
|
||||
|
@ -118,16 +125,21 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
|
||||
if (freq.perSeg != null) accumSeg = canDoPerSeg && freq.perSeg; // internal - override perSeg heuristic
|
||||
|
||||
final int maxSize = others.size() + 1; // others + base
|
||||
final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
|
||||
Filter filter = fcontext.base.getTopFilter();
|
||||
final DocIdSetIterator[] subIterators = new DocIdSetIterator[maxSize];
|
||||
final CountSlotAcc[] activeCountAccs = new CountSlotAcc[maxSize];
|
||||
|
||||
for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
|
||||
LeafReaderContext subCtx = leaves.get(subIdx);
|
||||
|
||||
setNextReaderFirstPhase(subCtx);
|
||||
|
||||
DocIdSet dis = filter.getDocIdSet(subCtx, null); // solr docsets already exclude any deleted docs
|
||||
DocIdSetIterator disi = dis.iterator();
|
||||
final SweepDISI disi = SweepDISI.newInstance(base, others, subIterators, activeCountAccs, subCtx);
|
||||
if (disi == null) {
|
||||
continue;
|
||||
}
|
||||
LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
|
||||
|
||||
SortedDocValues singleDv = null;
|
||||
SortedSetDocValues multiDv = null;
|
||||
|
@ -135,8 +147,14 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
// TODO: get sub from multi?
|
||||
multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
|
||||
if (multiDv == null) {
|
||||
if (countOnly) {
|
||||
continue;
|
||||
} else {
|
||||
multiDv = DocValues.emptySortedSet();
|
||||
}
|
||||
} else if (countOnly && multiDv.getValueCount() < 1){
|
||||
continue;
|
||||
}
|
||||
// some codecs may optimize SortedSet storage for single-valued fields
|
||||
// this will be null if this is not a wrapped single valued docvalues.
|
||||
if (unwrap_singleValued_multiDv) {
|
||||
|
@ -145,11 +163,15 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
} else {
|
||||
singleDv = subCtx.reader().getSortedDocValues(sf.getName());
|
||||
if (singleDv == null) {
|
||||
if (countOnly) {
|
||||
continue;
|
||||
} else {
|
||||
singleDv = DocValues.emptySorted();
|
||||
}
|
||||
} else if (countOnly && singleDv.getValueCount() < 1) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);
|
||||
|
||||
if (singleDv != null) {
|
||||
if (accumSeg) {
|
||||
|
@ -174,7 +196,7 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
}
|
||||
}
|
||||
|
||||
reuse = null; // better GC
|
||||
Arrays.fill(reuse, null); // better GC
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -182,9 +204,9 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
return si.lookupOrd(ord);
|
||||
}
|
||||
|
||||
private void collectPerSeg(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
|
||||
int segMax = singleDv.getValueCount() + 1;
|
||||
final int[] counts = getCountArr( segMax );
|
||||
private void collectPerSeg(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
|
||||
int segMax = singleDv.getValueCount();
|
||||
final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax);
|
||||
|
||||
/** alternate trial implementations
|
||||
// ord
|
||||
|
@ -202,73 +224,110 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) {
|
||||
FieldCacheImpl.SortedDocValuesImpl.Iter fc = (FieldCacheImpl.SortedDocValuesImpl.Iter) singleDv;
|
||||
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
counts[fc.getOrd(doc) + 1]++;
|
||||
final int segOrd = fc.getOrd(doc);
|
||||
if (segOrd >= 0) {
|
||||
final int maxIdx = disi.registerCounts(segCounter);
|
||||
segCounter.incrementCount(segOrd, 1, maxIdx);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
if (singleDv.advanceExact(doc)) {
|
||||
counts[singleDv.ordValue() + 1]++;
|
||||
final int segOrd = singleDv.ordValue();
|
||||
if (segOrd >= 0) {
|
||||
final int maxIdx = disi.registerCounts(segCounter);
|
||||
segCounter.incrementCount(segOrd, 1, maxIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// convert segment-local counts to global counts
|
||||
for (int i=1; i<segMax; i++) {
|
||||
int segCount = counts[i];
|
||||
if (segCount > 0) {
|
||||
int slot = toGlobal == null ? (i - 1) : (int) toGlobal.get(i - 1);
|
||||
countAcc.incrementCount(slot, segCount);
|
||||
}
|
||||
}
|
||||
segCounter.register(disi.countAccs, toGlobal, segMax - 1);
|
||||
}
|
||||
|
||||
private void collectPerSeg(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
|
||||
private SegCountPerSeg getSegCountPerSeg(SweepDISI disi, int segMax) {
|
||||
final int size = disi.size;
|
||||
return new SegCountPerSeg(getSegmentCountArrays(segMax, size), getBoolArr(segMax), segMax, size);
|
||||
}
|
||||
|
||||
private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedDocValues dv) {
|
||||
return new SegCountGlobal(disi.countAccs);
|
||||
}
|
||||
|
||||
private SegCountGlobal getSegCountGlobal(SweepDISI disi, SortedSetDocValues dv) {
|
||||
return new SegCountGlobal(disi.countAccs);
|
||||
}
|
||||
|
||||
private void collectPerSeg(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
|
||||
int segMax = (int)multiDv.getValueCount();
|
||||
final int[] counts = getCountArr( segMax );
|
||||
final SegCountPerSeg segCounter = getSegCountPerSeg(disi, segMax);
|
||||
|
||||
int doc;
|
||||
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
if (multiDv.advanceExact(doc)) {
|
||||
for(;;) {
|
||||
final int maxIdx = disi.registerCounts(segCounter);
|
||||
for (;;) {
|
||||
int segOrd = (int)multiDv.nextOrd();
|
||||
if (segOrd < 0) break;
|
||||
counts[segOrd]++;
|
||||
segCounter.incrementCount(segOrd, 1, maxIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<segMax; i++) {
|
||||
int segCount = counts[i];
|
||||
if (segCount > 0) {
|
||||
int slot = toGlobal == null ? (i) : (int) toGlobal.get(i);
|
||||
countAcc.incrementCount(slot, segCount);
|
||||
}
|
||||
}
|
||||
segCounter.register(disi.countAccs, toGlobal, segMax - 1);
|
||||
}
|
||||
|
||||
private int[] reuse;
|
||||
private int[] getCountArr(int maxNeeded) {
|
||||
if (reuse == null) {
|
||||
private boolean[] reuseBool;
|
||||
private boolean[] getBoolArr(int maxNeeded) {
|
||||
if (reuseBool == null) {
|
||||
// make the count array large enough for any segment
|
||||
// FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
|
||||
reuse = new int[(int) si.getValueCount() + 1];
|
||||
reuseBool = new boolean[(int) si.getValueCount() + 1];
|
||||
} else {
|
||||
Arrays.fill(reuse, 0, maxNeeded, 0);
|
||||
Arrays.fill(reuseBool, 0, maxNeeded, false);
|
||||
}
|
||||
return reuse;
|
||||
return reuseBool;
|
||||
}
|
||||
|
||||
private void collectDocs(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
|
||||
private int[][] reuse = new int[12][];
|
||||
private int[] getCountArr(int maxNeeded, int idx) {
|
||||
if (idx >= reuse.length) {
|
||||
reuse = Arrays.copyOf(reuse, idx + 1);
|
||||
}
|
||||
if (reuse[idx] == null) {
|
||||
// make the count array large enough for any segment
|
||||
// FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
|
||||
reuse[idx] = new int[(int) si.getValueCount() + 1];
|
||||
} else {
|
||||
Arrays.fill(reuse[idx], 0, maxNeeded, 0);
|
||||
}
|
||||
return reuse[idx];
|
||||
}
|
||||
|
||||
private int[][] getSegmentCountArrays(int segMax, int size) {
|
||||
int[][] ret = new int[size][];
|
||||
int i = size - 1;
|
||||
do {
|
||||
ret[i] = getCountArr(segMax, i);
|
||||
} while (i-- > 0);
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void collectDocs(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
|
||||
int doc;
|
||||
final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv);
|
||||
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
if (singleDv.advanceExact(doc)) {
|
||||
final int maxIdx = disi.registerCounts(segCounter);
|
||||
int segOrd = singleDv.ordValue();
|
||||
collect(doc, segOrd, toGlobal);
|
||||
collect(doc, segOrd, toGlobal, segCounter, maxIdx, disi.collectBase());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void collectCounts(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
|
||||
private void collectCounts(SortedDocValues singleDv, SweepDISI disi, LongValues toGlobal) throws IOException {
|
||||
final SegCountGlobal segCounter = getSegCountGlobal(disi, singleDv);
|
||||
int doc;
|
||||
if (singleDv instanceof FieldCacheImpl.SortedDocValuesImpl.Iter) {
|
||||
|
||||
|
@ -277,7 +336,8 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
int segOrd = fc.getOrd(doc);
|
||||
if (segOrd < 0) continue;
|
||||
int ord = (int)toGlobal.get(segOrd);
|
||||
countAcc.incrementCount(ord, 1);
|
||||
int maxIdx = disi.registerCounts(segCounter);
|
||||
segCounter.incrementCount(ord, 1, maxIdx);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
@ -286,48 +346,54 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
if (singleDv.advanceExact(doc)) {
|
||||
int segOrd = singleDv.ordValue();
|
||||
int ord = (int) toGlobal.get(segOrd);
|
||||
countAcc.incrementCount(ord, 1);
|
||||
int maxIdx = disi.registerCounts(segCounter);
|
||||
segCounter.incrementCount(ord, 1, maxIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private void collectDocs(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
|
||||
private void collectDocs(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
|
||||
final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv);
|
||||
int doc;
|
||||
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
if (multiDv.advanceExact(doc)) {
|
||||
final int maxIdx = disi.registerCounts(segCounter);
|
||||
final boolean collectBase = disi.collectBase();
|
||||
for(;;) {
|
||||
int segOrd = (int)multiDv.nextOrd();
|
||||
if (segOrd < 0) break;
|
||||
collect(doc, segOrd, toGlobal);
|
||||
collect(doc, segOrd, toGlobal, segCounter, maxIdx, collectBase);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void collectCounts(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
|
||||
private void collectCounts(SortedSetDocValues multiDv, SweepDISI disi, LongValues toGlobal) throws IOException {
|
||||
final SegCountGlobal segCounter = getSegCountGlobal(disi, multiDv);
|
||||
int doc;
|
||||
while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
if (multiDv.advanceExact(doc)) {
|
||||
final int maxIdx = disi.registerCounts(segCounter);
|
||||
for(;;) {
|
||||
int segOrd = (int)multiDv.nextOrd();
|
||||
if (segOrd < 0) break;
|
||||
int ord = (int)toGlobal.get(segOrd);
|
||||
countAcc.incrementCount(ord, 1);
|
||||
segCounter.incrementCount(ord, 1, maxIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void collect(int doc, int segOrd, LongValues toGlobal) throws IOException {
|
||||
private void collect(int doc, int segOrd, LongValues toGlobal, SegCountGlobal segCounter, int maxIdx, boolean collectBase) throws IOException {
|
||||
int ord = (toGlobal != null && segOrd >= 0) ? (int)toGlobal.get(segOrd) : segOrd;
|
||||
|
||||
int arrIdx = ord - startTermIndex;
|
||||
// This code handles faceting prefixes, which narrows the range of ords we want to collect.
|
||||
// It’s not an error for an ord to fall outside this range… we simply want to skip it.
|
||||
if (arrIdx >= 0 && arrIdx < nTerms) {
|
||||
countAcc.incrementCount(arrIdx, 1);
|
||||
segCounter.incrementCount(arrIdx, 1, maxIdx);
|
||||
if (collectBase) {
|
||||
if (collectAcc != null) {
|
||||
collectAcc.collect(doc, arrIdx, slotContext);
|
||||
}
|
||||
|
@ -336,5 +402,6 @@ class FacetFieldProcessorByArrayDV extends FacetFieldProcessorByArray {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -310,7 +310,6 @@ public abstract class FacetProcessor<FacetRequestT extends FacetRequest> {
|
|||
// allow a custom count acc to be used
|
||||
if (countAcc == null) {
|
||||
countAcc = new SlotAcc.CountSlotArrAcc(fcontext, slotCount);
|
||||
countAcc.key = "count";
|
||||
}
|
||||
|
||||
for (Map.Entry<String,AggValueSource> entry : freq.getFacetStats().entrySet()) {
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* To be implemented by CountSlotAccs that wish to expose a read-only interface
|
||||
*/
|
||||
interface ReadOnlyCountSlotAcc {
|
||||
|
||||
public long getCount(int slot);
|
||||
|
||||
public int compare(int slotA, int slotB);
|
||||
|
||||
public Object getValue(int slotNum) throws IOException;
|
||||
}
|
|
@ -37,6 +37,7 @@ import org.apache.solr.common.util.NamedList;
|
|||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
import org.apache.solr.search.DocSet;
|
||||
import org.apache.solr.search.QParser;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepableSlotAcc;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -56,6 +57,7 @@ public class RelatednessAgg extends AggValueSource {
|
|||
private static final String RELATEDNESS = "relatedness";
|
||||
private static final String FG_POP = "foreground_popularity";
|
||||
private static final String BG_POP = "background_popularity";
|
||||
public static final String SWEEP_COLLECTION = "sweep_collection";
|
||||
|
||||
// needed for distrib calculation
|
||||
private static final String FG_SIZE = "foreground_size";
|
||||
|
@ -66,8 +68,11 @@ public class RelatednessAgg extends AggValueSource {
|
|||
final protected Query fgQ;
|
||||
final protected Query bgQ;
|
||||
protected double min_pop = 0.0D;
|
||||
private Boolean useSweep;
|
||||
|
||||
public static final String NAME = RELATEDNESS;
|
||||
private static final boolean DEFAULT_SWEEP_COLLECTION = true;
|
||||
|
||||
public RelatednessAgg(Query fgQ, Query bgQ) {
|
||||
super(NAME);
|
||||
// NOTE: ideally we don't want to assume any defaults *yet* if fgQ/bgQ are null
|
||||
|
@ -87,7 +92,10 @@ public class RelatednessAgg extends AggValueSource {
|
|||
public void setOpts(QParser parser) {
|
||||
final boolean isShard = parser.getReq().getParams().getBool(ShardParams.IS_SHARD, false);
|
||||
SolrParams opts = parser.getLocalParams();
|
||||
if (null != opts) {
|
||||
if (null == opts) {
|
||||
this.useSweep = DEFAULT_SWEEP_COLLECTION;
|
||||
} else {
|
||||
this.useSweep = opts.getBool(SWEEP_COLLECTION, DEFAULT_SWEEP_COLLECTION);
|
||||
if (!isShard) { // ignore min_pop if this is a shard request
|
||||
this.min_pop = opts.getDouble("min_popularity", 0.0D);
|
||||
}
|
||||
|
@ -97,7 +105,7 @@ public class RelatednessAgg extends AggValueSource {
|
|||
@Override
|
||||
public String description() {
|
||||
// TODO: need better output processing when we start supporting null fgQ/bgQ in constructor
|
||||
return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop+")";
|
||||
return name +"(fgQ=" + fgQ + ",bgQ=" + bgQ + ",min_pop="+min_pop + ",useSweep="+useSweep+")";
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -163,9 +171,127 @@ public class RelatednessAgg extends AggValueSource {
|
|||
return new Merger(this);
|
||||
}
|
||||
|
||||
private static final class SweepSKGSlotAcc extends SlotAcc {
|
||||
|
||||
private final int minCount; // pre-calculate for a given min_popularity
|
||||
private final long fgSize;
|
||||
private final long bgSize;
|
||||
private final ReadOnlyCountSlotAcc fgCount;
|
||||
private final ReadOnlyCountSlotAcc bgCount;
|
||||
private double[] relatedness;
|
||||
|
||||
private static final int NO_ALL_BUCKETS = -2;
|
||||
private static final int ALL_BUCKETS_UNINITIALIZED = -1;
|
||||
|
||||
// we can't get the allBuckets info from the slotContext in collect(), b/c the whole point of
|
||||
// sweep collection is that the "collect" methods aren't called.
|
||||
// So this is the compromise: note in construction either that we're using a processor w/NO_ALL_BUCKETS
|
||||
// or that we don't know the bucket yet (ALL_BUCKETS_UNINITIALIZED) and fill it in in getValues
|
||||
// where we can check against the processor
|
||||
private int allBucketsSlot;
|
||||
|
||||
public SweepSKGSlotAcc(double minPopularity, FacetContext fcontext, int numSlots, long fgSize, long bgSize, ReadOnlyCountSlotAcc fgCount, ReadOnlyCountSlotAcc bgCount) {
|
||||
super(fcontext);
|
||||
this.minCount = (int) Math.ceil(minPopularity * bgSize);
|
||||
this.fgSize = fgSize;
|
||||
this.bgSize = bgSize;
|
||||
this.fgCount = fgCount;
|
||||
this.bgCount = bgCount;
|
||||
relatedness = new double[numSlots];
|
||||
Arrays.fill(relatedness, 0, numSlots, Double.NaN);
|
||||
|
||||
// any processor that can (currently) result in the use of SweepSKGSlotAcc *should* be a
|
||||
// FacetFieldProcessor -- but don't assume that will always be true...
|
||||
this.allBucketsSlot = NO_ALL_BUCKETS;
|
||||
if (fcontext.processor instanceof FacetFieldProcessor
|
||||
// NOTE: if this instanceof/cast changes, getValues needs updated as well
|
||||
&& ((FacetFieldProcessor)fcontext.processor).freq.allBuckets) {
|
||||
this.allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int perSegDocId, int slot, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
throw new UnsupportedOperationException("collect() not supported, this SlotAcc impl only usable for sweeping");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int collect(DocSet docs, int slot, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
throw new UnsupportedOperationException("collect() not supported, this SlotAcc impl only usable for sweeping");
|
||||
}
|
||||
|
||||
private double getRelatedness(int slot) {
|
||||
final double cachedRelatedness = relatedness[slot];
|
||||
if (Double.isNaN(cachedRelatedness)) {
|
||||
final long fg_count = fgCount.getCount(slot);
|
||||
final long bg_count = bgCount.getCount(slot);
|
||||
if (minCount > 0) {
|
||||
// if min_pop is configured, and either (fg|bg) popularity is lower then that value
|
||||
// then "this.relatedness=-Infinity" so it sorts below any "valid" relatedness scores
|
||||
if (fg_count < minCount || bg_count < minCount) {
|
||||
return relatedness[slot] = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
}
|
||||
return relatedness[slot] = computeRelatedness(fg_count, fgSize, bg_count, bgSize);
|
||||
} else {
|
||||
return cachedRelatedness;
|
||||
}
|
||||
}
|
||||
|
||||
public int compare(int slotA, int slotB) {
|
||||
int r = Double.compare(getRelatedness(slotA), getRelatedness(slotB));
|
||||
if (0 == r) {
|
||||
r = Long.compare(fgCount.getCount(slotA), fgCount.getCount(slotB));
|
||||
}
|
||||
if (0 == r) {
|
||||
r = Long.compare(bgCount.getCount(slotA), bgCount.getCount(slotB));
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getValue(int slotNum) {
|
||||
final BucketData slotVal;
|
||||
if (NO_ALL_BUCKETS != allBucketsSlot) {
|
||||
// there's no reason why a processor should be resizing SlotAccs in the middle of getValue,
|
||||
// but we're going to be vigilent against that possibility just in case...
|
||||
if (ALL_BUCKETS_UNINITIALIZED == allBucketsSlot
|
||||
|| allBucketsSlot == slotNum) {
|
||||
assert fcontext.processor instanceof FacetFieldProcessor
|
||||
: "code changed, non FacetFieldProcessor sweeping w/allBuckets?!?";
|
||||
allBucketsSlot = ((FacetFieldProcessor)fcontext.processor).allBucketsAcc.collectAccSlot;
|
||||
}
|
||||
}
|
||||
if (slotNum == allBucketsSlot) {
|
||||
slotVal = new BucketData(null);
|
||||
} else {
|
||||
slotVal = new BucketData(fgCount.getCount(slotNum), fgSize, bgCount.getCount(slotNum), bgSize, getRelatedness(slotNum));
|
||||
}
|
||||
return slotVal.externalize(fcontext.isShard());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
Arrays.fill(relatedness, Double.NaN);
|
||||
if (allBucketsSlot != NO_ALL_BUCKETS) {
|
||||
allBucketsSlot = ALL_BUCKETS_UNINITIALIZED;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resize(Resizer resizer) {
|
||||
relatedness = resizer.resize(relatedness, Double.NaN);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
relatedness = null;
|
||||
}
|
||||
}
|
||||
|
||||
private static final String IMPLIED_KEY = "implied";
|
||||
|
||||
private static final class SKGSlotAcc extends SlotAcc {
|
||||
private static final class SKGSlotAcc extends SlotAcc implements SweepableSlotAcc<SlotAcc> {
|
||||
private final RelatednessAgg agg;
|
||||
private BucketData[] slotvalues;
|
||||
private final DocSet fgSet;
|
||||
|
@ -181,9 +307,30 @@ public class RelatednessAgg extends AggValueSource {
|
|||
// cache the set sizes for frequent re-use on every slot
|
||||
this.fgSize = fgSet.size();
|
||||
this.bgSize = bgSet.size();
|
||||
this.slotvalues = new BucketData[numSlots];
|
||||
this.slotvalues = new BucketData[numSlots]; //TODO: avoid initializing array until we know we're not doing sweep collection?
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* If called, may register SweepingAccs for fg and bg set based on whether
|
||||
* user indicated sweeping should be used (default)
|
||||
*
|
||||
* @returns null if any SweepingAccs were registered since no other collection is needed for relatedness
|
||||
*/
|
||||
@Override
|
||||
public SKGSlotAcc registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc) {
|
||||
if (!this.agg.useSweep) {
|
||||
return this;
|
||||
} else {
|
||||
final ReadOnlyCountSlotAcc fgCount = baseSweepingAcc.add(key + "!fg", fgSet, slotvalues.length);
|
||||
final ReadOnlyCountSlotAcc bgCount = baseSweepingAcc.add(key + "!bg", bgSet, slotvalues.length);
|
||||
SweepSKGSlotAcc readOnlyReplacement = new SweepSKGSlotAcc(agg.min_pop, fcontext, slotvalues.length, fgSize, bgSize, fgCount, bgCount);
|
||||
readOnlyReplacement.key = key;
|
||||
baseSweepingAcc.registerMapping(this, readOnlyReplacement);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private void processSlot(int slot, IntFunction<SlotContext> slotContext) throws IOException {
|
||||
|
||||
assert null != slotContext;
|
||||
|
@ -213,7 +360,12 @@ public class RelatednessAgg extends AggValueSource {
|
|||
assert null == fcontext.filter;
|
||||
}
|
||||
// ...and in which case we should just use the current base
|
||||
final DocSet slotSet = null == slotQ ? fcontext.base : fcontext.searcher.getDocSet(slotQ);
|
||||
final DocSet slotSet;
|
||||
if (null == slotQ) {
|
||||
slotSet = fcontext.base;
|
||||
} else {
|
||||
slotSet = fcontext.searcher.getDocSet(slotQ);
|
||||
}
|
||||
|
||||
slotVal.incSizes(fgSize, bgSize);
|
||||
slotVal.incCounts(fgSet.intersectionSize(slotSet),
|
||||
|
@ -334,6 +486,16 @@ public class RelatednessAgg extends AggValueSource {
|
|||
this.implied = true;
|
||||
}
|
||||
|
||||
/**
 * Creates a fully populated bucket from pre-computed counts, set sizes, and a
 * pre-computed relatedness score.
 */
public BucketData(long fg_count, long fg_size, long bg_count, long bg_size, double relatedness) {
  this.fg_count = fg_count;
  this.fg_size = fg_size;
  this.bg_count = bg_count;
  this.bg_size = bg_size;
  // yes, BACKGROUND size is intentional in the fg_pop denominator
  this.fg_pop = roundTo5Digits((double) fg_count / bg_size);
  this.bg_pop = roundTo5Digits((double) bg_count / bg_size);
  this.relatedness = relatedness;
}
|
||||
|
||||
/**
|
||||
* Increment both the foreground & background <em>counts</em> for the current bucket, resetting any
|
||||
* derived values that may be cached
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
|
||||
|
||||
final class SingletonDISI extends SweepDISI {
|
||||
|
||||
private final DocIdSetIterator backing;
|
||||
private final boolean isBase;
|
||||
|
||||
SingletonDISI(DocIdSetIterator backing, CountSlotAcc[] countAccs, boolean isBase) {
|
||||
super(1, countAccs);
|
||||
this.backing = backing;
|
||||
this.isBase = isBase;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return backing.nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collectBase() {
|
||||
return isBase;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int registerCounts(SegCounter segCounter) {
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.search.facet;
|
||||
|
||||
import org.apache.solr.search.DocIterator;
|
||||
|
||||
final class SingletonDocIterator extends SweepDocIterator {
|
||||
|
||||
private final DocIterator backing;
|
||||
private final boolean isBase;
|
||||
|
||||
SingletonDocIterator(DocIterator backing, boolean isBase) {
|
||||
super(1);
|
||||
this.backing = backing;
|
||||
this.isBase = isBase;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return backing.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() {
|
||||
return backing.nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collectBase() {
|
||||
return isBase;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int registerCounts(SegCounter segCounts) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.lang.reflect.Array;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.function.IntFunction;
|
||||
|
@ -52,6 +53,8 @@ public abstract class SlotAcc implements Closeable {
|
|||
this.fcontext = fcontext;
|
||||
}
|
||||
|
||||
@Override public String toString() { return key; }
|
||||
|
||||
/**
|
||||
* NOTE: this currently detects when it is being reused and calls resetIterators by comparing reader ords
|
||||
* with previous calls to setNextReader. For this reason, current users must call setNextReader
|
||||
|
@ -597,9 +600,222 @@ public abstract class SlotAcc implements Closeable {
|
|||
}
|
||||
}
|
||||
|
||||
abstract static class CountSlotAcc extends SlotAcc {
|
||||
/**
|
||||
* Implemented by some SlotAccs if they are capable of being used for
|
||||
* sweep collecting in compatible facet processors
|
||||
* @see FacetFieldProcessor#registerSweepingAccIfSupportedByCollectAcc()
|
||||
*/
|
||||
static interface SweepableSlotAcc<T extends SlotAcc> {
|
||||
/**
|
||||
* Called by processors if they support sweeping. Implementations will often
|
||||
* return self or null (the latter indicating that all necessary collection will
|
||||
* be covered by the "sweeping" data structures registered with the specified
|
||||
* baseSweepingAcc as a result of the call to this method).
|
||||
*
|
||||
* If an implementing instance chooses to replace itself with another {@link SlotAcc}, it must
|
||||
* call {@link SweepingCountSlotAcc#registerMapping(SlotAcc, SlotAcc)} on the specified
|
||||
* baseSweepingAcc to notify it of the mapping from original SlotAcc to the SlotAcc that should
|
||||
* be used for purposes of read access. It is the responsibility of the specified {@link SweepingCountSlotAcc}
|
||||
* to ensure proper placement/accessibility of the SlotAcc to be used for read access.
|
||||
*
|
||||
* The replacement SlotAcc registered via {@link SweepingCountSlotAcc#registerMapping(SlotAcc, SlotAcc)}
|
||||
* will be responsible for output via its {@link SlotAcc#setValues(SimpleOrderedMap, int)} method.
|
||||
* An implementer of this method may register such a replacement, and also return a non-null
|
||||
* SlotAcc to be used for normal collection (via {@link FacetFieldProcessor#collectAcc}). In this case,
|
||||
* the implementer should take care that the returned {@link SlotAcc} is different from the {@link SlotAcc}
|
||||
* registered for the purpose of output -- with the former overriding {@link SlotAcc#setValues(SimpleOrderedMap, int)}
|
||||
* as a no-op, to prevent setting duplicate values.
|
||||
*
|
||||
* @param baseSweepingAcc - never null, where the SlotAcc may register domains for sweep collection,
|
||||
* and must register mappings of new read-access SlotAccs that result from this call.
|
||||
* @return SlotAcc to be used for purpose of collection. If null then collect methods will
|
||||
* never be called on this SlotAcc.
|
||||
*/
|
||||
public T registerSweepingAccs(SweepingCountSlotAcc baseSweepingAcc);
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple data structure to {@link DocSet} domains with an associated {@link CountSlotAcc}. This may be used
|
||||
* to support sweep count accumulation over different {@link DocSet} domains, but the concept is perfectly applicable
|
||||
* to encapsulating the relevant state for simple "non-sweep" collection as well (in which case {@link SweepCountAccStruct#docSet}
|
||||
* would be {@link FacetContext#base}, {@link SweepCountAccStruct#countAcc} would be {@link FacetProcessor#countAcc}, and
|
||||
* {@link SweepCountAccStruct#isBase} would trivially be "true").
|
||||
*/
|
||||
static final class SweepCountAccStruct {
|
||||
final DocSet docSet;
|
||||
final boolean isBase;
|
||||
final CountSlotAcc countAcc;
|
||||
public SweepCountAccStruct(DocSet docSet, boolean isBase, CountSlotAcc countAcc) {
|
||||
this.docSet = docSet;
|
||||
this.isBase = isBase;
|
||||
this.countAcc = countAcc;
|
||||
}
|
||||
public SweepCountAccStruct(SweepCountAccStruct t, DocSet replaceDocSet) {
|
||||
this.docSet = replaceDocSet;
|
||||
this.isBase = t.isBase;
|
||||
this.countAcc = t.countAcc;
|
||||
}
|
||||
/**
|
||||
* Because sweep collection offloads "collect" methods to count accumulation code,
|
||||
* it is helpful to provide a read-only view over the backing {@link CountSlotAcc}
|
||||
*
|
||||
* @return - a read-only view over {@link #countAcc}
|
||||
*/
|
||||
public ReadOnlyCountSlotAcc roCountAcc() {
|
||||
return countAcc;
|
||||
}
|
||||
@Override public String toString() {
|
||||
return this.countAcc.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Special CountSlotAcc used by processors that support sweeping to decide what to sweep over and how to "collect"
 * when doing the sweep.
 *
 * This class may be used by instances of {@link SweepableSlotAcc} to register DocSet domains (via {@link SweepingCountSlotAcc#add})
 * over which to sweep-collect facet counts.
 *
 * @see SweepableSlotAcc#registerSweepingAccs
 */
static class SweepingCountSlotAcc extends CountSlotArrAcc {

  static final String SWEEP_COLLECTION_DEBUG_KEY = "sweep_collection";
  // non-null only when facet debugging is enabled for this request
  private final SimpleOrderedMap<Object> debug;
  private final FacetFieldProcessor p;
  // the "base" domain: this acc itself, counting over fcontext.base
  final SweepCountAccStruct base;
  // additional (non-base) domains registered for sweep collection via add()
  final List<SweepCountAccStruct> others = new ArrayList<>();
  // read-access replacement accs registered via registerMapping(); drives setSweepValues()
  private final List<SlotAcc> output = new ArrayList<>();

  SweepingCountSlotAcc(int numSlots, FacetFieldProcessor p) {
    super(p.fcontext, numSlots);
    this.p = p;
    this.base = new SweepCountAccStruct(fcontext.base, true, this);
    final FacetDebugInfo fdebug = fcontext.getDebugInfo();
    // only allocate (and attach) the debug map when debugging was requested
    this.debug = null != fdebug ? new SimpleOrderedMap<>() : null;
    if (null != this.debug) {
      fdebug.putInfoItem(SWEEP_COLLECTION_DEBUG_KEY, debug);
      debug.add("base", key);
      debug.add("accs", new ArrayList<String>());
      debug.add("mapped", new ArrayList<String>());
    }
  }

  /**
   * Called by SweepableSlotAccs to register new DocSet domains for sweep collection
   *
   * @param key
   *          assigned to the returned SlotAcc, and used for debugging
   * @param docs
   *          the domain over which to sweep
   * @param numSlots
   *          the number of slots
   * @return a read-only representation of the count acc which is guaranteed to be populated after sweep count
   *         collection
   */
  public ReadOnlyCountSlotAcc add(String key, DocSet docs, int numSlots) {
    final CountSlotAcc count = new CountSlotArrAcc(fcontext, numSlots);
    count.key = key;
    final SweepCountAccStruct ret = new SweepCountAccStruct(docs, false, count);
    if (null != debug) {
      @SuppressWarnings("unchecked")
      List<String> accsDebug = (List<String>) debug.get("accs");
      accsDebug.add(ret.toString());
    }
    others.add(ret);
    return ret.roCountAcc();
  }

  /**
   * When a {@link SweepableSlotAcc} replaces itself (for the purpose of collection) with a different {@link SlotAcc}
   * instance, it must register that replacement by calling this method with itself as the fromAcc param, and with the
   * new replacement {@link SlotAcc} as the toAcc param. The two SlotAccs must have the same {@link SlotAcc#key}.
   *
   * It is the responsibility of this method to ensure that {@link FacetFieldProcessor} references to fromAcc (other than
   * those within {@link FacetFieldProcessor#collectAcc}, which are set directly by the return value of
   * {@link SweepableSlotAcc#registerSweepingAccs(SweepingCountSlotAcc)}) are replaced
   * by references to toAcc. Such references would include, e.g., {@link FacetFieldProcessor#sortAcc}.
   *
   * It is also this method's responsibility to ensure that read access to toAcc (via toAcc's {@link SlotAcc#setValues(SimpleOrderedMap, int)}
   * method) is provided via this instance's {@link #setValues(SimpleOrderedMap, int)} method.
   *
   * @param fromAcc - the {@link SlotAcc} to be replaced (this will normally be the caller of this method).
   * @param toAcc - the replacement {@link SlotAcc}
   *
   * @see SweepableSlotAcc#registerSweepingAccs(SweepingCountSlotAcc)
   */
  public void registerMapping(SlotAcc fromAcc, SlotAcc toAcc) {
    assert fromAcc.key.equals(toAcc.key);
    output.add(toAcc);
    if (p.sortAcc == fromAcc) {
      // the processor sorts on the acc being replaced: redirect sorting to the replacement
      p.sortAcc = toAcc;
    }
    if (null != debug) {
      @SuppressWarnings("unchecked")
      List<String> mappedDebug = (List<String>) debug.get("mapped");
      mappedDebug.add(fromAcc.toString());
    }
  }

  /**
   * Always populates the bucket with the current count for that slot. If the count is positive, or if
   * <code>processEmpty==true</code>, then this method also populates the values from mapped "output" accumulators.
   *
   * @see #setSweepValues
   */
  @Override
  public void setValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
    super.setValues(bucket, slotNum);
    if (0 < getCount(slotNum) || fcontext.processor.freq.processEmpty) {
      setSweepValues(bucket, slotNum);
    }
  }

  /**
   * Populates the bucket with the values from all mapped "output" accumulators for the specified slot.
   *
   * This method exists because there are some contexts (namely SpecialSlotAcc, for allBuckets, etc.) in which "base"
   * count is tracked differently, via getSpecialCount(). For such cases, we need a method that allows the caller to
   * directly coordinate calling {@link SlotAcc#setValues} on the sweeping output accs, while avoiding the inclusion
   * of {@link CountSlotAcc#setValues CountSlotAcc.setValues}
   */
  public void setSweepValues(SimpleOrderedMap<Object> bucket, int slotNum) throws IOException {
    for (SlotAcc acc : output) {
      acc.setValues(bucket, slotNum);
    }
  }

  /**
   * Helper method for code that wants to operate in a sweeping manner even if the current processor
   * is not using sweeping.
   *
   * @return struct that wraps the {@link FacetContext#base} unless the {@link FacetProcessor#countAcc} is a {@link SweepingCountSlotAcc}
   */
  public static SweepCountAccStruct baseStructOf(FacetProcessor<?> processor) {
    if (processor.countAcc instanceof SweepingCountSlotAcc) {
      return ((SweepingCountSlotAcc) processor.countAcc).base;
    }
    return new SweepCountAccStruct(processor.fcontext.base, true, processor.countAcc);
  }

  /**
   * Helper method for code that wants to operate in a sweeping manner even if the current processor
   * is not using sweeping
   *
   * @return empty list unless the {@link FacetProcessor#countAcc} is a {@link SweepingCountSlotAcc}
   */
  public static List<SweepCountAccStruct> otherStructsOf(FacetProcessor<?> processor) {
    if (processor.countAcc instanceof SweepingCountSlotAcc) {
      return ((SweepingCountSlotAcc) processor.countAcc).others;
    }
    return Collections.emptyList();
  }
}
|
||||
|
||||
abstract static class CountSlotAcc extends SlotAcc implements ReadOnlyCountSlotAcc {
|
||||
public CountSlotAcc(FacetContext fcontext) {
|
||||
super(fcontext);
|
||||
// assume we are the 'count' by default unless/until our creator overrides this
|
||||
this.key = "count";
|
||||
}
|
||||
|
||||
public abstract void incrementCount(int slot, long count);
|
||||
|
|
|
@ -0,0 +1,187 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.util.LongValues;
|
||||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
|
||||
|
||||
/**
 * Implemented by extensions of doc iterators (i.e., {@link DocIdSetIterator}, {@link DocIterator}) over one or
 * more domains, to support facet count accumulation corresponding to each domain (and via {@link #collectBase()}
 * to inform the necessity of "collection" for a single optional backing "base" set).
 */
interface SweepCountAware {

  /**
   * Returns true if one of the domains underlying this iterator is the "base" domain, and if that base domain
   * contains the doc on which this iterator is currently positioned. If "true", then "collection" may be necessary
   * for the current doc.
   *
   * For each iterator position (each doc), {@link #registerCounts(SegCounter)} must be called before this method.
   */
  boolean collectBase();

  /**
   * Called on a positioned doc iterator to register array index mappings for domains that contain the current
   * doc. Implementations register these mappings by calling {@link SegCounter#map(int, int)} on the specified
   * segCounts param.
   *
   * For each iterator position, this method must be called before {@link #collectBase()}
   *
   * @param segCounts - to register mappings of array indices for domains that contain this doc
   * @return - the max index of an array representing the domains that contain the current doc. If "n" domains
   * contain the current doc, the return value would be "n - 1"
   * @throws IOException - if thrown by advancing an underlying doc iterator
   */
  int registerCounts(SegCounter segCounts) throws IOException;

  /**
   * Used to coordinate multiple count accumulations over multiple domains. Implementers will have "n" backing term-ord-indexed
   * counts -- one for each domain over which count accumulation is to be performed. For each doc, count accumulation
   * takes place in two phases, invoked by a "driver" (e.g., {@link FacetFieldProcessor}) that manages iteration over the
   * union of doc domains:
   *
   * First, the driver passes this object as the param to {@link SweepCountAware#registerCounts(SegCounter)}, which
   * calls {@link #map(int, int)} on "this" to map the static "allIdx" (allIdx &lt; n) for each active backing domain to
   * a transient "activeIdx" for counts corresponding to active domains (activeIdx &lt; count(allIdx) &lt;= n). (The return value
   * of {@link SweepCountAware#registerCounts(SegCounter)} indicates to the "driver" the max "active counts" index (for
   * domains that contain the current doc).
   *
   * The driver then calls {@link #incrementCount(int, int, int)}, passing the term ord, increment amount (usually "1"),
   * and the max "active counts" index returned from {@link SweepCountAware#registerCounts(SegCounter)} in the first
   * phase. The "max active counts index" param is used as the limit (inclusive) to iterate count accumulation over each
   * of the "active" domains for the current doc.
   *
   * @see SweepCountAware#registerCounts(SegCounter)
   */
  static interface SegCounter {
    /**
     * Mark/map a given domain/CountSlotAcc as active (eligible for count accumulation) for the current doc.
     *
     * @param allIdx - the static index of the domain/CountSlotAcc to be "activated" for the current doc
     * @param activeIdx - the transient "active index" (for the purpose of actual count accumulation) to which to map
     * the domain/CountSlotAcc indicated by "allIdx".
     */
    void map(int allIdx, int activeIdx);

    /**
     * Increments counts for active domains/CountSlotAccs.
     *
     * @param ord - the term ord (either global ord per-seg) for which to increment counts
     * @param inc - the amount by which to increment the count for the specified term ord
     * @param maxIdx - the max index (inclusive) of active domains/CountSlotAccs to be incremented for the current doc
     */
    void incrementCount(int ord, int inc, int maxIdx);
  }

  /**
   * This class is designed to count over global term ords ({@link SegCountPerSeg} provides equivalent functionality for
   * per-segment term ords).
   *
   * @see SegCountPerSeg
   */
  static class SegCountGlobal implements SegCounter {
    // all registered count accs, indexed by static "allIdx"
    private final CountSlotAcc[] allCounts;
    // accs active for the current doc, indexed by transient "activeIdx"
    private final CountSlotAcc[] activeCounts;

    public SegCountGlobal(CountSlotAcc[] allCounts) {
      this.allCounts = allCounts;
      this.activeCounts = Arrays.copyOf(allCounts, allCounts.length);
    }

    @Override
    public void map(int allIdx, int activeIdx) {
      activeCounts[activeIdx] = allCounts[allIdx];
    }

    @Override
    public final void incrementCount(int globalOrd, int inc, int maxIdx) {
      // NOTE: do-while assumes maxIdx >= 0, i.e. at least one active domain (per contract)
      int i = maxIdx;
      do {
        activeCounts[i].incrementCount(globalOrd, inc);
      } while (i-- > 0);
    }
  }

  /**
   * This class is designed to count over per-segment term ords ({@link SegCountGlobal} provides equivalent functionality for
   * global term ords).
   *
   * @see SegCountGlobal
   */
  static class SegCountPerSeg implements SegCounter {
    // per-domain per-seg-ord count arrays, indexed by static "allIdx"
    protected final int[][] allSegCounts;
    // count arrays active for the current doc, indexed by transient "activeIdx"
    private final int[][] activeSegCounts;
    // seen[segOrd] == true iff any count was accumulated for that per-seg ord
    private final boolean[] seen;

    // NOTE(review): the segMax param is not used in this constructor — TODO confirm intent
    public SegCountPerSeg(int[][] allSegCounts, boolean[] seen, int segMax, int size) {
      this.allSegCounts = allSegCounts;
      this.activeSegCounts = Arrays.copyOf(this.allSegCounts, size);
      this.seen = seen;
    }

    @Override
    public final void map(int allIdx, int activeIdx) {
      activeSegCounts[activeIdx] = allSegCounts[allIdx];
    }

    @Override
    public final void incrementCount(int segOrd, int inc, int maxIdx) {
      seen[segOrd] = true;
      // NOTE: do-while assumes maxIdx >= 0, i.e. at least one active domain (per contract)
      int i = maxIdx;
      do {
        activeSegCounts[i][segOrd] += inc;
      } while (i-- > 0);
    }

    /**
     * Maps accumulated per-segment term ords to global term ords and increments global slots on the specified countAccs
     * accordingly. The index of each CountSlotAcc in the specified countAccs array must correspond to the
     * the static index of its associated count accumulation doc domain and per-seg count array.
     *
     * @param countAccs - global-scope CountSlotAccs (one for each domain) to be incremented for the most recently accumulated
     * segment
     * @param toGlobal - mapping of per-segment term ords to global term ords for the most recently accumulated segment
     * @param maxSegOrd - the max per-seg term ord for the most recently accumulated segment
     */
    public void register(CountSlotAcc[] countAccs, LongValues toGlobal, int maxSegOrd) {
      int segOrd = maxSegOrd;
      final int maxIdx = countAccs.length - 1;
      for (;;) {
        if (seen[segOrd]) {
          int i = maxIdx;
          // null toGlobal means per-seg ords ARE global ords (single-segment case)
          int slot = toGlobal == null ? segOrd : (int)toGlobal.get(segOrd);
          do {
            final int inc = allSegCounts[i][segOrd];
            if (inc > 0) {
              countAccs[i].incrementCount(slot, inc);
            }
          } while (i-- > 0);
        }
        if (--segOrd < 0) {
          break;
        }
      }
    }
  }

}
|
|
@ -0,0 +1,85 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
|
||||
|
||||
public abstract class SweepDISI extends DocIdSetIterator implements SweepCountAware {
|
||||
|
||||
public final int size;
|
||||
final CountSlotAcc[] countAccs;
|
||||
|
||||
public SweepDISI(int size, CountSlotAcc[] countAccs) {
|
||||
this.size = size;
|
||||
this.countAccs = countAccs;
|
||||
}
|
||||
|
||||
private static boolean addAcc(SweepCountAccStruct entry, DocIdSetIterator[] subIterators, CountSlotAcc[] activeCountAccs, LeafReaderContext subCtx, int idx) throws IOException {
|
||||
final DocIdSet docIdSet = entry.docSet.getTopFilter().getDocIdSet(subCtx, null);
|
||||
if (docIdSet == null || (subIterators[idx] = docIdSet.iterator()) == null) {
|
||||
return false;
|
||||
}
|
||||
activeCountAccs[idx] = entry.countAcc;
|
||||
return true;
|
||||
}
|
||||
|
||||
static SweepDISI newInstance(SweepCountAccStruct base, List<SweepCountAccStruct> others, DocIdSetIterator[] subIterators, CountSlotAcc[] activeCountAccs, LeafReaderContext subCtx) throws IOException {
|
||||
int activeCt = 0;
|
||||
final int baseIdx;
|
||||
if (base == null || !addAcc(base, subIterators, activeCountAccs, subCtx, activeCt)) {
|
||||
baseIdx = -1;
|
||||
} else {
|
||||
baseIdx = activeCt++;
|
||||
}
|
||||
for (SweepCountAccStruct entry : others) {
|
||||
if (addAcc(entry, subIterators, activeCountAccs, subCtx, activeCt)) {
|
||||
activeCt++;
|
||||
}
|
||||
}
|
||||
switch (activeCt) {
|
||||
case 0:
|
||||
return null;
|
||||
case 1:
|
||||
return new SingletonDISI(subIterators[0], activeCountAccs, baseIdx >= 0); // solr docsets already exclude any deleted docs
|
||||
default:
|
||||
return new UnionDISI(subIterators, activeCountAccs, activeCt, baseIdx);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
throw new UnsupportedOperationException("Not supported.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
throw new UnsupportedOperationException("Not supported.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
throw new UnsupportedOperationException("Not supported.");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
|
||||
|
||||
abstract class SweepDocIterator implements DocIterator, SweepCountAware {
|
||||
|
||||
public final int size;
|
||||
|
||||
public SweepDocIterator(int size) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
static class SweepIteratorAndCounts {
|
||||
final SweepDocIterator iter;
|
||||
final CountSlotAcc[] countAccs;
|
||||
public SweepIteratorAndCounts(SweepDocIterator iter, CountSlotAcc[] countAccs) {
|
||||
this.iter = iter;
|
||||
this.countAccs = countAccs;
|
||||
}
|
||||
}
|
||||
|
||||
static SweepIteratorAndCounts newInstance(SweepCountAccStruct base, List<SweepCountAccStruct> others) throws IOException {
|
||||
final int activeCt;
|
||||
SweepCountAccStruct entry;
|
||||
if (base == null) {
|
||||
activeCt = others.size();
|
||||
entry = others.get(0);
|
||||
} else {
|
||||
activeCt = others.size() + 1;
|
||||
entry = base;
|
||||
}
|
||||
if (activeCt == 1) {
|
||||
final CountSlotAcc[] countAccs = new CountSlotAcc[] {entry.countAcc};
|
||||
return new SweepIteratorAndCounts(new SingletonDocIterator(entry.docSet.iterator(), base != null), countAccs);
|
||||
} else {
|
||||
final DocIterator[] subIterators = new DocIterator[activeCt];
|
||||
final CountSlotAcc[] countAccs = new CountSlotAcc[activeCt];
|
||||
Iterator<SweepCountAccStruct> othersIter = others.iterator();
|
||||
int i = 0;
|
||||
for (;;) {
|
||||
subIterators[i] = entry.docSet.iterator();
|
||||
countAccs[i] = entry.countAcc;
|
||||
if (++i == activeCt) {
|
||||
break;
|
||||
}
|
||||
entry = othersIter.next();
|
||||
}
|
||||
return new SweepIteratorAndCounts(new UnionDocIterator(subIterators, base == null ? -1 : 0), countAccs);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score() {
|
||||
throw new UnsupportedOperationException("Not supported.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer next() {
|
||||
throw new UnsupportedOperationException("Not supported.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public abstract int registerCounts(SegCounter segCounts); // override to not throw IOException
|
||||
|
||||
}
|
|
@ -41,11 +41,15 @@ import org.apache.solr.index.SlowCompositeReaderWrapper;
|
|||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.TrieField;
|
||||
import org.apache.solr.search.BitDocSet;
|
||||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.solr.search.DocSet;
|
||||
import org.apache.solr.search.SolrCache;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
|
||||
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
|
||||
import org.apache.solr.search.facet.SlotAcc.SlotContext;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
|
||||
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
|
||||
import org.apache.solr.search.facet.SweepDocIterator.SweepIteratorAndCounts;
|
||||
import org.apache.solr.uninverting.DocTermOrds;
|
||||
import org.apache.solr.util.TestInjection;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -315,7 +319,7 @@ public class UnInvertedField extends DocTermOrds {
|
|||
|
||||
|
||||
|
||||
private void getCounts(FacetFieldProcessorByArrayUIF processor, SlotAcc.CountSlotAcc counts) throws IOException {
|
||||
private void getCounts(FacetFieldProcessorByArrayUIF processor) throws IOException {
|
||||
DocSet docs = processor.fcontext.base;
|
||||
int baseSize = docs.size();
|
||||
int maxDoc = searcher.maxDoc();
|
||||
|
@ -325,9 +329,12 @@ public class UnInvertedField extends DocTermOrds {
|
|||
return;
|
||||
}
|
||||
|
||||
SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
|
||||
final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);
|
||||
|
||||
final int[] index = this.index;
|
||||
|
||||
boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet;
|
||||
boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet && baseCountAccStruct != null;
|
||||
|
||||
if (doNegative) {
|
||||
FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
|
||||
|
@ -337,21 +344,34 @@ public class UnInvertedField extends DocTermOrds {
|
|||
docs = new BitDocSet(bs, maxDoc - baseSize);
|
||||
// simply negating will mean that we have deleted docs in the set.
|
||||
// that should be OK, as their entries in our table should be empty.
|
||||
baseCountAccStruct = new SweepCountAccStruct(baseCountAccStruct, docs);
|
||||
}
|
||||
|
||||
// For the biggest terms, do straight set intersections
|
||||
for (TopTerm tt : bigTerms.values()) {
|
||||
// TODO: counts could be deferred if sorting by index order
|
||||
counts.incrementCount(tt.termNum, searcher.numDocs(tt.termQuery, docs));
|
||||
final int termOrd = tt.termNum;
|
||||
Iterator<SweepCountAccStruct> othersIter = others.iterator();
|
||||
SweepCountAccStruct entry = baseCountAccStruct != null ? baseCountAccStruct : othersIter.next();
|
||||
for (;;) {
|
||||
entry.countAcc.incrementCount(termOrd, searcher.numDocs(tt.termQuery, entry.docSet));
|
||||
if (!othersIter.hasNext()) {
|
||||
break;
|
||||
}
|
||||
entry = othersIter.next();
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: we could short-circuit counting altogether for sorted faceting
|
||||
// where we already have enough terms from the bigTerms
|
||||
|
||||
if (termInstances > 0) {
|
||||
DocIterator iter = docs.iterator();
|
||||
final SweepIteratorAndCounts iterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
|
||||
final SweepDocIterator iter = iterAndCounts.iter;
|
||||
final SegCountGlobal counts = new SegCountGlobal(iterAndCounts.countAccs);
|
||||
while (iter.hasNext()) {
|
||||
int doc = iter.nextDoc();
|
||||
int maxIdx = iter.registerCounts(counts);
|
||||
int code = index[doc];
|
||||
|
||||
if ((code & 0x80000000)!=0) {
|
||||
|
@ -368,7 +388,7 @@ public class UnInvertedField extends DocTermOrds {
|
|||
}
|
||||
if (delta == 0) break;
|
||||
tnum += delta - TNUM_OFFSET;
|
||||
counts.incrementCount(tnum,1);
|
||||
counts.incrementCount(tnum, 1, maxIdx);
|
||||
}
|
||||
} else {
|
||||
int tnum = 0;
|
||||
|
@ -378,7 +398,7 @@ public class UnInvertedField extends DocTermOrds {
|
|||
if ((code & 0x80) == 0) {
|
||||
if (delta == 0) break;
|
||||
tnum += delta - TNUM_OFFSET;
|
||||
counts.incrementCount(tnum,1);
|
||||
counts.incrementCount(tnum, 1, maxIdx);
|
||||
delta = 0;
|
||||
}
|
||||
code >>>= 8;
|
||||
|
@ -388,9 +408,10 @@ public class UnInvertedField extends DocTermOrds {
|
|||
}
|
||||
|
||||
if (doNegative) {
|
||||
final CountSlotAcc baseCounts = processor.countAcc;
|
||||
for (int i=0; i<numTermsInField; i++) {
|
||||
// counts[i] = maxTermCounts[i] - counts[i];
|
||||
counts.incrementCount(i, maxTermCounts[i] - (int) counts.getCount(i)*2);
|
||||
baseCounts.incrementCount(i, maxTermCounts[i] - (int) baseCounts.getCount(i)*2);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -409,7 +430,7 @@ public class UnInvertedField extends DocTermOrds {
|
|||
|
||||
public void collectDocs(FacetFieldProcessorByArrayUIF processor) throws IOException {
|
||||
if (processor.collectAcc==null && processor.allBucketsAcc == null && processor.startTermIndex == 0 && processor.endTermIndex >= numTermsInField) {
|
||||
getCounts(processor, processor.countAcc);
|
||||
getCounts(processor);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -427,15 +448,22 @@ public class UnInvertedField extends DocTermOrds {
|
|||
DocSet docs = processor.fcontext.base;
|
||||
|
||||
int uniqueTerms = 0;
|
||||
final SlotAcc.CountSlotAcc countAcc = processor.countAcc;
|
||||
final CountSlotAcc countAcc = processor.countAcc;
|
||||
final SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
|
||||
final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);
|
||||
|
||||
for (TopTerm tt : bigTerms.values()) {
|
||||
if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
|
||||
// handle the biggest terms
|
||||
DocSet intersection = searcher.getDocSet(tt.termQuery, docs);
|
||||
DocSet termSet = searcher.getDocSet(tt.termQuery);
|
||||
DocSet intersection = termSet.intersection(docs);
|
||||
int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex,
|
||||
slotNum -> { return new SlotContext(tt.termQuery); });
|
||||
countAcc.incrementCount(tt.termNum - startTermIndex, collected);
|
||||
final int termOrd = tt.termNum - startTermIndex;
|
||||
countAcc.incrementCount(termOrd, collected);
|
||||
for (SweepCountAccStruct entry : others) {
|
||||
entry.countAcc.incrementCount(termOrd, termSet.intersectionSize(entry.docSet));
|
||||
}
|
||||
if (collected > 0) {
|
||||
uniqueTerms++;
|
||||
}
|
||||
|
@ -455,9 +483,14 @@ public class UnInvertedField extends DocTermOrds {
|
|||
|
||||
// TODO: handle facet.prefix here!!!
|
||||
|
||||
DocIterator iter = docs.iterator();
|
||||
SweepIteratorAndCounts sweepIterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
|
||||
final SweepDocIterator iter = sweepIterAndCounts.iter;
|
||||
final CountSlotAcc[] countAccs = sweepIterAndCounts.countAccs;
|
||||
final SegCountGlobal counts = new SegCountGlobal(countAccs);
|
||||
while (iter.hasNext()) {
|
||||
int doc = iter.nextDoc();
|
||||
int maxIdx = iter.registerCounts(counts);
|
||||
boolean collectBase = iter.collectBase();
|
||||
|
||||
if (doc >= adjustedMax) {
|
||||
do {
|
||||
|
@ -495,9 +528,11 @@ public class UnInvertedField extends DocTermOrds {
|
|||
int arrIdx = tnum - startTermIndex;
|
||||
if (arrIdx < 0) continue;
|
||||
if (arrIdx >= nTerms) break;
|
||||
countAcc.incrementCount(arrIdx, 1);
|
||||
counts.incrementCount(arrIdx, 1, maxIdx);
|
||||
if (collectBase) {
|
||||
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
int tnum = 0;
|
||||
int delta = 0;
|
||||
|
@ -509,9 +544,11 @@ public class UnInvertedField extends DocTermOrds {
|
|||
int arrIdx = tnum - startTermIndex;
|
||||
if (arrIdx >= 0) {
|
||||
if (arrIdx >= nTerms) break;
|
||||
countAcc.incrementCount(arrIdx, 1);
|
||||
counts.incrementCount(arrIdx, 1, maxIdx);
|
||||
if (collectBase) {
|
||||
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
|
||||
}
|
||||
}
|
||||
delta = 0;
|
||||
}
|
||||
code >>>= 8;
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
|
||||
|
||||
/**
 * A {@link SweepDISI} over two or more sub-iterators (one per swept DocSet), merged via a
 * {@link PriorityQueue} min-heap so each doc id is surfaced exactly once even when it occurs in
 * several of the underlying sets. Usage contract: after each {@link #nextDoc()}, callers must
 * invoke {@link #registerCounts} (which advances past, and maps, every sub-iterator positioned
 * on the current doc) before calling {@link #collectBase()}.
 */
final class UnionDISI extends SweepDISI {

  final int maxIdx; // highest valid sub-iterator index (size - 1)
  private final SubIterStruct baseSub; // sub-iterator for the "base" set, or null if no base
  private boolean collectBase; // set by registerCounts() when the current doc is in the base set
  private final PriorityQueue<SubIterStruct> queue; // min-heap ordered by each sub's current docId
  private SubIterStruct top; // cached queue.top()
  private int docId = -1; // doc id most recently returned by nextDoc()

  /** Pairs a sub-iterator with its original input index, caching its current doc id for heap ordering. */
  private static final class SubIterStruct {
    private final DocIdSetIterator sub;
    private final int index;
    private int docId;
    public SubIterStruct(DocIdSetIterator sub, int index) throws IOException {
      this.sub = sub;
      this.index = index;
      nextDoc(); // position on the first doc so heap ordering is valid immediately
    }
    public void nextDoc() throws IOException {
      docId = sub.nextDoc();
    }
  }
  UnionDISI(DocIdSetIterator[] subIterators, CountSlotAcc[] countAccs, int size, int baseIdx) throws IOException {
    super(size, countAccs);
    this.maxIdx = size - 1;
    queue = new PriorityQueue<SubIterStruct>(size) {
      @Override
      protected boolean lessThan(SubIterStruct a, SubIterStruct b) {
        return a.docId < b.docId;
      }
    };
    // populate the heap from the first `size` sub-iterators, remembering which one is the base
    int i = maxIdx;
    SubIterStruct tmpBaseSub = null;
    do {
      final SubIterStruct subIterStruct = new SubIterStruct(subIterators[i], i);
      queue.add(subIterStruct);
      if (i == baseIdx) {
        tmpBaseSub = subIterStruct;
      }
    } while (i-- > 0);
    baseSub = tmpBaseSub;
    top = queue.top();
  }

  @Override
  public int nextDoc() throws IOException {
    // if registerCounts() was called for the previous doc, all subs at that doc were already
    // advanced; otherwise advance every sub still positioned on the previous doc here
    if (top.docId == docId) {
      do {
        top.nextDoc();
      } while ((top = queue.updateTop()).docId == docId);
    }
    if (baseSub != null) {
      collectBase = false; // reset; registerCounts() re-establishes it for the new doc
    }
    return docId = top.docId;
  }

  @Override
  public boolean collectBase() {
    // top.docId == docId would mean registerCounts() hasn't run and collectBase is stale
    assert top.docId != docId : "must call registerCounts() before collectBase()";
    return collectBase;
  }

  @Override
  public int registerCounts(SegCounter segCounter) throws IOException {
    // map every sub positioned on the current doc to a compact seg index [0..i], advancing each
    // past the doc; flags collectBase if the base sub participates. Returns the max compact idx.
    int i = -1;
    do {
      if (!collectBase && top == baseSub) {
        collectBase = true;
      }
      segCounter.map(top.index, ++i);
      top.nextDoc();
    } while ((top = queue.updateTop()).docId == docId);
    return i;
  }

}
|
|
@ -0,0 +1,107 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.search.facet;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.solr.search.DocIterator;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/**
 * A {@link SweepDocIterator} over two or more {@link DocIterator}s (one per swept DocSet),
 * merged via a {@link PriorityQueue} min-heap so each doc id is surfaced exactly once even when
 * it occurs in several of the underlying sets. Usage contract: after each {@link #nextDoc()},
 * callers must invoke {@link #registerCounts} (which advances past, and maps, every
 * sub-iterator positioned on the current doc) before calling {@link #collectBase()}.
 */
final class UnionDocIterator extends SweepDocIterator {

  private final int maxIdx; // highest valid sub-iterator index (size - 1)
  private final SubIterStruct baseSub; // sub-iterator for the "base" set, or null if no base
  private boolean collectBase; // set by registerCounts() when the current doc is in the base set
  private final PriorityQueue<SubIterStruct> queue; // min-heap ordered by each sub's current docId
  private SubIterStruct top; // cached queue.top()
  private int docId = -1; // doc id most recently returned by nextDoc()

  /** Pairs a sub-iterator with its original input index, caching its current doc id for heap ordering. */
  private static final class SubIterStruct {
    private final DocIterator sub;
    private final int index;
    private int docId;
    public SubIterStruct(DocIterator sub, int index) throws IOException {
      this.sub = sub;
      this.index = index;
      nextDoc(); // position on the first doc so heap ordering is valid immediately
    }
    public void nextDoc() {
      // adapt DocIterator's hasNext/nextDoc protocol to a NO_MORE_DOCS sentinel for the heap
      docId = sub.hasNext() ? sub.nextDoc() : DocIdSetIterator.NO_MORE_DOCS;
    }
  }
  UnionDocIterator(DocIterator[] subIterators, int baseIdx) throws IOException {
    super(subIterators.length);
    this.maxIdx = size - 1;
    queue = new PriorityQueue<SubIterStruct>(size) {
      @Override
      protected boolean lessThan(SubIterStruct a, SubIterStruct b) {
        return a.docId < b.docId;
      }
    };
    // populate the heap from all sub-iterators, remembering which one is the base
    SubIterStruct tmpBase = null;
    int i = maxIdx;
    do {
      SubIterStruct subIterStruct = new SubIterStruct(subIterators[i], i);
      queue.add(subIterStruct);
      if (i == baseIdx) {
        tmpBase = subIterStruct;
      }
    } while (i-- > 0);
    this.baseSub = tmpBase;
    top = queue.top();
  }

  @Override
  public int nextDoc() {
    // if registerCounts() was called for the previous doc (or hasNext() already advanced the
    // dups), top has moved on; otherwise advance every sub still positioned on the previous doc
    if (top.docId == docId) {
      do {
        top.nextDoc();
      } while ((top = queue.updateTop()).docId == docId);
    }
    collectBase = false; // reset; registerCounts() re-establishes it for the new doc
    return docId = top.docId;
  }

  @Override
  public boolean hasNext() {
    // NOTE: this eagerly advances sub-iterators past the current doc (idempotent with respect
    // to nextDoc()'s own dedup loop) in order to peek at whether another distinct doc exists
    if (top.docId == docId) {
      do {
        top.nextDoc();
      } while ((top = queue.updateTop()).docId == docId);
    }
    return top.docId != DocIdSetIterator.NO_MORE_DOCS;
  }

  @Override
  public boolean collectBase() {
    // top.docId == docId would mean registerCounts() hasn't run and collectBase is stale
    assert top.docId != docId : "must call registerCounts() before collectBase()";
    return collectBase;
  }

  @Override
  public int registerCounts(SegCounter segCounts) {
    // map every sub positioned on the current doc to a compact seg index [0..i], advancing each
    // past the doc; flags collectBase if the base sub participates. Returns the max compact idx.
    int i = -1;
    do {
      if (!collectBase && top == baseSub) {
        collectBase = true;
      }
      segCounts.map(top.index, ++i);
      top.nextDoc();
    } while ((top = queue.updateTop()).docId == docId);
    return i;
  }
}
|
|
@ -48,6 +48,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
|
|||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import static org.apache.solr.search.facet.FacetField.FacetMethod;
|
||||
import static org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc.SWEEP_COLLECTION_DEBUG_KEY;
|
||||
|
||||
import org.noggit.JSONUtil;
|
||||
import org.noggit.JSONWriter;
|
||||
|
@ -58,12 +59,11 @@ import org.junit.BeforeClass;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A randomized test of nested facets using the <code>relatedness()</code> function, that asserts the
|
||||
 * results are consistent and equivalent regardless of what <code>method</code> (ie: FacetFieldProcessor)
|
||||
* is requested.
|
||||
* and/or <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> option is requested.
|
||||
* </p>
|
||||
* <p>
|
||||
* This test is based on {@link TestCloudJSONFacetSKG} but does <em>not</em>
|
||||
|
@ -281,6 +281,212 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sanity check that our method of varying the <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> in conjunction with the
|
||||
* <code>method</code> params works and can be verified by inspecting the debug output of basic requests.
|
||||
*/
|
||||
  public void testWhiteboxSanitySweepDebug() throws Exception {
    // NOTE: json.facet debugging output can be wonky, particularly when dealing with cloud
    // so for these queries we keep it simple:
    //  - only one "top" facet per request
    //  - no refinement
    // even with those constraints in place, a single facet can (may/sometimes?) produce multiple debug
    // blocks - apparently due to shard merging? So...
    //  - only inspect the "first" debug NamedList in the results
    //

    final SolrParams baseParams = params("rows","0",
                                         "debug","true", // SOLR-14451
                                         // *:* is the only "safe" query for this test,
                                         // to ensure we always have at least one bucket for every facet
                                         // so we can be confident in getting the debug we expect...
                                         "q", "*:*",
                                         "fore", multiStrField(7)+":11",
                                         "back", "*:*");

    // simple individual facet that sorts on an skg stat...
    //
    // all results we test should be the same even if there is another 'skg_extra' stat,
    // it shouldn't be involved in the sweeping at all.
    for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
      // choose a single value string so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
      final TermFacet f = new TermFacet(soloStrField(9), 10, 0, "skg desc", null);
      if (null != extra) {
        f.subFacets.put("skg_extra", extra);
      }
      final Map<String,TermFacet> facets = new LinkedHashMap<>();
      facets.put("str", f);

      final SolrParams facetParams
        = SolrParams.wrapDefaults(params("method_val", "dv",
                                         "json.facet", Facet.toJSONFacetParamValue(facets)),
                                  baseParams);

      // both default sweep option and explicit sweep should give same results...
      for (SolrParams sweepParams : Arrays.asList(params(),
                                                  params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
                                                         "sweep_val", "true"))) {
        final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);

        // expect sweeping of both the fg and bg sets of the (sorting) 'skg' stat
        final NamedList<Object> debug = getFacetDebug(params);
        assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
        @SuppressWarnings("unchecked")
        final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
        assertNotNull(sweep_debug);
        assertEquals("count", sweep_debug.get("base"));
        assertEquals(Arrays.asList("skg!fg","skg!bg"), sweep_debug.get("accs"));
        assertEquals(Arrays.asList("skg"), sweep_debug.get("mapped"));
      }
      { // 'dv' will always *try* to sweep, but disabling on stat should mean debug is mostly empty...
        final SolrParams params = SolrParams.wrapDefaults(params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
                                                                 "sweep_val", "false"),
                                                          facetParams);
        final NamedList<Object> debug = getFacetDebug(params);
        assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
        @SuppressWarnings("unchecked")
        final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
        assertNotNull(sweep_debug);
        assertEquals("count", sweep_debug.get("base"));
        assertEquals(Collections.emptyList(), sweep_debug.get("accs"));
        assertEquals(Collections.emptyList(), sweep_debug.get("mapped"));
      }
      { // if we override 'dv' with 'hashdv' which doesn't sweep, our sweep debug should be empty,
        // even if the skg stat does ask for sweeping explicitly...
        final SolrParams params = SolrParams.wrapDefaults(params("method_val", "dvhash",
                                                                 "sweep_key", RelatednessAgg.SWEEP_COLLECTION,
                                                                 "sweep_val", "true"),
                                                          facetParams);
        final NamedList<Object> debug = getFacetDebug(params);
        assertEquals(FacetFieldProcessorByHashDV.class.getSimpleName(), debug.get("processor"));
        assertNull(debug.get(SWEEP_COLLECTION_DEBUG_KEY));
      }
    }

    // simple facet that sorts on an skg stat but uses prelim_sort on count
    //
    // all results we test should be the same even if there is another 'skg_extra' stat,
    // neither skg should be involved in the sweeping at all.
    for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
      // choose a single value string so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
      final TermFacet f = new TermFacet(soloStrField(9), map("limit", 3, "overrequest", 0,
                                                             "sort", "skg desc",
                                                             "prelim_sort", "count asc"));
      if (null != extra) {
        f.subFacets.put("skg_extra", extra);
      }
      final Map<String,TermFacet> facets = new LinkedHashMap<>();
      facets.put("str", f);

      final SolrParams facetParams
        = SolrParams.wrapDefaults(params("method_val", "dv",
                                         "json.facet", Facet.toJSONFacetParamValue(facets)),
                                  baseParams);

      // default sweep as well as any explicit sweep=true/false values should give same results: no sweeping
      for (SolrParams sweepParams : Arrays.asList(params(),
                                                  params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
                                                         "sweep_val", "false"),
                                                  params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
                                                         "sweep_val", "true"))) {
        final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);

        final NamedList<Object> debug = getFacetDebug(params);
        assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
        @SuppressWarnings("unchecked")
        final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
        assertNotNull(sweep_debug);
        assertEquals("count", sweep_debug.get("base"));
        assertEquals(Collections.emptyList(), sweep_debug.get("accs"));
        assertEquals(Collections.emptyList(), sweep_debug.get("mapped"));
      }
    }

    { // single facet with infinite limit + multiple skgs...
      // this should trigger MultiAcc collection, causing sweeping on both skg functions
      //
      // all results we test should be the same even if there is another 'min' stat,
      // in each term facet. it shouldn't affect the sweeping/MultiAcc at all.
      for (Facet extra : Arrays.asList(null, new SumFacet(multiIntField(2)))) {
        final Map<String,TermFacet> facets = new LinkedHashMap<>();
        final TermFacet facet = new TermFacet(soloStrField(9), -1, 0, "skg2 desc", null);
        facet.subFacets.put("skg2", new RelatednessFacet(multiStrField(2)+":9", null));
        if (null != extra) {
          facet.subFacets.put("sum", extra);
        }
        facets.put("str", facet);
        final SolrParams facetParams
          = SolrParams.wrapDefaults(params("method_val", "dv",
                                           "json.facet", Facet.toJSONFacetParamValue(facets)),
                                    baseParams);

        // both default sweep option and explicit sweep should give same results...
        for (SolrParams sweepParams : Arrays.asList(params(),
                                                    params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
                                                           "sweep_val", "true"))) {
          final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);

          // both skg functions should be swept (fg and bg sets each) and mapped
          final NamedList<Object> debug = getFacetDebug(params);
          assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
          @SuppressWarnings("unchecked")
          final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
          assertNotNull(sweep_debug);
          assertEquals("count", sweep_debug.get("base"));
          assertEquals(Arrays.asList("skg!fg","skg!bg","skg2!fg","skg2!bg"), sweep_debug.get("accs"));
          assertEquals(Arrays.asList("skg","skg2"), sweep_debug.get("mapped"));
        }
      }
    }

    // nested facets that both sort on an skg stat
    // (set limit + overrequest tiny to keep multishard response manageable)
    //
    // all results we test should be the same even if there is another 'skg_extra' stat,
    // in each term facet. they shouldn't be involved in the sweeping at all.
    for (Facet extra : Arrays.asList(null, new RelatednessFacet(multiStrField(2)+":9", null))) {
      // choose single value strings so we know both 'dv' (sweep) and 'dvhash' (no sweep) can be specified
      // choose 'id' for the parent facet so we are guaranteed some child facets
      final TermFacet parent = new TermFacet("id", 1, 0, "skg desc", false);
      final TermFacet child = new TermFacet(soloStrField(7), 1, 0, "skg desc", false);
      parent.subFacets.put("child", child);
      if (null != extra) {
        parent.subFacets.put("skg_extra", extra);
        child.subFacets.put("skg_extra", extra);
      }
      final Map<String,TermFacet> facets = new LinkedHashMap<>();
      facets.put("parent", parent);

      final SolrParams facetParams
        = SolrParams.wrapDefaults(params("method_val", "dv",
                                         "json.facet", Facet.toJSONFacetParamValue(facets)),
                                  baseParams);
      // both default sweep option and explicit sweep should give same results...
      for (SolrParams sweepParams : Arrays.asList(params(),
                                                  params("sweep_key", RelatednessAgg.SWEEP_COLLECTION,
                                                         "sweep_val", "true"))) {
        final SolrParams params = SolrParams.wrapDefaults(sweepParams, facetParams);

        final NamedList<Object> parentDebug = getFacetDebug(params);
        assertEquals("id", parentDebug.get("field"));
        assertNotNull(parentDebug.get("sub-facet"));
        // may be multiples from diff shards, just use first one
        @SuppressWarnings("unchecked")
        final NamedList<Object> childDebug = ((List<NamedList<Object>>)parentDebug.get("sub-facet")).get(0);
        assertEquals(soloStrField(7), childDebug.get("field"));

        // these should all be true for both the parent and the child debug..
        for (NamedList<Object> debug : Arrays.asList(parentDebug, childDebug)) {
          assertEquals(FacetFieldProcessorByArrayDV.class.getSimpleName(), debug.get("processor"));
          @SuppressWarnings("unchecked")
          final NamedList<Object> sweep_debug = (NamedList<Object>) debug.get(SWEEP_COLLECTION_DEBUG_KEY);
          assertNotNull(sweep_debug);
          assertEquals("count", sweep_debug.get("base"));
          assertEquals(Arrays.asList("skg!fg","skg!bg"), sweep_debug.get("accs"));
          assertEquals(Arrays.asList("skg"), sweep_debug.get("mapped"));
        }
      }
    }
  }
|
||||
|
||||
/**
|
||||
* returns the <b>FIRST</b> NamedList (under the implicit 'null' FacetQuery) in the "facet-trace" output
|
||||
* of the request. Should not be used with multiple "top level" facets
|
||||
|
@ -358,7 +564,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
{ // multi-valued facet field w/infinite limit and an extra (non-SKG) stat
|
||||
{ // multi-valued facet field w/infinite limit and an extra (non-SKG / non-sweeping) stat
|
||||
final TermFacet xxx = new TermFacet(multiStrField(12), -1, 0, "count asc", false);
|
||||
xxx.subFacets.put("sum", new SumFacet(multiIntField(4)));
|
||||
final Map<String,TermFacet> facets = new LinkedHashMap<>();
|
||||
|
@ -414,7 +620,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
for (int limit : Arrays.asList(10, -1)) {
|
||||
for (String sort : Arrays.asList("count desc", "skg desc", "index asc")) {
|
||||
for (Boolean refine : Arrays.asList(false, true)) {
|
||||
{ // 1 additional (non-SKG) stat
|
||||
{ // 1 additional (non-SKG / non-sweeping) stat
|
||||
final TermFacet xxx = new TermFacet(facetFieldName, map("limit", limit,
|
||||
"overrequest", 0,
|
||||
"sort", sort,
|
||||
|
@ -440,7 +646,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
multiStrField(0) + ":46"),
|
||||
multiStrField(5)+":9", "*:*");
|
||||
}
|
||||
{ // multiple SKGs and a multiple non-SKG stats
|
||||
{ // multiple SKGs and a multiple non-SKG / non-sweeping stats
|
||||
final TermFacet xxx = new TermFacet(facetFieldName, map("limit", limit,
|
||||
"overrequest", 0,
|
||||
"sort", sort,
|
||||
|
@ -504,6 +710,8 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
/**
|
||||
* Given a set of term facets, and top level query strings, asserts that
|
||||
* the results of these queries are identical even when varying the <code>method_val</code> param
|
||||
* and when varying the <code>{@value RelatednessAgg#SWEEP_COLLECTION}</code> param; either by explicitly setting to
|
||||
* <code>true</code> or <code>false</code> or by changing the param key to not set it at all.
|
||||
*/
|
||||
private void assertFacetSKGsAreConsistent(final Map<String,TermFacet> facets,
|
||||
final String query,
|
||||
|
@ -520,9 +728,14 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
@SuppressWarnings({"rawtypes"})
|
||||
final NamedList expected = getFacetResponse(basicParams);
|
||||
|
||||
// now loop over all processors and compare them to the "default"...
|
||||
// now loop over all permutations of processors and sweep values and and compare them to the "default"...
|
||||
for (FacetMethod method : EnumSet.allOf(FacetMethod.class)) {
|
||||
ModifiableSolrParams options = params("method_val", method.toString().toLowerCase(Locale.ROOT));
|
||||
for (Boolean sweep : Arrays.asList(true, false, null)) {
|
||||
final ModifiableSolrParams options = params("method_val", method.toString().toLowerCase(Locale.ROOT));
|
||||
if (null != sweep) {
|
||||
options.add("sweep_key", RelatednessAgg.SWEEP_COLLECTION);
|
||||
options.add("sweep_val", sweep.toString());
|
||||
}
|
||||
|
||||
@SuppressWarnings({"rawtypes"})
|
||||
final NamedList actual = getFacetResponse(SolrParams.wrapAppended(options, basicParams));
|
||||
|
@ -543,6 +756,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
fail("Mismatch: " + pathToMismatch + " using " + options);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (AssertionError e) {
|
||||
throw new AssertionError(basicParams + " ===> " + e.getMessage(), e);
|
||||
} finally {
|
||||
|
@ -617,6 +831,10 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
* unless they are 'null' in which case <code>$fore</code> and <code>$back</code> refs will be used
|
||||
* in their place, and must be set as request params (this allows "random" facets to still easily
|
||||
* trigger the "nested facets re-using the same fore/back set for SKG situation)
|
||||
*
|
||||
* The JSON for all of these facets includes a <code>${sweep_key:xxx}</code> (which will be ignored
|
||||
* by default) and <code>${sweep_val:yyy}</code> which may be set as params on each request to override the
|
||||
* implicit default sweeping behavior of the underlying SKGAcc.
|
||||
*/
|
||||
private static final class RelatednessFacet implements Facet, Writable {
|
||||
public final Map<String,Object> jsonData = new LinkedHashMap<>();
|
||||
|
@ -641,7 +859,7 @@ public class TestCloudJSONFacetSKGEquiv extends SolrCloudTestCase {
|
|||
// we don't allow these to be overridden by options, so set them now...
|
||||
jsonData.put("type", "func");
|
||||
jsonData.put("func", "relatedness("+f+","+b+")");
|
||||
|
||||
jsonData.put("${sweep_key:xxx}","${sweep_val:yyy}");
|
||||
}
|
||||
@Override
|
||||
public void write(JSONWriter writer) {
|
||||
|
|
|
@ -763,6 +763,54 @@ public class TestJsonFacets extends SolrTestCaseHS {
|
|||
+ " } } ] } }");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSKGSweepMultiAcc() throws Exception {
|
||||
Client client = Client.localClient();
|
||||
indexSimple(client);
|
||||
|
||||
// simple single level facet w/skg & trivial non-sweeping stat using various sorts & (re)sorting
|
||||
for (String sort : Arrays.asList("sort:'index asc'",
|
||||
"sort:'y desc'",
|
||||
"sort:'z desc'",
|
||||
"sort:'skg desc'",
|
||||
"prelim_sort:'count desc', sort:'index asc'",
|
||||
"prelim_sort:'count desc', sort:'y desc'",
|
||||
"prelim_sort:'count desc', sort:'z desc'",
|
||||
"prelim_sort:'count desc', sort:'skg desc'")) {
|
||||
// the relatedness score of each of our cat_s values is (conviniently) also alphabetical order,
|
||||
// (and the same order as 'sum(num_i) desc' & 'min(num_i) desc')
|
||||
//
|
||||
// So all of these re/sort options should produce identical output
|
||||
// - Testing "index" sort allows the randomized use of "stream" processor as default to be tested.
|
||||
// - Testing (re)sorts on other stats sanity checks code paths where relatedness() is a "defered" Agg
|
||||
|
||||
for (String sweep : Arrays.asList("true", "false")) {
|
||||
// results should be the same even if we disable sweeping...
|
||||
assertJQ(req("q", "cat_s:[* TO *]", "rows", "0",
|
||||
"fore", "where_s:NY", "back", "*:*",
|
||||
"json.facet", ""
|
||||
+ "{x: { type: terms, field: 'cat_s', "+sort+", limit:-1, "
|
||||
+ " facet: { skg: { type: 'func', func:'relatedness($fore,$back)', "
|
||||
+" "+RelatednessAgg.SWEEP_COLLECTION+": "+sweep+" },"
|
||||
+ " y:'sum(num_i)', "
|
||||
+" z:'min(num_i)' } } }")
|
||||
, "facets=={count:5, x:{ buckets:["
|
||||
+ " { val:'A', count:2, y:5.0, z:2, "
|
||||
+ " skg : { relatedness: 0.00554, "
|
||||
+ " foreground_popularity: 0.16667,"
|
||||
+ " background_popularity: 0.33333, },"
|
||||
+ " }, "
|
||||
+ " { val:'B', count:3, y:-3.0, z:-5, "
|
||||
+ " skg : { relatedness: 0.0, " // perfectly average and uncorrolated
|
||||
+ " foreground_popularity: 0.16667,"
|
||||
+ " background_popularity: 0.5 },"
|
||||
+ " } ] } } "
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testRepeatedNumerics() throws Exception {
|
||||
Client client = Client.localClient();
|
||||
|
|
|
@ -919,6 +919,10 @@ NOTE: While it's very common to define the Background Set as `\*:*`, or some oth
|
|||
|
||||
When using the extended `type:func` syntax for specifying a `relatedness()` aggregation, an optional `min_popularity` (float) option can be used to specify a lower bound on the `foreground_popularity` and `background_popularity` values, that must be met in order for the `relatedness` score to be valid -- If this `min_popularity` is not met, then the `relatedness` score will be `-Infinity`.
|
||||
|
||||
The default implementation for calculating `relatedness()` domain correlation depends on the type of facet being calculated. Generic domain correlation is calculated per-term, by selectively retrieving a DocSet for each bucket-associated query (consulting the `filterCache`) and calculating DocSet intersections with "foreground" and "background" sets. For term facets (especially over high-cardinality fields) this approach can lead to `filterCache` thrashing; accordingly, `relatedness()` over term facets defaults where possible to an approach that collects facet counts directly over all multiple domains in a single sweep (never touching the `filterCache`). It is possible to explicitly control this "single sweep" collection by setting the extended `type:func` syntax `sweep_collection` option to `true` (the default) or `false` (to disable sweep collection).
|
||||
|
||||
NOTE: Disabling sweep collection for `relatedness()` stats over low-cardinality fields may yield a performance benefit, provided the `filterCache` is sufficiently large to accommodate an entry for each value in the associated field without inducing thrashing for anticipated use patterns. A reasonable heuristic is that fields of cardinality less than 1,000 _may_ benefit from disabling sweep. This heuristic is _not_ used to determine default behavior, particularly because non-sweep collection can so easily induce `filterCache` thrashing, with system-wide detrimental effects.
|
||||
|
||||
[source,json]
|
||||
----
|
||||
{ "type": "func",
|
||||
|
|
Loading…
Reference in New Issue