mirror of https://github.com/apache/lucene.git
SOLR-8222: optimize method=dv faceting for counts
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712608 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent b5b4292afc
commit 2bbbc4d42d

@@ -346,6 +346,12 @@ Optimizations

* SOLR-7983: Utils.toUTF8 uses larger buffer than necessary for holding UTF8 data. (shalin)

* SOLR-8222: JSON Facet API optimization for faceting by count on docvalue fields (or indexed fields
  with method=dv) when there are multiple hits expected for enough buckets. For example, this
  more than doubled the performance of faceting 5M documents over a field with 1M unique values.
  (yonik)

Other Changes
----------------------

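For context, a count-only terms facet request that exercises this optimized path might look like the sketch below. The collection name, field name, and facet label are placeholders for illustration and are not part of this commit; method:dv selects the docvalues faceting method that this change speeds up.

    curl http://localhost:8983/solr/collection1/query -d 'q=*:*&rows=0&
     json.facet={
       categories : {
         type : terms,
         field : cat,
         method : dv,
         limit : 10
       }
     }'
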
@@ -73,6 +73,9 @@ public class FacetField extends FacetRequest {
  FacetMethod method;
  int cacheDf;  // 0 means "default", -1 means "never cache"

  // experimental - force perSeg collection when using dv method, currently for testing purposes only.
  Boolean perSeg;

  // TODO: put this somewhere more generic?
  public static enum SortDirection {
    asc(-1) ,

@@ -692,158 +695,6 @@ abstract class FacetFieldProcessorFCBase extends FacetFieldProcessor {
}


class FacetFieldProcessorDV extends FacetFieldProcessorFCBase {
  static boolean unwrap_singleValued_multiDv = true;  // only set to false for test coverage

  boolean multiValuedField;
  SortedSetDocValues si;  // only used for term lookups (for both single and multi-valued)
  MultiDocValues.OrdinalMap ordinalMap = null;  // maps per-segment ords to global ords


  public FacetFieldProcessorDV(FacetContext fcontext, FacetField freq, SchemaField sf) {
    super(fcontext, freq, sf);
    multiValuedField = sf.multiValued() || sf.getType().multiValuedFieldCache();
  }

  protected BytesRef lookupOrd(int ord) throws IOException {
    return si.lookupOrd(ord);
  }

  protected void findStartAndEndOrds() throws IOException {
    if (multiValuedField) {
      si = FieldUtil.getSortedSetDocValues(fcontext.qcontext, sf, null);
      if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
        ordinalMap = ((MultiDocValues.MultiSortedSetDocValues)si).mapping;
      }
    } else {
      SortedDocValues single = FieldUtil.getSortedDocValues(fcontext.qcontext, sf, null);
      si = DocValues.singleton(single);  // multi-valued view
      if (single instanceof MultiDocValues.MultiSortedDocValues) {
        ordinalMap = ((MultiDocValues.MultiSortedDocValues)single).mapping;
      }
    }

    if (si.getValueCount() >= Integer.MAX_VALUE) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Field has too many unique values. field=" + sf + " nterms= " + si.getValueCount());
    }

    if (prefixRef != null) {
      startTermIndex = (int)si.lookupTerm(prefixRef.get());
      if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
      prefixRef.append(UnicodeUtil.BIG_TERM);
      endTermIndex = (int)si.lookupTerm(prefixRef.get());
      assert endTermIndex < 0;
      endTermIndex = -endTermIndex - 1;
    } else {
      startTermIndex = 0;
      endTermIndex = (int)si.getValueCount();
    }

    nTerms = endTermIndex - startTermIndex;
  }

  @Override
  protected void collectDocs() throws IOException {
    if (nTerms <= 0 || fcontext.base.size() < effectiveMincount) {  // TODO: what about allBuckets? missing bucket?
      return;
    }

    final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
    Filter filter = fcontext.base.getTopFilter();

    for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
      LeafReaderContext subCtx = leaves.get(subIdx);

      setNextReaderFirstPhase(subCtx);

      DocIdSet dis = filter.getDocIdSet(subCtx, null);  // solr docsets already exclude any deleted docs
      DocIdSetIterator disi = dis.iterator();

      SortedDocValues singleDv = null;
      SortedSetDocValues multiDv = null;
      if (multiValuedField) {
        // TODO: get sub from multi?
        multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
        if (multiDv == null) {
          multiDv = DocValues.emptySortedSet();
        }
        // some codecs may optimize SortedSet storage for single-valued fields
        // this will be null if this is not a wrapped single valued docvalues.
        if (unwrap_singleValued_multiDv) {
          singleDv = DocValues.unwrapSingleton(multiDv);
        }
      } else {
        singleDv = subCtx.reader().getSortedDocValues(sf.getName());
        if (singleDv == null) {
          singleDv = DocValues.emptySorted();
        }
      }

      LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);

      if (singleDv != null) {
        collectDocs(singleDv, disi, toGlobal);
      } else {
        collectDocs(multiDv, disi, toGlobal);
      }
    }

  }

  protected void collectDocs(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      int segOrd = singleDv.getOrd(doc);
      if (segOrd < 0) continue;
      collect(doc, segOrd, toGlobal);
    }
  }

  protected void collectDocs(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      multiDv.setDocument(doc);
      for(;;) {
        int segOrd = (int)multiDv.nextOrd();
        if (segOrd < 0) break;
        collect(doc, segOrd, toGlobal);
      }
    }
  }

  private void collect(int doc, int segOrd, LongValues toGlobal) throws IOException {
    int ord = (toGlobal != null && segOrd >= 0) ? (int)toGlobal.get(segOrd) : segOrd;

    int arrIdx = ord - startTermIndex;
    if (arrIdx >= 0 && arrIdx < nTerms) {
      countAcc.incrementCount(arrIdx, 1);
      if (collectAcc != null) {
        collectAcc.collect(doc, arrIdx);
      }
      if (allBucketsAcc != null) {
        allBucketsAcc.collect(doc, arrIdx);
      }
    }
  }

}


///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -0,0 +1,292 @@
package org.apache.solr.search.facet;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.Filter;

class FacetFieldProcessorDV extends FacetFieldProcessorFCBase {
  static boolean unwrap_singleValued_multiDv = true;  // only set to false for test coverage

  boolean multiValuedField;
  SortedSetDocValues si;  // only used for term lookups (for both single and multi-valued)
  MultiDocValues.OrdinalMap ordinalMap = null;  // maps per-segment ords to global ords


  public FacetFieldProcessorDV(FacetContext fcontext, FacetField freq, SchemaField sf) {
    super(fcontext, freq, sf);
    multiValuedField = sf.multiValued() || sf.getType().multiValuedFieldCache();
  }

  protected BytesRef lookupOrd(int ord) throws IOException {
    return si.lookupOrd(ord);
  }

  protected void findStartAndEndOrds() throws IOException {
    if (multiValuedField) {
      si = FieldUtil.getSortedSetDocValues(fcontext.qcontext, sf, null);
      if (si instanceof MultiDocValues.MultiSortedSetDocValues) {
        ordinalMap = ((MultiDocValues.MultiSortedSetDocValues)si).mapping;
      }
    } else {
      SortedDocValues single = FieldUtil.getSortedDocValues(fcontext.qcontext, sf, null);
      si = DocValues.singleton(single);  // multi-valued view
      if (single instanceof MultiDocValues.MultiSortedDocValues) {
        ordinalMap = ((MultiDocValues.MultiSortedDocValues)single).mapping;
      }
    }

    if (si.getValueCount() >= Integer.MAX_VALUE) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Field has too many unique values. field=" + sf + " nterms= " + si.getValueCount());
    }

    if (prefixRef != null) {
      startTermIndex = (int)si.lookupTerm(prefixRef.get());
      if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;
      prefixRef.append(UnicodeUtil.BIG_TERM);
      endTermIndex = (int)si.lookupTerm(prefixRef.get());
      assert endTermIndex < 0;
      endTermIndex = -endTermIndex - 1;
    } else {
      startTermIndex = 0;
      endTermIndex = (int)si.getValueCount();
    }

    nTerms = endTermIndex - startTermIndex;
  }

  @Override
  protected void collectDocs() throws IOException {
    int domainSize = fcontext.base.size();

    if (nTerms <= 0 || domainSize < effectiveMincount) {  // TODO: what about allBuckets? missing bucket?
      return;
    }

    // TODO: refactor some of this logic into a base class
    boolean countOnly = collectAcc==null && allBucketsAcc==null;
    boolean fullRange = startTermIndex == 0 && endTermIndex == si.getValueCount();

    // Are we expecting many hits per bucket?
    // FUTURE: pro-rate for nTerms?
    // FUTURE: better take into account number of values in multi-valued fields. This info is available for indexed fields.
    // FUTURE: take into account that bigger ord maps are more expensive than smaller ones
    // One test: 5M doc index, faceting on a single-valued field with almost 1M unique values, crossover point where global counting was slower
    // than per-segment counting was a domain of 658k docs. At that point, top 10 buckets had 6-7 matches each.
    // this was for heap docvalues produced by UninvertingReader
    // Since these values were randomly distributed, lets round our domain multiplier up to account for less random real world data.
    long domainMultiplier = multiValuedField ? 4L : 2L;
    boolean manyHitsPerBucket = domainSize * domainMultiplier > (si.getValueCount() + 3);  // +3 to increase test coverage with small tests

    // If we're only calculating counts, we're not prefixing, and we expect to collect many documents per unique value,
    // then collect per-segment before mapping to global ords at the end. This will save redundant seg->global ord mappings.
    // FUTURE: there are probably some other non "countOnly" cases where we can use this as well (i.e. those where
    // the docid is not used)
    boolean canDoPerSeg = countOnly && fullRange;
    boolean accumSeg = manyHitsPerBucket && canDoPerSeg;

    if (freq.perSeg != null) accumSeg = canDoPerSeg && freq.perSeg;  // internal - override perSeg heuristic

    final List<LeafReaderContext> leaves = fcontext.searcher.getIndexReader().leaves();
    Filter filter = fcontext.base.getTopFilter();

    for (int subIdx = 0; subIdx < leaves.size(); subIdx++) {
      LeafReaderContext subCtx = leaves.get(subIdx);

      setNextReaderFirstPhase(subCtx);

      DocIdSet dis = filter.getDocIdSet(subCtx, null);  // solr docsets already exclude any deleted docs
      DocIdSetIterator disi = dis.iterator();

      SortedDocValues singleDv = null;
      SortedSetDocValues multiDv = null;
      if (multiValuedField) {
        // TODO: get sub from multi?
        multiDv = subCtx.reader().getSortedSetDocValues(sf.getName());
        if (multiDv == null) {
          multiDv = DocValues.emptySortedSet();
        }
        // some codecs may optimize SortedSet storage for single-valued fields
        // this will be null if this is not a wrapped single valued docvalues.
        if (unwrap_singleValued_multiDv) {
          singleDv = DocValues.unwrapSingleton(multiDv);
        }
      } else {
        singleDv = subCtx.reader().getSortedDocValues(sf.getName());
        if (singleDv == null) {
          singleDv = DocValues.emptySorted();
        }
      }

      LongValues toGlobal = ordinalMap == null ? null : ordinalMap.getGlobalOrds(subIdx);

      if (singleDv != null) {
        if (accumSeg) {
          collectPerSeg(singleDv, disi, toGlobal);
        } else {
          if (canDoPerSeg && toGlobal != null) {
            collectCounts(singleDv, disi, toGlobal);
          } else {
            collectDocs(singleDv, disi, toGlobal);
          }
        }
      } else {
        if (accumSeg) {
          collectPerSeg(multiDv, disi, toGlobal);
        } else {
          if (canDoPerSeg && toGlobal != null) {
            collectCounts(multiDv, disi, toGlobal);
          } else {
            collectDocs(multiDv, disi, toGlobal);
          }
        }
      }
    }

    reuse = null;  // better GC
  }

  private int[] reuse;
  private int[] getCountArr(int maxNeeded) {
    if (reuse == null) {
      // make the count array large enough for any segment
      // FUTURE: (optionally) directly use the array of the CountAcc for an optimized index..
      reuse = new int[(int) si.getValueCount() + 1];
    } else {
      Arrays.fill(reuse, 0, maxNeeded, 0);
    }
    return reuse;
  }

  private void collectPerSeg(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
    int segMax = singleDv.getValueCount() + 1;
    final int[] counts = getCountArr( segMax );

    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      counts[ singleDv.getOrd(doc) + 1 ]++;
    }

    for (int i=1; i<segMax; i++) {
      int segCount = counts[i];
      if (segCount > 0) {
        int slot = toGlobal == null ? (i - 1) : (int) toGlobal.get(i - 1);
        countAcc.incrementCount(slot, segCount);
      }
    }
  }


  private void collectPerSeg(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
    int segMax = (int)multiDv.getValueCount();
    final int[] counts = getCountArr( segMax );

    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      multiDv.setDocument(doc);
      for(;;) {
        int segOrd = (int)multiDv.nextOrd();
        if (segOrd < 0) break;
        counts[segOrd]++;
      }
    }

    for (int i=0; i<segMax; i++) {
      int segCount = counts[i];
      if (segCount > 0) {
        int slot = toGlobal == null ? (i) : (int) toGlobal.get(i);
        countAcc.incrementCount(slot, segCount);
      }
    }
  }

  private void collectDocs(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      int segOrd = singleDv.getOrd(doc);
      if (segOrd < 0) continue;
      collect(doc, segOrd, toGlobal);
    }
  }

  private void collectCounts(SortedDocValues singleDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      int segOrd = singleDv.getOrd(doc);
      if (segOrd < 0) continue;
      int ord = (int)toGlobal.get(segOrd);
      countAcc.incrementCount(ord, 1);
    }
  }

  private void collectDocs(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      multiDv.setDocument(doc);
      for(;;) {
        int segOrd = (int)multiDv.nextOrd();
        if (segOrd < 0) break;
        collect(doc, segOrd, toGlobal);
      }
    }
  }

  private void collectCounts(SortedSetDocValues multiDv, DocIdSetIterator disi, LongValues toGlobal) throws IOException {
    int doc;
    while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      multiDv.setDocument(doc);
      for(;;) {
        int segOrd = (int)multiDv.nextOrd();
        if (segOrd < 0) break;
        int ord = (int)toGlobal.get(segOrd);
        countAcc.incrementCount(ord, 1);
      }
    }
  }

  private void collect(int doc, int segOrd, LongValues toGlobal) throws IOException {
    int ord = (toGlobal != null && segOrd >= 0) ? (int)toGlobal.get(segOrd) : segOrd;

    int arrIdx = ord - startTermIndex;
    if (arrIdx >= 0 && arrIdx < nTerms) {
      countAcc.incrementCount(arrIdx, 1);
      if (collectAcc != null) {
        collectAcc.collect(doc, arrIdx);
      }
      if (allBucketsAcc != null) {
        allBucketsAcc.collect(doc, arrIdx);
      }
    }
  }

}

@@ -532,6 +532,8 @@ class FacetFieldParser extends FacetParser<FacetField> {
    facet.method = FacetField.FacetMethod.fromString(getString(m, "method", null));
    facet.cacheDf = (int)getLong(m, "cacheDf", facet.cacheDf);

    facet.perSeg = (Boolean)m.get("perSeg");

    // facet.sort may depend on a facet stat...
    // should we be parsing / validating this here, or in the execution environment?
    Object o = m.get("facet");

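The perSeg flag parsed above is the experimental override added to FacetField in this commit; per its comment it forces per-segment collection for the dv method and exists for testing purposes only. A request exercising it might look like the following sketch, again with placeholder collection, facet, and field names:

     json.facet={
       categories : {
         type : terms,
         field : cat,
         method : dv,
         perSeg : true
       }
     }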