LUCENE-10585: Scrub copy/paste code in the facets module and attempt to simplify a bit (#915)

This commit is contained in:
Greg Miller 2022-05-29 01:26:51 -07:00 committed by GitHub
parent 3a80968ddf
commit 8db1e41fc0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 544 additions and 1006 deletions

View File

@ -75,7 +75,9 @@ New Features
Improvements
---------------------
(No changes)
* LUCENE-10585: Facet module code cleanup (copy/paste scrubbing, simplification and some very minor
optimization tweaks). (Greg Miller)
Optimizations
---------------------

View File

@ -51,9 +51,13 @@ public abstract class Facets {
/**
* Returns labels for topN dimensions and their topNChildren sorted by the number of
* hits/aggregated values that dimension matched; Results should be the same as calling getAllDims
* and then only using the first topNDims; Sub-classes may want to override this implementation
* with a more efficient one if they are able.
* hits/aggregated values that dimension matched. Results should be the same as calling getAllDims
* and then only using the first topNDims. Note that dims should be configured as requiring dim
* counts if using this functionality to ensure accurate counts are available (see: {@link
* FacetsConfig#setRequireDimCount(String, boolean)}).
*
* <p>Sub-classes may want to override this implementation with a more efficient one if they are
* able.
*/
public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
List<FacetResult> allResults = getAllDims(topNChildren);

View File

@ -0,0 +1,349 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.sortedset;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PrimitiveIterator;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.DimTree;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
/**
 * Base class for SSDV faceting implementations.
 *
 * <p>Holds the reader state shared by the concrete counters and implements top-children, all-dims
 * and top-dims retrieval on top of a single primitive, {@link #getCount(int)}, which sub-classes
 * supply.
 */
abstract class AbstractSortedSetDocValueFacetCounts extends Facets {

  /** Orders facet results by descending value, breaking ties by ascending dimension name. */
  private static final Comparator<FacetResult> FACET_RESULT_COMPARATOR =
      (a, b) -> {
        int byValue = Integer.compare(b.value.intValue(), a.value.intValue());
        return byValue != 0 ? byValue : a.dim.compareTo(b.dim);
      };

  final SortedSetDocValuesReaderState state;
  final FacetsConfig stateConfig;
  final SortedSetDocValues dv;
  final String field;

  /** Captures the field, facets config and doc values exposed by the provided reader state. */
  AbstractSortedSetDocValueFacetCounts(SortedSetDocValuesReaderState state) throws IOException {
    this.state = state;
    this.field = state.getField();
    this.stateConfig = state.getFacetsConfig();
    this.dv = state.getDocValues();
  }

  /**
   * Returns the top-n children of the given dim + path, or null if the path was never indexed or
   * has no children with a positive count.
   */
  @Override
  public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
    validateTopN(topN);
    TopChildrenForPath topChildrenForPath = getTopChildrenForPath(topN, dim, path);
    return createFacetResult(topChildrenForPath, dim, path);
  }

  /**
   * Returns the count for one specific dim + single-component path, or -1 if that term is not
   * present in the doc values.
   *
   * @throws IllegalArgumentException if {@code path} does not have exactly one component
   */
  @Override
  public Number getSpecificValue(String dim, String... path) throws IOException {
    if (path.length != 1) {
      throw new IllegalArgumentException("path must be length=1");
    }
    int ord = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
    if (ord < 0) {
      return -1;
    }
    return getCount(ord);
  }

  /**
   * Returns a result for every dim known to the reader state that has at least one child with a
   * positive count, sorted by descending value and then by dim name.
   */
  @Override
  public List<FacetResult> getAllDims(int topN) throws IOException {
    validateTopN(topN);
    List<FacetResult> results = new ArrayList<>();
    for (String dim : state.getDims()) {
      TopChildrenForPath topChildrenForPath = getTopChildrenForPath(topN, dim);
      FacetResult facetResult = createFacetResult(topChildrenForPath, dim);
      if (facetResult != null) {
        results.add(facetResult);
      }
    }
    // Sort by highest count:
    results.sort(FACET_RESULT_COMPARATOR);
    return results;
  }

  /**
   * Returns results for the {@code topNDims} dims with the highest dim counts (ties broken by dim
   * name), each carrying its {@code topNChildren} top children.
   *
   * <p>Dims that are multi-valued but not configured to require dim counts are ranked with a
   * count of -1, since no accurate count is available for them.
   */
  @Override
  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
    validateTopN(topNDims);
    validateTopN(topNChildren);

    // Creates priority queue to store top dimensions and sort by their aggregated values/hits and
    // string values. The "least" entry (lowest value, then greatest dim name) sits on top so it is
    // the first to be replaced.
    PriorityQueue<DimValue> pq =
        new PriorityQueue<>(topNDims) {
          @Override
          protected boolean lessThan(DimValue a, DimValue b) {
            if (a.value > b.value) {
              return false;
            } else if (a.value < b.value) {
              return true;
            } else {
              return a.dim.compareTo(b.dim) > 0;
            }
          }
        };

    // Keep track of intermediate results, if we compute them, so we can reuse them later:
    Map<String, TopChildrenForPath> intermediateResults = null;

    for (String dim : state.getDims()) {
      DimConfig dimConfig = stateConfig.getDimConfig(dim);
      int dimCount;
      if (dimConfig.hierarchical) {
        // For hierarchical dims, we directly index each level of the ancestry path (i.e., we
        // "rollup" at indexing time), meaning we can directly access accurate dim counts without
        // needing to rollup the descendant paths:
        int dimOrd = state.getDimTree(dim).dimStartOrd;
        dimCount = getCount(dimOrd);
      } else {
        OrdRange ordRange = state.getOrdRange(dim);
        int dimOrd = ordRange.start;
        if (dimConfig.multiValued) {
          if (dimConfig.requireDimCount) {
            // If a dim is configured as multi-valued and requires dim count, we index dim counts
            // directly, so we can access accurate counts without needing to rollup children:
            dimCount = getCount(dimOrd);
          } else {
            // If a dim is configured as multi-valued but _not_ requiring dim count, we have no
            // way to get accurate counts. We use -1 to indicate this:
            dimCount = -1;
          }
        } else {
          // If a dim is single-valued, we must aggregate child counts to get accurate dim counts.
          // We don't index the dim counts directly:
          // TODO: If getTopDims becomes a common use-case, we could consider always indexing dim
          // counts to optimize this path.
          PrimitiveIterator.OfInt childIt = ordRange.iterator();
          TopChildrenForPath topChildrenForPath =
              computeTopChildren(childIt, topNChildren, dimConfig, dimOrd);
          if (intermediateResults == null) {
            intermediateResults = new HashMap<>();
          }
          intermediateResults.put(dim, topChildrenForPath);
          dimCount = topChildrenForPath.pathCount();
        }
      }

      if (dimCount != 0) {
        if (pq.size() < topNDims) {
          pq.add(new DimValue(dim, dimCount));
        } else {
          // Queue is full: overwrite the current bottom entry in place if this dim beats it.
          DimValue bottomDim = pq.top();
          if (dimCount > bottomDim.value
              || (dimCount == bottomDim.value && dim.compareTo(bottomDim.dim) < 0)) {
            bottomDim.dim = dim;
            bottomDim.value = dimCount;
            pq.updateTop();
          }
        }
      }
    }

    // Entries pop in ascending order, so fill the result array from the back:
    int resultSize = pq.size();
    FacetResult[] results = new FacetResult[resultSize];

    while (pq.size() > 0) {
      DimValue dimValue = pq.pop();
      assert dimValue != null;
      TopChildrenForPath topChildrenForPath = null;
      if (intermediateResults != null) {
        topChildrenForPath = intermediateResults.get(dimValue.dim);
      }
      if (topChildrenForPath == null) {
        topChildrenForPath = getTopChildrenForPath(topNChildren, dimValue.dim);
      }
      FacetResult facetResult = createFacetResult(topChildrenForPath, dimValue.dim);
      // should not be null since only dims with non-zero values were considered earlier
      assert facetResult != null;
      resultSize--;
      results[resultSize] = facetResult;
    }
    return Arrays.asList(results);
  }

  /** Retrieve the count for a specified ordinal. */
  abstract int getCount(int ord);

  /**
   * Compute the top-n children for the given path and iterator of all immediate children of the
   * path. This returns an intermediate result that does the minimal required work, avoiding the
   * cost of looking up string labels, etc.
   *
   * @param childOrds ordinals of the immediate children of the path
   * @param topN maximum number of children to retain
   * @param dimConfig configuration of the dimension the path belongs to
   * @param pathOrd ordinal of the path itself, used to read its count when one is directly
   *     available
   */
  TopChildrenForPath computeTopChildren(
      PrimitiveIterator.OfInt childOrds, int topN, DimConfig dimConfig, int pathOrd) {
    TopOrdAndIntQueue q = null;
    int bottomCount = 0;
    int pathCount = 0;
    int childCount = 0;

    TopOrdAndIntQueue.OrdAndValue reuse = null;
    while (childOrds.hasNext()) {
      int ord = childOrds.next();
      int count = getCount(ord);
      if (count > 0) {
        pathCount += count;
        childCount++;
        if (count > bottomCount) {
          if (reuse == null) {
            reuse = new TopOrdAndIntQueue.OrdAndValue();
          }
          reuse.ord = ord;
          reuse.value = count;
          if (q == null) {
            // Lazy init, so we don't create this for the
            // sparse case unnecessarily
            q = new TopOrdAndIntQueue(topN);
          }
          reuse = q.insertWithOverflow(reuse);
          if (q.size() == topN) {
            bottomCount = q.top().value;
          }
        }
      }
    }

    // The summed child counts are only used as-is for single-valued, non-hierarchical dims.
    // Otherwise the path count is read directly, or marked unknown (-1) when it isn't available:
    if (dimConfig.hierarchical) {
      pathCount = getCount(pathOrd);
    } else if (dimConfig.multiValued) {
      pathCount = dimConfig.requireDimCount ? getCount(pathOrd) : -1;
    }

    return new TopChildrenForPath(pathCount, childCount, q);
  }

  /**
   * Determine the top-n children for a specified dimension + path. Results are in an intermediate
   * form. Returns null if the dimension or path was never indexed.
   *
   * @throws IllegalArgumentException if a non-empty path is given for a dimension that is not
   *     configured as hierarchical
   */
  TopChildrenForPath getTopChildrenForPath(int topN, String dim, String... path)
      throws IOException {
    FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);

    // Determine the path ord and resolve an iterator to its immediate children. The logic for this
    // depends on whether-or-not the dimension is configured as hierarchical:
    final int pathOrd;
    final PrimitiveIterator.OfInt childIterator;
    if (dimConfig.hierarchical) {
      DimTree dimTree = state.getDimTree(dim);
      if (dimTree == null) {
        // NOTE(review): presumably getDimTree yields null for a dimension that was never indexed,
        // mirroring the getOrdRange check in the non-hierarchical branch -- confirm. Guarding here
        // avoids a NullPointerException on dimStartOrd below.
        return null;
      }
      if (path.length > 0) {
        pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
      } else {
        // If there's no path, this is a little more efficient to just look up the dim:
        pathOrd = dimTree.dimStartOrd;
      }
      if (pathOrd < 0) {
        // path was never indexed
        return null;
      }
      childIterator = dimTree.iterator(pathOrd);
    } else {
      if (path.length > 0) {
        throw new IllegalArgumentException(
            "Field is not configured as hierarchical, path should be 0 length");
      }
      OrdRange ordRange = state.getOrdRange(dim);
      if (ordRange == null) {
        // means dimension was never indexed
        return null;
      }
      pathOrd = ordRange.start;
      childIterator = ordRange.iterator();
      if (dimConfig.multiValued && dimConfig.requireDimCount) {
        // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
        // the dimension and we need to skip past it so the iterator is positioned on the first
        // child:
        childIterator.next();
      }
    }

    // Compute the actual results:
    return computeTopChildren(childIterator, topN, dimConfig, pathOrd);
  }

  /**
   * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work
   * of resolving ordinals -> labels, etc. Will return null if there are no children.
   *
   * <p>Note that this drains the queue held by {@code topChildrenForPath}.
   */
  FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... path)
      throws IOException {
    // If the intermediate result is null or there are no children, we return null:
    if (topChildrenForPath == null || topChildrenForPath.childCount() == 0) {
      return null;
    }

    TopOrdAndIntQueue q = topChildrenForPath.q();
    assert q != null;

    // The queue pops its lowest entry first, so fill the label array from the back:
    LabelAndValue[] labelValues = new LabelAndValue[q.size()];
    for (int i = labelValues.length - 1; i >= 0; i--) {
      TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
      assert ordAndValue != null;
      final BytesRef term = dv.lookupOrd(ordAndValue.ord);
      String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
      // Only the last path component is used as the child's label:
      labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
    }

    return new FacetResult(
        dim, path, topChildrenForPath.pathCount(), labelValues, topChildrenForPath.childCount());
  }

  /**
   * Intermediate result to store top children for a given path before resolving labels, etc.
   *
   * @param pathCount count for the path itself, or -1 when no accurate count is available
   * @param childCount number of immediate children with a positive count
   * @param q queue of the retained top children; null if no child had a positive count
   */
  record TopChildrenForPath(int pathCount, int childCount, TopOrdAndIntQueue q) {}

  /** Mutable dim name + value pair, reused in place while selecting the top dims. */
  static final class DimValue {
    String dim;
    int value;

    DimValue(String dim, int value) {
      this.dim = dim;
      this.value = value;
    }
  }
}

View File

@ -19,25 +19,15 @@ package org.apache.lucene.facet.sortedset;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.PrimitiveIterator;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicIntegerArray;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetUtils;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
@ -52,27 +42,19 @@ import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.PriorityQueue;
/**
* Like {@link SortedSetDocValuesFacetCounts}, but aggregates counts concurrently across segments.
*
* @lucene.experimental
*/
public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
public class ConcurrentSortedSetDocValuesFacetCounts extends AbstractSortedSetDocValueFacetCounts {
final ExecutorService exec;
final SortedSetDocValuesReaderState state;
final FacetsConfig stateConfig;
final SortedSetDocValues dv;
final String field;
final AtomicIntegerArray counts;
private static final String[] emptyPath = new String[0];
/** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. */
public ConcurrentSortedSetDocValuesFacetCounts(
SortedSetDocValuesReaderState state, ExecutorService exec)
@ -84,11 +66,8 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
public ConcurrentSortedSetDocValuesFacetCounts(
SortedSetDocValuesReaderState state, FacetsCollector hits, ExecutorService exec)
throws IOException, InterruptedException {
this.state = state;
this.field = state.getField();
this.stateConfig = state.getFacetsConfig();
super(state);
this.exec = exec;
dv = state.getDocValues();
counts = new AtomicIntegerArray(state.getSize());
if (hits == null) {
// browse only
@ -99,181 +78,8 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
}
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
validateTopN(topN);
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
if (pathOrd < 0) {
// path was never indexed
return null;
}
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
} else {
if (path.length > 0) {
throw new IllegalArgumentException(
"Field is not configured as hierarchical, path should be 0 length");
}
OrdRange ordRange = state.getOrdRange(dim);
if (ordRange == null) {
// means dimension was never indexed
return null;
}
int dimOrd = ordRange.start;
PrimitiveIterator.OfInt childIt = ordRange.iterator();
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
// the dimension and we need to skip past it so the iterator is positioned on the first
// child:
childIt.next();
}
return getPathResult(dimConfig, dim, null, dimOrd, childIt, topN);
}
}
/**
* Overloaded method to allow getPathResult be called without passing in the dimToChildOrdsResult
* parameter
*/
private FacetResult getPathResult(
FacetsConfig.DimConfig dimConfig,
String dim,
String[] path,
int pathOrd,
PrimitiveIterator.OfInt childOrds,
int topN)
throws IOException {
return getPathResult(dimConfig, dim, path, pathOrd, childOrds, topN, null);
}
/** Returns path results for a dimension */
private FacetResult getPathResult(
FacetsConfig.DimConfig dimConfig,
String dim,
String[] path,
int pathOrd,
PrimitiveIterator.OfInt childOrds,
int topN,
ChildOrdsResult dimToChildOrdsResult)
throws IOException {
ChildOrdsResult childOrdsResult;
// if getTopDims is called, get results from previously stored dimToChildOrdsResult, otherwise
// call getChildOrdsResult to get dimCount, childCount and the queue for the dimension's top
// children
if (dimToChildOrdsResult != null) {
childOrdsResult = dimToChildOrdsResult;
} else {
childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, pathOrd);
}
if (childOrdsResult.q == null) {
return null;
}
LabelAndValue[] labelValues = getLabelValuesFromTopOrdAndIntQueue(childOrdsResult.q);
if (dimConfig.hierarchical == true) {
return new FacetResult(
dim, path, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount);
} else {
return new FacetResult(
dim, emptyPath, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount);
}
}
/**
* Returns ChildOrdsResult that contains results of dimCount, childCount, and the queue for the
* dimension's top children to populate FacetResult in getPathResult.
*/
private ChildOrdsResult getChildOrdsResult(
PrimitiveIterator.OfInt childOrds, int topN, FacetsConfig.DimConfig dimConfig, int pathOrd) {
TopOrdAndIntQueue q = null;
int bottomCount = 0;
int dimCount = 0;
int childCount = 0;
TopOrdAndIntQueue.OrdAndValue reuse = null;
while (childOrds.hasNext()) {
int ord = childOrds.next();
if (counts.get(ord) > 0) {
dimCount += counts.get(ord);
childCount++;
if (counts.get(ord) > bottomCount) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = ord;
reuse.value = counts.get(ord);
if (q == null) {
// Lazy init, so we don't create this for the
// sparse case unnecessarily
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
}
}
}
}
if (dimConfig.hierarchical == true) {
dimCount = counts.get(pathOrd);
} else {
// see if dimCount is actually reliable or needs to be reset
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
dimCount = counts.get(pathOrd);
} else {
dimCount = -1; // dimCount is inaccurate at this point, so set it to -1
}
}
}
return new ChildOrdsResult(dimCount, childCount, q);
}
/** Returns label values for dims. */
private LabelAndValue[] getLabelValuesFromTopOrdAndIntQueue(TopOrdAndIntQueue q)
throws IOException {
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
assert ordAndValue != null;
final BytesRef term = dv.lookupOrd(ordAndValue.ord);
String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
}
return labelValues;
}
/** Returns value/count of a dimension. */
private int getDimValue(
FacetsConfig.DimConfig dimConfig,
String dim,
int dimOrd,
PrimitiveIterator.OfInt childOrds,
int topN,
HashMap<String, ChildOrdsResult> dimToChildOrdsResult) {
// if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at
// indexing time, return dimCount directly
if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) {
return counts.get(dimOrd);
}
// if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount
ChildOrdsResult childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, dimOrd);
// if no early termination, store dim and childOrdsResult into a hashmap to avoid calling
// getChildOrdsResult again in getPathResult
dimToChildOrdsResult.put(dim, childOrdsResult);
return childOrdsResult.dimCount;
int getCount(int ord) {
return counts.get(ord);
}
private class CountOneSegment implements Callable<Void> {
@ -535,195 +341,4 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
}
}
}
@Override
public Number getSpecificValue(String dim, String... path) throws IOException {
if (path.length != 1) {
throw new IllegalArgumentException("path must be length=1");
}
int ord = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
if (ord < 0) {
return -1;
}
return counts.get(ord);
}
/**
* Overloaded method to allow getFacetResultForDim be called without passing in the
* dimToChildOrdsResult parameter
*/
private FacetResult getFacetResultForDim(String dim, int topNChildren) throws IOException {
return getFacetResultForDim(dim, topNChildren, null);
}
/** Returns FacetResult for a dimension. */
private FacetResult getFacetResultForDim(
String dim, int topNChildren, ChildOrdsResult dimToChildOrdsResult) throws IOException {
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
int dimOrd = dimTree.dimStartOrd;
return getPathResult(
dimConfig,
dim,
emptyPath,
dimOrd,
dimTree.iterator(),
topNChildren,
dimToChildOrdsResult);
} else {
OrdRange ordRange = state.getOrdRange(dim);
int dimOrd = ordRange.start;
PrimitiveIterator.OfInt childIt = ordRange.iterator();
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
// the dimension and we need to skip past it so the iterator is positioned on the first
// child:
childIt.next();
}
return getPathResult(
dimConfig, dim, emptyPath, dimOrd, childIt, topNChildren, dimToChildOrdsResult);
}
}
@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
validateTopN(topN);
List<FacetResult> results = new ArrayList<>();
for (String dim : state.getDims()) {
FacetResult factResult = getFacetResultForDim(dim, topN);
if (factResult != null) {
results.add(factResult);
}
}
// Sort by highest count:
Collections.sort(
results,
new Comparator<FacetResult>() {
@Override
public int compare(FacetResult a, FacetResult b) {
if (a.value.intValue() > b.value.intValue()) {
return -1;
} else if (b.value.intValue() > a.value.intValue()) {
return 1;
} else {
return a.dim.compareTo(b.dim);
}
}
});
return results;
}
@Override
public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
if (topNDims <= 0 || topNChildren <= 0) {
throw new IllegalArgumentException("topN must be > 0");
}
// Creates priority queue to store top dimensions and sort by their aggregated values/hits and
// string values.
PriorityQueue<DimValueResult> pq =
new PriorityQueue<>(topNDims) {
@Override
protected boolean lessThan(DimValueResult a, DimValueResult b) {
if (a.value > b.value) {
return false;
} else if (a.value < b.value) {
return true;
} else {
return a.dim.compareTo(b.dim) > 0;
}
}
};
HashMap<String, ChildOrdsResult> dimToChildOrdsResult = new HashMap<>();
int dimCount;
for (String dim : state.getDims()) {
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
int dimOrd = dimTree.dimStartOrd;
// get dim value
dimCount =
getDimValue(
dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, dimToChildOrdsResult);
} else {
OrdRange ordRange = state.getOrdRange(dim);
int dimOrd = ordRange.start;
PrimitiveIterator.OfInt childIt = ordRange.iterator();
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
// the dimension and we need to skip past it so the iterator is positioned on the first
// child:
childIt.next();
}
dimCount = getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, dimToChildOrdsResult);
}
if (dimCount != 0) {
// use priority queue to store DimValueResult for topNDims
if (pq.size() < topNDims) {
pq.add(new DimValueResult(dim, dimCount));
} else {
if (dimCount > pq.top().value
|| (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
DimValueResult bottomDim = pq.top();
bottomDim.dim = dim;
bottomDim.value = dimCount;
pq.updateTop();
}
}
}
}
// get FacetResult for topNDims
int resultSize = pq.size();
FacetResult[] results = new FacetResult[resultSize];
while (pq.size() > 0) {
DimValueResult dimValueResult = pq.pop();
FacetResult facetResult =
getFacetResultForDim(
dimValueResult.dim, topNChildren, dimToChildOrdsResult.get(dimValueResult.dim));
resultSize--;
results[resultSize] = facetResult;
}
return Arrays.asList(results);
}
/**
* Creates ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top
* children
*/
private static class ChildOrdsResult {
final int dimCount;
final int childCount;
final TopOrdAndIntQueue q;
ChildOrdsResult(int dimCount, int childCount, TopOrdAndIntQueue q) {
this.dimCount = dimCount;
this.childCount = childCount;
this.q = q;
}
}
/**
* Creates DimValueResult to store the label and value of dim in order to sort by these two
* fields.
*/
private static class DimValueResult {
String dim;
int value;
DimValueResult(String dim, int value) {
this.dim = dim;
this.value = value;
}
}
}

View File

@ -17,23 +17,12 @@
package org.apache.lucene.facet.sortedset;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.PrimitiveIterator;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetUtils;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.DimTree;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
@ -48,9 +37,7 @@ import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.PriorityQueue;
/**
* Compute facets counts from previously indexed {@link SortedSetDocValuesFacetField}, without
@ -70,16 +57,9 @@ import org.apache.lucene.util.PriorityQueue;
*
* @lucene.experimental
*/
public class SortedSetDocValuesFacetCounts extends Facets {
final SortedSetDocValuesReaderState state;
final FacetsConfig stateConfig;
final SortedSetDocValues dv;
final String field;
public class SortedSetDocValuesFacetCounts extends AbstractSortedSetDocValueFacetCounts {
final int[] counts;
private static final String[] emptyPath = new String[0];
/** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. */
public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state) throws IOException {
this(state, null);
@ -88,10 +68,7 @@ public class SortedSetDocValuesFacetCounts extends Facets {
/** Counts all facet dimensions across the provided hits. */
public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state, FacetsCollector hits)
throws IOException {
this.state = state;
this.field = state.getField();
this.stateConfig = state.getFacetsConfig();
this.dv = state.getDocValues();
super(state);
this.counts = new int[state.getSize()];
if (hits == null) {
// browse only
@ -102,180 +79,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {
}
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
validateTopN(topN);
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
if (pathOrd < 0) {
// path was never indexed
return null;
}
DimTree dimTree = state.getDimTree(dim);
return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
} else {
if (path.length > 0) {
throw new IllegalArgumentException(
"Field is not configured as hierarchical, path should be 0 length");
}
OrdRange ordRange = state.getOrdRange(dim);
if (ordRange == null) {
// means dimension was never indexed
return null;
}
int dimOrd = ordRange.start;
PrimitiveIterator.OfInt childIt = ordRange.iterator();
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
// the dimension and we need to skip past it so the iterator is positioned on the first
// child:
childIt.next();
}
return getPathResult(dimConfig, dim, null, dimOrd, childIt, topN);
}
}
/**
* Overloaded method to allow getPathResult be called without passing in the dimToChildOrdsResult
* parameter
*/
private FacetResult getPathResult(
FacetsConfig.DimConfig dimConfig,
String dim,
String[] path,
int pathOrd,
PrimitiveIterator.OfInt childOrds,
int topN)
throws IOException {
return getPathResult(dimConfig, dim, path, pathOrd, childOrds, topN, null);
}
/** Returns path results for a dimension */
private FacetResult getPathResult(
FacetsConfig.DimConfig dimConfig,
String dim,
String[] path,
int pathOrd,
PrimitiveIterator.OfInt childOrds,
int topN,
ChildOrdsResult dimToChildOrdsResult)
throws IOException {
ChildOrdsResult childOrdsResult;
// if getTopDims is called, get results from previously stored dimToChildOrdsResult, otherwise
// call getChildOrdsResult to get dimCount, childCount and the queue for the dimension's top
// children
if (dimToChildOrdsResult != null) {
childOrdsResult = dimToChildOrdsResult;
} else {
childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, pathOrd);
}
if (childOrdsResult.q == null) {
return null;
}
LabelAndValue[] labelValues = getLabelValuesFromTopOrdAndIntQueue(childOrdsResult.q);
if (dimConfig.hierarchical == true) {
return new FacetResult(
dim, path, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount);
} else {
return new FacetResult(
dim, emptyPath, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount);
}
}
/**
 * Aggregates the children of a dimension/path into a {@link ChildOrdsResult}: the dim count, the
 * number of children with a positive count, and a queue holding the top children. The result is
 * used by getPathResult to populate a FacetResult.
 *
 * @param childOrds iterator over the child ordinals to consider
 * @param topN capacity of the top-children queue
 * @param dimConfig configuration of the dimension, used to decide which dim count to report
 * @param pathOrd ordinal whose entry in {@code counts} is used as the dim count when the summed
 *     child counts are not used
 * @return the aggregated result; its queue is {@code null} if no child had a positive count
 */
private ChildOrdsResult getChildOrdsResult(
PrimitiveIterator.OfInt childOrds, int topN, FacetsConfig.DimConfig dimConfig, int pathOrd) {
TopOrdAndIntQueue q = null;
// Smallest value currently held by a full queue; children at or below it are not offered.
int bottomCount = 0;
int dimCount = 0;
int childCount = 0;
// Scratch entry offered to the queue; reassigned from insertWithOverflow's return value so an
// entry handed back by the queue can be reused (presumably the rejected/evicted one — confirm
// against TopOrdAndIntQueue).
TopOrdAndIntQueue.OrdAndValue reuse = null;
while (childOrds.hasNext()) {
int ord = childOrds.next();
if (counts[ord] > 0) {
dimCount += counts[ord];
childCount++;
if (counts[ord] > bottomCount) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = ord;
reuse.value = counts[ord];
if (q == null) {
// Lazy init, so we don't create this for the
// sparse case unnecessarily
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (q.size() == topN) {
// Queue is full: remember the value at its top as the new threshold.
bottomCount = q.top().value;
}
}
}
}
if (dimConfig.hierarchical == true) {
// Hierarchical dims: report the count stored at pathOrd instead of the summed child counts.
dimCount = counts[pathOrd];
} else {
// see if dimCount is actually reliable or needs to be reset
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
dimCount = counts[pathOrd];
} else {
dimCount = -1; // dimCount is inaccurate at this point, so set it to -1
}
}
}
return new ChildOrdsResult(dimCount, childCount, q);
}
/**
 * Drains the given queue into an array of labels and values. Entries are popped one by one and
 * written from the last array slot towards the first, so the array order is the reverse of the
 * pop order. Each label is the last path component of the term looked up for the entry's
 * ordinal.
 */
private LabelAndValue[] getLabelValuesFromTopOrdAndIntQueue(TopOrdAndIntQueue q)
    throws IOException {
  final int size = q.size();
  final LabelAndValue[] result = new LabelAndValue[size];
  int slot = size;
  while (slot > 0) {
    slot--;
    TopOrdAndIntQueue.OrdAndValue entry = q.pop();
    assert entry != null;
    String[] components = FacetsConfig.stringToPath(dv.lookupOrd(entry.ord).utf8ToString());
    String leafLabel = components[components.length - 1];
    result[slot] = new LabelAndValue(leafLabel, entry.value);
  }
  return result;
}
/**
 * Returns the value (count) used to rank a dimension in getTopDims.
 *
 * <p>If the dim is hierarchical, or multi-valued with dim counts required, the value is read
 * directly from {@code counts[dimOrd]}. Otherwise the children are aggregated with
 * getChildOrdsResult and the aggregated result is stored in {@code dimToChildOrdsResult} under
 * {@code dim} so that it does not have to be computed again later.
 */
private int getDimValue(
FacetsConfig.DimConfig dimConfig,
String dim,
int dimOrd,
PrimitiveIterator.OfInt childOrds,
int topN,
HashMap<String, ChildOrdsResult> dimToChildOrdsResult) {
// Hierarchical dims, and multi-valued dims whose dim count was aggregated at indexing time,
// can return the stored count directly.
if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) {
return counts[dimOrd];
}
// The dim count was not aggregated at indexing time: iterate over childOrds to compute it.
ChildOrdsResult childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, dimOrd);
// Cache the aggregated result under the dim name so getPathResult can reuse it instead of
// calling getChildOrdsResult again.
dimToChildOrdsResult.put(dim, childOrdsResult);
return childOrdsResult.dimCount;
/** Returns the entry of {@code counts} stored for the given ordinal. */
int getCount(int ord) {
return counts[ord];
}
// Variant of countOneSegment, that has No Hits or Live Docs
@ -507,193 +312,4 @@ public class SortedSetDocValuesFacetCounts extends Facets {
}
}
}
/**
 * Returns the count recorded for the single child {@code path[0]} of {@code dim}, or -1 when the
 * term for that dim/path is not found.
 *
 * @throws IllegalArgumentException if {@code path} does not contain exactly one component
 */
@Override
public Number getSpecificValue(String dim, String... path) throws IOException {
  if (path.length != 1) {
    throw new IllegalArgumentException("path must be length=1");
  }
  final String fullPath = FacetsConfig.pathToString(dim, path);
  final int ord = (int) dv.lookupTerm(new BytesRef(fullPath));
  if (ord >= 0) {
    return counts[ord];
  }
  // A negative ordinal is treated as "term not present".
  return -1;
}
/**
 * Convenience overload of {@code getFacetResultForDim(String, int, ChildOrdsResult)} for callers
 * that have no previously computed ChildOrdsResult: passes {@code null} for the
 * dimToChildOrdsResult parameter.
 */
private FacetResult getFacetResultForDim(String dim, int topNChildren) throws IOException {
return getFacetResultForDim(dim, topNChildren, null);
}
/**
 * Returns the {@link FacetResult} for a whole dimension (empty path).
 *
 * <p>Hierarchical dims take their ordinal and child iterator from the dim tree; flat dims take
 * them from the dim's ordinal range. {@code dimToChildOrdsResult} is forwarded unchanged to
 * getPathResult and may be {@code null}.
 */
private FacetResult getFacetResultForDim(
    String dim, int topNChildren, ChildOrdsResult dimToChildOrdsResult) throws IOException {
  final FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
  final int dimOrd;
  final PrimitiveIterator.OfInt children;
  if (dimConfig.hierarchical) {
    DimTree tree = state.getDimTree(dim);
    dimOrd = tree.dimStartOrd;
    children = tree.iterator();
  } else {
    OrdRange range = state.getOrdRange(dim);
    dimOrd = range.start;
    children = range.iterator();
    if (dimConfig.multiValued && dimConfig.requireDimCount) {
      // If the dim is multi-valued and requires dim counts, the dimension itself was explicitly
      // indexed; skip past it so the iterator is positioned on the first child.
      children.next();
    }
  }
  return getPathResult(
      dimConfig, dim, emptyPath, dimOrd, children, topNChildren, dimToChildOrdsResult);
}
/**
 * Returns a {@link FacetResult} for every dimension that produced one, each with up to {@code
 * topN} children. Results are ordered by descending value, with ties broken by ascending dim
 * name. Dimensions for which no result could be built are left out.
 */
@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
  validateTopN(topN);

  final List<FacetResult> results = new ArrayList<>();
  for (String dim : state.getDims()) {
    final FacetResult dimResult = getFacetResultForDim(dim, topN);
    if (dimResult == null) {
      continue;
    }
    results.add(dimResult);
  }

  // Highest value first; equal values fall back to the natural order of the dim names.
  Collections.sort(
      results,
      (a, b) -> {
        int byValueDescending = Integer.compare(b.value.intValue(), a.value.intValue());
        return byValueDescending != 0 ? byValueDescending : a.dim.compareTo(b.dim);
      });
  return results;
}
/**
 * Returns the top {@code topNDims} dimensions, each with up to {@code topNChildren} children.
 * Dimensions are ranked by descending dim value, with ties broken by ascending dim name, and are
 * returned in that order.
 *
 * <p>Dimensions with a dim value of 0 are skipped. So are dimensions for which no child has a
 * positive count: multi-valued dims without dim counts always carry the placeholder value -1,
 * so without this check they would be selected and then yield a {@code null} entry in the
 * returned list (getPathResult returns {@code null} when there is no child queue). Skipping
 * them keeps this method consistent with getAllDims, which drops such dimensions.
 *
 * @param topNDims maximum number of dimensions to return
 * @param topNChildren maximum number of children to return per dimension
 */
@Override
public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
  validateTopN(topNDims);
  validateTopN(topNChildren);

  // Queue of the best dims seen so far. "Less than" means a lower value, or an equal value with
  // a lexicographically greater dim name; pq.top() is used below as the entry to replace.
  PriorityQueue<DimValueResult> pq =
      new PriorityQueue<>(topNDims) {
        @Override
        protected boolean lessThan(DimValueResult a, DimValueResult b) {
          if (a.value > b.value) {
            return false;
          } else if (a.value < b.value) {
            return true;
          } else {
            return a.dim.compareTo(b.dim) > 0;
          }
        }
      };

  // Child aggregations computed while ranking dims, kept so they are not computed twice.
  HashMap<String, ChildOrdsResult> dimToChildOrdsResult = new HashMap<>();

  for (String dim : state.getDims()) {
    FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
    int dimOrd;
    PrimitiveIterator.OfInt childIt;
    if (dimConfig.hierarchical) {
      DimTree dimTree = state.getDimTree(dim);
      dimOrd = dimTree.dimStartOrd;
      childIt = dimTree.iterator();
    } else {
      OrdRange ordRange = state.getOrdRange(dim);
      dimOrd = ordRange.start;
      childIt = ordRange.iterator();
      if (dimConfig.multiValued && dimConfig.requireDimCount) {
        // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
        // the dimension and we need to skip past it so the iterator is positioned on the first
        // child:
        childIt.next();
      }
    }
    int dimCount =
        getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, dimToChildOrdsResult);

    // If the children were aggregated and none of them had a positive count, there is nothing
    // to report for this dim, whatever its (possibly placeholder) dim value is.
    ChildOrdsResult computed = dimToChildOrdsResult.get(dim);
    if (computed != null && computed.q == null) {
      dimToChildOrdsResult.remove(dim);
      continue;
    }
    if (dimCount == 0) {
      continue;
    }

    if (pq.size() < topNDims) {
      pq.add(new DimValueResult(dim, dimCount));
    } else if (dimCount > pq.top().value
        || (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
      // Better than the current worst entry: overwrite that entry in place and re-heapify.
      DimValueResult bottomDim = pq.top();
      bottomDim.dim = dim;
      bottomDim.value = dimCount;
      pq.updateTop();
    }
  }

  // Entries pop worst-first, so fill the array from the back to return the best dim first.
  FacetResult[] results = new FacetResult[pq.size()];
  for (int slot = results.length - 1; slot >= 0; slot--) {
    DimValueResult dimValueResult = pq.pop();
    results[slot] =
        getFacetResultForDim(
            dimValueResult.dim, topNChildren, dimToChildOrdsResult.get(dimValueResult.dim));
  }
  return Arrays.asList(results);
}
/**
 * Immutable holder for the outcome of aggregating a dimension's children: the dim count, the
 * number of children counted, and the queue of top children.
 */
private static class ChildOrdsResult {
  /** Count reported for the dimension; producers may store -1 when it is not reliable. */
  final int dimCount;

  /** Number of children that were counted. */
  final int childCount;

  /** Queue of the top children; may be {@code null} when none was created. */
  final TopOrdAndIntQueue q;

  ChildOrdsResult(int dimCount, int childCount, TopOrdAndIntQueue q) {
    this.q = q;
    this.childCount = childCount;
    this.dimCount = dimCount;
  }
}
/**
 * Mutable pair of a dimension name and its value, used as the priority-queue element when
 * ranking dimensions. Both fields are non-final so that an existing queue entry can be
 * overwritten in place.
 */
private static class DimValueResult {
  /** Name of the dimension. */
  String dim;

  /** Value the dimension is ranked by. */
  int value;

  DimValueResult(String dim, int value) {
    this.value = value;
    this.dim = dim;
  }
}
}

View File

@ -39,9 +39,6 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {
/** Per-ordinal value. */
final float[] values;
/** Pass in emptyPath for getTopDims and getAllDims. */
private static final String[] emptyPath = new String[0];
/** Sole constructor. */
FloatTaxonomyFacets(
String indexFieldName,
@ -114,21 +111,15 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {
return null;
}
ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN);
if (childOrdsResult.aggregatedValue == 0) {
return null;
}
LabelAndValue[] labelValues = getLabelValues(childOrdsResult.q, cp.length);
return new FacetResult(
dim, path, childOrdsResult.aggregatedValue, labelValues, childOrdsResult.childCount);
TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN);
return createFacetResult(topChildrenForPath, dim, path);
}
/**
* Return ChildOrdsResult that contains results of aggregatedValue, childCount, and the queue for
* the dimension's top children to populate FacetResult in getPathResult.
* Determine the top-n children for a specified dimension + path. Results are in an intermediate
* form.
*/
private ChildOrdsResult getChildOrdsResult(DimConfig dimConfig, int dimOrd, int topN)
private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN)
throws IOException {
TopOrdAndFloatQueue q = new TopOrdAndFloatQueue(Math.min(taxoReader.getSize(), topN));
@ -137,7 +128,7 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {
int[] children = getChildren();
int[] siblings = getSiblings();
int ord = children[dimOrd];
int ord = children[pathOrd];
float aggregatedValue = 0;
int childCount = 0;
@ -164,61 +155,50 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
aggregatedValue = values[dimOrd];
aggregatedValue = values[pathOrd];
} else {
// Our sum'd count is not correct, in general:
aggregatedValue = -1;
}
}
return new ChildOrdsResult(aggregatedValue, childCount, q);
return new TopChildrenForPath(aggregatedValue, childCount, q);
}
/**
* Return label and values for top dimensions and children
*
* @param q the queue for the dimension's top children
* @param pathLength the length of a dimension's children paths
* Create a FacetResult for the provided dim + path and intermediate results. Does the extra work
* of resolving ordinals -> labels, etc. Will return null if there are no children.
*/
private LabelAndValue[] getLabelValues(TopOrdAndFloatQueue q, int pathLength) throws IOException {
FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... path)
throws IOException {
// If the intermediate result is null or there are no children, we return null:
if (topChildrenForPath == null || topChildrenForPath.childCount == 0) {
return null;
}
TopOrdAndFloatQueue q = topChildrenForPath.childQueue;
assert q != null;
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
int[] ordinals = new int[labelValues.length];
float[] values = new float[labelValues.length];
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop();
assert ordAndValue != null;
ordinals[i] = ordAndValue.ord;
values[i] = ordAndValue.value;
}
FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
// The path component we're interested in is the one immediately after the provided path. We
// add 1 here to also account for the dim:
int childComponentIdx = path.length + 1;
for (int i = 0; i < labelValues.length; i++) {
labelValues[i] = new LabelAndValue(bulkPath[i].components[pathLength], values[i]);
}
return labelValues;
labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]);
}
/** Return value of a dimension. */
private float getDimValue(
FacetsConfig.DimConfig dimConfig,
String dim,
int dimOrd,
int topN,
HashMap<String, ChildOrdsResult> dimToChildOrdsResult)
throws IOException {
// if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at
// indexing time, return dimCount directly
if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) {
return values[dimOrd];
}
// if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount
ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN);
// if no early termination, store dim and childOrdsResult into a hashmap to avoid calling
// getChildOrdsResult again in getTopDims
dimToChildOrdsResult.put(dim, childOrdsResult);
return childOrdsResult.aggregatedValue;
return new FacetResult(
dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount);
}
@Override
@ -232,10 +212,10 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {
// Create priority queue to store top dimensions and sort by their aggregated values/hits and
// string values.
PriorityQueue<DimValueResult> pq =
PriorityQueue<DimValue> pq =
new PriorityQueue<>(topNDims) {
@Override
protected boolean lessThan(DimValueResult a, DimValueResult b) {
protected boolean lessThan(DimValue a, DimValue b) {
if (a.value > b.value) {
return false;
} else if (a.value < b.value) {
@ -246,8 +226,8 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {
}
};
// create hashMap to store the ChildOrdsResult to avoid calling getChildOrdsResult for all dims
HashMap<String, ChildOrdsResult> dimToChildOrdsResult = new HashMap<>();
// Keep track of intermediate results, if we compute them, so we can reuse them later:
Map<String, TopChildrenForPath> intermediateResults = null;
// iterate over children and siblings ordinals for all dims
int ord = children[TaxonomyReader.ROOT_ORDINAL];
@ -255,22 +235,42 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {
String dim = taxoReader.getPath(ord).components[0];
FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
if (dimConfig.indexFieldName.equals(indexFieldName)) {
FacetLabel cp = new FacetLabel(dim, emptyPath);
FacetLabel cp = new FacetLabel(dim);
int dimOrd = taxoReader.getOrdinal(cp);
float dimCount = 0;
// if dimOrd = -1, we skip this dim, else call getDimValue
if (dimOrd != -1) {
dimCount = getDimValue(dimConfig, dim, dimOrd, topNChildren, dimToChildOrdsResult);
if (dimCount != 0) {
// use priority queue to store DimValueResult for topNDims
if (pq.size() < topNDims) {
pq.add(new DimValueResult(dim, dimOrd, dimCount));
float dimValue;
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
// If the dim is configured as multi-valued and requires dim counts, we can access
// an accurate count for the dim computed at indexing time:
dimValue = values[dimOrd];
} else {
if (dimCount > pq.top().value
|| (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
DimValueResult bottomDim = pq.top();
// If the dim is configured as multi-valued but not requiring dim counts, we cannot
// compute an accurate dim count, and use -1 as a place-holder:
dimValue = -1;
}
} else {
// Single-valued dims require aggregating descendant paths to get accurate dim counts
// since we don't directly access ancestry paths:
// TODO: We could consider indexing dim counts directly if getTopDims is a common
// use-case.
TopChildrenForPath topChildrenForPath =
getTopChildrenForPath(dimConfig, dimOrd, topNChildren);
if (intermediateResults == null) {
intermediateResults = new HashMap<>();
}
intermediateResults.put(dim, topChildrenForPath);
dimValue = topChildrenForPath.pathValue;
}
if (dimValue != 0) {
if (pq.size() < topNDims) {
pq.add(new DimValue(dim, dimOrd, dimValue));
} else {
if (dimValue > pq.top().value
|| (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
DimValue bottomDim = pq.top();
bottomDim.dim = dim;
bottomDim.value = dimCount;
bottomDim.value = dimValue;
pq.updateTop();
}
}
@ -280,63 +280,40 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets {
ord = siblings[ord];
}
// use fixed-size array to reduce space usage
FacetResult[] results = new FacetResult[pq.size()];
while (pq.size() > 0) {
DimValueResult dimValueResult = pq.pop();
String dim = dimValueResult.dim;
ChildOrdsResult childOrdsResult;
// if the childOrdsResult was stored in the map, avoid calling getChildOrdsResult again
if (dimToChildOrdsResult.containsKey(dim)) {
childOrdsResult = dimToChildOrdsResult.get(dim);
} else {
FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
childOrdsResult = getChildOrdsResult(dimConfig, dimValueResult.dimOrd, topNChildren);
DimValue dimValue = pq.pop();
assert dimValue != null;
String dim = dimValue.dim;
TopChildrenForPath topChildrenForPath = null;
if (intermediateResults != null) {
topChildrenForPath = intermediateResults.get(dim);
}
// FacetResult requires String[] path, and path is always empty for getTopDims.
// pathLength is always equal to 1 when FacetLabel is constructed with
// FacetLabel(dim, emptyPath), and therefore, 1 is passed in when calling getLabelValues
FacetResult facetResult =
new FacetResult(
dimValueResult.dim,
emptyPath,
dimValueResult.value,
getLabelValues(childOrdsResult.q, 1),
childOrdsResult.childCount);
if (topChildrenForPath == null) {
FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren);
}
FacetResult facetResult = createFacetResult(topChildrenForPath, dim);
assert facetResult != null;
results[pq.size()] = facetResult;
}
return Arrays.asList(results);
}
/**
* Create DimValueResult to store the label, dim ordinal and dim count of a dim in priority queue
*/
private static class DimValueResult {
private static class DimValue {
String dim;
int dimOrd;
float value;
DimValueResult(String dim, int dimOrd, float value) {
DimValue(String dim, int dimOrd, float value) {
this.dim = dim;
this.dimOrd = dimOrd;
this.value = value;
}
}
/**
* Create ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top
* children
*/
private static class ChildOrdsResult {
final float aggregatedValue;
final int childCount;
final TopOrdAndFloatQueue q;
ChildOrdsResult(float aggregatedValue, int childCount, TopOrdAndFloatQueue q) {
this.aggregatedValue = aggregatedValue;
this.childCount = childCount;
this.q = q;
}
}
/**
 * Intermediate result holding the top children for a given path before ordinals are resolved to
 * labels.
 *
 * @param pathValue value reported for the path itself; producers use -1 as a placeholder when
 *     the dim is multi-valued and does not require dim counts
 * @param childCount number of children that were counted for the path
 * @param childQueue queue holding the top children (ordinal and value)
 */
private record TopChildrenForPath(
float pathValue, int childCount, TopOrdAndFloatQueue childQueue) {}
}

View File

@ -44,9 +44,6 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
/** Sparse ordinal values. */
final IntIntHashMap sparseValues;
/** Pass in emptyPath for getTopDims and getAllDims. */
private static final String[] emptyPath = new String[0];
/** Sole constructor. */
IntTaxonomyFacets(
String indexFieldName,
@ -176,46 +173,15 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
return null;
}
ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN);
if (childOrdsResult.q == null || childOrdsResult.aggregatedValue == 0) {
return null;
}
LabelAndValue[] labelValues = getLabelValues(childOrdsResult.q, cp.length);
return new FacetResult(
dim, path, childOrdsResult.aggregatedValue, labelValues, childOrdsResult.childCount);
TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN);
return createFacetResult(topChildrenForPath, dim, path);
}
/**
* Return label and values for top dimensions and children
*
* @param q the queue for the dimension's top children
* @param pathLength the length of a dimension's children paths
* Determine the top-n children for a specified dimension + path. Results are in an intermediate
* form.
*/
private LabelAndValue[] getLabelValues(TopOrdAndIntQueue q, int pathLength) throws IOException {
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
int[] ordinals = new int[labelValues.length];
int[] values = new int[labelValues.length];
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
ordinals[i] = ordAndValue.ord;
values[i] = ordAndValue.value;
}
FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
for (int i = 0; i < labelValues.length; i++) {
labelValues[i] = new LabelAndValue(bulkPath[i].components[pathLength], values[i]);
}
return labelValues;
}
/**
* Return ChildOrdsResult that contains results of dimCount, childCount, and the queue for the
* dimension's top children to populate FacetResult in getPathResult.
*/
private ChildOrdsResult getChildOrdsResult(DimConfig dimConfig, int dimOrd, int topN)
private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN)
throws IOException {
TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN));
int bottomValue = 0;
@ -230,7 +196,7 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
for (IntIntCursor c : sparseValues) {
int value = c.value;
int ord = c.key;
if (parents[ord] == dimOrd && value > 0) {
if (parents[ord] == pathOrd && value > 0) {
aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value);
childCount++;
if (value > bottomValue) {
@ -249,7 +215,7 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
} else {
int[] children = getChildren();
int[] siblings = getSiblings();
int ord = children[dimOrd];
int ord = children[pathOrd];
while (ord != TaxonomyReader.INVALID_ORDINAL) {
int value = values[ord];
if (value > 0) {
@ -273,38 +239,14 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
aggregatedValue = getValue(dimOrd);
aggregatedValue = getValue(pathOrd);
} else {
// Our sum'd value is not correct, in general:
aggregatedValue = -1;
}
}
return new ChildOrdsResult(aggregatedValue, childCount, q);
}
/** Return value/count of a dimension. */
private int getDimValue(
FacetsConfig.DimConfig dimConfig,
String dim,
int dimOrd,
int topN,
HashMap<String, ChildOrdsResult> dimToChildOrdsResult)
throws IOException {
// if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at
// indexing time, return dimCount directly
if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) {
return getValue(dimOrd);
}
// if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount
ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN);
// if no early termination, store dim and childOrdsResult into a hashmap to avoid calling
// getChildOrdsResult again in getTopDims
dimToChildOrdsResult.put(dim, childOrdsResult);
return childOrdsResult.aggregatedValue;
return new TopChildrenForPath(aggregatedValue, childCount, q);
}
@Override
@ -319,10 +261,10 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
// Create priority queue to store top dimensions and sort by their aggregated values/hits and
// string values.
PriorityQueue<DimValueResult> pq =
PriorityQueue<DimValue> pq =
new PriorityQueue<>(topNDims) {
@Override
protected boolean lessThan(DimValueResult a, DimValueResult b) {
protected boolean lessThan(DimValue a, DimValue b) {
if (a.value > b.value) {
return false;
} else if (a.value < b.value) {
@ -333,8 +275,8 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
}
};
// create hashMap to store the ChildOrdsResult to avoid calling getChildOrdsResult for all dims
HashMap<String, ChildOrdsResult> dimToChildOrdsResult = new HashMap<>();
// Keep track of intermediate results, if we compute them, so we can reuse them later:
Map<String, TopChildrenForPath> intermediateResults = null;
// iterate over children and siblings ordinals for all dims
int ord = children[TaxonomyReader.ROOT_ORDINAL];
@ -342,21 +284,42 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
String dim = taxoReader.getPath(ord).components[0];
FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
if (dimConfig.indexFieldName.equals(indexFieldName)) {
FacetLabel cp = new FacetLabel(dim, emptyPath);
FacetLabel cp = new FacetLabel(dim);
int dimOrd = taxoReader.getOrdinal(cp);
// if dimOrd = -1, we skip this dim, else call getDimValue
if (dimOrd != -1) {
int dimCount = getDimValue(dimConfig, dim, dimOrd, topNChildren, dimToChildOrdsResult);
if (dimCount != 0) {
// use priority queue to store DimValueResult for topNDims
if (pq.size() < topNDims) {
pq.add(new DimValueResult(dim, dimOrd, dimCount));
int dimValue;
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
// If the dim is configured as multi-valued and requires dim counts, we can access
// an accurate count for the dim computed at indexing time:
dimValue = getValue(dimOrd);
} else {
if (dimCount > pq.top().value
|| (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
DimValueResult bottomDim = pq.top();
// If the dim is configured as multi-valued but not requiring dim counts, we cannot
// compute an accurate dim count, and use -1 as a place-holder:
dimValue = -1;
}
} else {
// Single-valued dims require aggregating descendant paths to get accurate dim counts
// since we don't directly access ancestry paths:
// TODO: We could consider indexing dim counts directly if getTopDims is a common
// use-case.
TopChildrenForPath topChildrenForPath =
getTopChildrenForPath(dimConfig, dimOrd, topNChildren);
if (intermediateResults == null) {
intermediateResults = new HashMap<>();
}
intermediateResults.put(dim, topChildrenForPath);
dimValue = topChildrenForPath.pathValue;
}
if (dimValue != 0) {
if (pq.size() < topNDims) {
pq.add(new DimValue(dim, dimOrd, dimValue));
} else {
if (dimValue > pq.top().value
|| (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
DimValue bottomDim = pq.top();
bottomDim.dim = dim;
bottomDim.value = dimCount;
bottomDim.value = dimValue;
pq.updateTop();
}
}
@ -366,63 +329,76 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
ord = siblings[ord];
}
// use fixed-size array to reduce space usage
FacetResult[] results = new FacetResult[pq.size()];
while (pq.size() > 0) {
DimValueResult dimValueResult = pq.pop();
String dim = dimValueResult.dim;
ChildOrdsResult childOrdsResult;
// if the childOrdsResult was stored in the map, avoid calling getChildOrdsResult again
if (dimToChildOrdsResult.containsKey(dim)) {
childOrdsResult = dimToChildOrdsResult.get(dim);
} else {
FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
childOrdsResult = getChildOrdsResult(dimConfig, dimValueResult.dimOrd, topNChildren);
DimValue dimValue = pq.pop();
assert dimValue != null;
String dim = dimValue.dim;
TopChildrenForPath topChildrenForPath = null;
if (intermediateResults != null) {
topChildrenForPath = intermediateResults.get(dim);
}
// FacetResult requires String[] path, and path is always empty for getTopDims.
// pathLength is always equal to 1 when FacetLabel is constructed with
// FacetLabel(dim, emptyPath), and therefore, 1 is passed in when calling getLabelValues
FacetResult facetResult =
new FacetResult(
dimValueResult.dim,
emptyPath,
dimValueResult.value,
getLabelValues(childOrdsResult.q, 1),
childOrdsResult.childCount);
if (topChildrenForPath == null) {
FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren);
}
FacetResult facetResult = createFacetResult(topChildrenForPath, dim);
assert facetResult != null;
results[pq.size()] = facetResult;
}
return Arrays.asList(results);
}
/**
* Create DimValueResult to store the label, dim ordinal and dim count of a dim in priority queue
* Create a FacetResult for the provided dim + path and intermediate results. Does the extra work
* of resolving ordinals -> labels, etc. Will return null if there are no children.
*/
private static class DimValueResult {
FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... path)
    throws IOException {
  // Without an intermediate result, or without any counted child, there is nothing to report.
  if (topChildrenForPath == null || topChildrenForPath.childCount == 0) {
    return null;
  }

  TopOrdAndIntQueue q = topChildrenForPath.childQueue;
  assert q != null;

  // Drain the queue back-to-front so the arrays end up in the reverse of the pop order.
  final int size = q.size();
  int[] ordinals = new int[size];
  int[] childValues = new int[size];
  int slot = size;
  while (slot > 0) {
    slot--;
    TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
    assert ordAndValue != null;
    ordinals[slot] = ordAndValue.ord;
    childValues[slot] = ordAndValue.value;
  }

  // Resolve all ordinals to their full paths in a single call.
  FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);

  // The component of interest is the one right after the provided path; the extra 1 accounts
  // for the dim, which is the first component.
  int childComponentIdx = path.length + 1;
  LabelAndValue[] labelValues = new LabelAndValue[size];
  for (int i = 0; i < size; i++) {
    String childLabel = bulkPath[i].components[childComponentIdx];
    labelValues[i] = new LabelAndValue(childLabel, childValues[i]);
  }

  return new FacetResult(
      dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount);
}
private static class DimValue {
String dim;
int dimOrd;
int value;
DimValueResult(String dim, int dimOrd, int value) {
DimValue(String dim, int dimOrd, int value) {
this.dim = dim;
this.dimOrd = dimOrd;
this.value = value;
}
}
/**
* Create ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top
* children
*/
private static class ChildOrdsResult {
final int aggregatedValue;
final int childCount;
final TopOrdAndIntQueue q;
ChildOrdsResult(int aggregatedValue, int childCount, TopOrdAndIntQueue q) {
this.aggregatedValue = aggregatedValue;
this.childCount = childCount;
this.q = q;
}
}
/**
 * Intermediate result holding the top children for a given path before ordinals are resolved to
 * labels.
 *
 * @param pathValue value reported for the path itself; producers use -1 as a placeholder when
 *     the dim is multi-valued and does not require dim counts
 * @param childCount number of children that were counted for the path
 * @param childQueue queue holding the top children (ordinal and value)
 */
private record TopChildrenForPath(int pathValue, int childCount, TopOrdAndIntQueue childQueue) {}
}

View File

@ -19,7 +19,6 @@ package org.apache.lucene.facet.taxonomy;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
@ -156,7 +155,7 @@ abstract class TaxonomyFacets extends Facets {
}
// Sort by highest value, tie break by dim:
Collections.sort(results, BY_VALUE_THEN_DIM);
results.sort(BY_VALUE_THEN_DIM);
return results;
}
}