LUCENE-10488: Optimized getTopDims in ConcurrentSSDVFacetCounts (#777)

This commit is contained in:
Yuting Gan 2022-05-13 15:54:18 -07:00 committed by GitHub
parent 2cca0e8441
commit ef43242d77
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 271 additions and 42 deletions

View File

@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.PrimitiveIterator;
import java.util.concurrent.Callable;
@ -54,6 +55,7 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.PriorityQueue;
/**
* Like {@link SortedSetDocValuesFacetCounts}, but aggregates counts concurrently across segments.
@ -131,6 +133,10 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
}
}
/**
* Overloaded method that allows getPathResult to be called without passing in the
* dimToChildOrdsResult parameter.
*/
private FacetResult getPathResult(
FacetsConfig.DimConfig dimConfig,
String dim,
@ -139,16 +145,59 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
PrimitiveIterator.OfInt childOrds,
int topN)
throws IOException {
return getPathResult(dimConfig, dim, path, pathOrd, childOrds, topN, null);
}
/**
 * Builds the {@link FacetResult} for one path of a dimension.
 *
 * <p>When {@code dimToChildOrdsResult} is non-null it is used as-is (this is the case when {@code
 * getTopDims} has already computed it); otherwise the result is computed here through {@code
 * getChildOrdsResult(childOrds, topN, dimConfig, pathOrd)}.
 *
 * @param dimConfig configuration of the dimension; its {@code hierarchical} flag selects which
 *     path is reported in the result
 * @param dim dimension name
 * @param path path reported for hierarchical dimensions
 * @param pathOrd ordinal handed to {@code getChildOrdsResult} when nothing was precomputed
 * @param childOrds child ordinals handed to {@code getChildOrdsResult} when nothing was
 *     precomputed
 * @param topN number of top children requested
 * @param dimToChildOrdsResult precomputed result for this dimension, or {@code null}
 * @return the facet result, or {@code null} when the child ordinal result carries no queue
 */
private FacetResult getPathResult(
    FacetsConfig.DimConfig dimConfig,
    String dim,
    String[] path,
    int pathOrd,
    PrimitiveIterator.OfInt childOrds,
    int topN,
    ChildOrdsResult dimToChildOrdsResult)
    throws IOException {
  // Prefer the precomputed result; fall back to iterating the child ordinals.
  final ChildOrdsResult resolved =
      dimToChildOrdsResult == null
          ? getChildOrdsResult(childOrds, topN, dimConfig, pathOrd)
          : dimToChildOrdsResult;

  if (resolved.q == null) {
    return null;
  }

  final LabelAndValue[] labelValues = getLabelValuesFromTopOrdAndIntQueue(resolved.q);

  // Non-hierarchical dimensions always report the empty path.
  final String[] reportedPath = dimConfig.hierarchical ? path : emptyPath;
  return new FacetResult(dim, reportedPath, resolved.dimCount, labelValues, resolved.childCount);
}
/**
* Returns ChildOrdsResult that contains results of dimCount, childCount, and the queue for the
* dimension's top children to populate FacetResult in getPathResult.
*/
private ChildOrdsResult getChildOrdsResult(
PrimitiveIterator.OfInt childOrds, int topN, FacetsConfig.DimConfig dimConfig, int pathOrd) {
TopOrdAndIntQueue q = null;
int bottomCount = 0;
int dimCount = 0;
int childCount = 0;
TopOrdAndIntQueue.OrdAndValue reuse = null;
while (childOrds.hasNext()) {
int ord = childOrds.next();
if (counts.get(ord) > 0) {
@ -173,20 +222,9 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
}
}
if (q == null) {
return null;
}
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
assert ordAndValue != null;
final BytesRef term = dv.lookupOrd(ordAndValue.ord);
String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
}
if (dimConfig.hierarchical == false) {
if (dimConfig.hierarchical == true) {
dimCount = counts.get(pathOrd);
} else {
// see if dimCount is actually reliable or needs to be reset
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
@ -195,10 +233,47 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
dimCount = -1; // dimCount is in accurate at this point, so set it to -1
}
}
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
} else {
return new FacetResult(dim, path, counts.get(pathOrd), labelValues, childCount);
}
return new ChildOrdsResult(dimCount, childCount, q);
}
/**
 * Drains {@code q} into an array of labels and values.
 *
 * <p>Entries are popped one by one and written from the last array slot towards the first, so
 * the first entry popped ends up last in the returned array. The queue is empty afterwards. Each
 * label is the last path component of the term looked up for the entry's ordinal.
 *
 * @param q queue of ordinals and values; emptied by this call
 * @return one {@link LabelAndValue} per entry that was in the queue
 */
private LabelAndValue[] getLabelValuesFromTopOrdAndIntQueue(TopOrdAndIntQueue q)
    throws IOException {
  final int size = q.size();
  final LabelAndValue[] labelValues = new LabelAndValue[size];
  int slot = size;
  while (slot > 0) {
    slot--;
    final TopOrdAndIntQueue.OrdAndValue entry = q.pop();
    assert entry != null;
    final String fullPath = dv.lookupOrd(entry.ord).utf8ToString();
    final String[] components = FacetsConfig.stringToPath(fullPath);
    labelValues[slot] = new LabelAndValue(components[components.length - 1], entry.value);
  }
  return labelValues;
}
/**
 * Returns the value (count) used to rank a dimension.
 *
 * <p>For hierarchical dimensions, and for multi-valued dimensions that require a dim count, the
 * value is read directly from {@code counts} at {@code dimOrd}. For every other dimension the
 * value is obtained from {@code getChildOrdsResult}, and that result is stored in {@code
 * dimToChildOrdsResult} under {@code dim} so that it can be reused later instead of being
 * computed a second time.
 *
 * @param dimConfig configuration of the dimension
 * @param dim dimension name, used as the key into {@code dimToChildOrdsResult}
 * @param dimOrd ordinal of the dimension
 * @param childOrds child ordinals of the dimension
 * @param topN number of top children to keep
 * @param dimToChildOrdsResult receives the computed result when the child ordinals are visited
 * @return the value of the dimension
 */
private int getDimValue(
    FacetsConfig.DimConfig dimConfig,
    String dim,
    int dimOrd,
    PrimitiveIterator.OfInt childOrds,
    int topN,
    HashMap<String, ChildOrdsResult> dimToChildOrdsResult) {
  final boolean countReadableAtDimOrd =
      dimConfig.hierarchical || (dimConfig.multiValued && dimConfig.requireDimCount);
  if (countReadableAtDimOrd) {
    return counts.get(dimOrd);
  }

  // Otherwise derive the value from the children and remember the whole result for reuse.
  final ChildOrdsResult computed = getChildOrdsResult(childOrds, topN, dimConfig, dimOrd);
  dimToChildOrdsResult.put(dim, computed);
  return computed.dimCount;
}
private class CountOneSegment implements Callable<Void> {
@ -474,19 +549,31 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
return counts.get(ord);
}
@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
validateTopN(topN);
List<FacetResult> results = new ArrayList<>();
for (String dim : state.getDims()) {
/**
* Convenience overload of getFacetResultForDim for callers that have no precomputed
* ChildOrdsResult. It delegates to the three-argument overload and passes {@code null} as the
* dimToChildOrdsResult argument.
*
* @param dim dimension name
* @param topNChildren number of top children requested for the dimension
* @return whatever the three-argument overload returns for a {@code null} precomputed result
*/
private FacetResult getFacetResultForDim(String dim, int topNChildren) throws IOException {
return getFacetResultForDim(dim, topNChildren, null);
}
/** Returns FacetResult for a dimension. */
private FacetResult getFacetResultForDim(
String dim, int topNChildren, ChildOrdsResult dimToChildOrdsResult) throws IOException {
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
int dimOrd = dimTree.dimStartOrd;
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, dimTree.iterator(), topN);
if (fr != null) {
results.add(fr);
}
return getPathResult(
dimConfig,
dim,
emptyPath,
dimOrd,
dimTree.iterator(),
topNChildren,
dimToChildOrdsResult);
} else {
OrdRange ordRange = state.getOrdRange(dim);
int dimOrd = ordRange.start;
@ -497,11 +584,20 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
// child:
childIt.next();
}
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, childIt, topN);
if (fr != null) {
results.add(fr);
return getPathResult(
dimConfig, dim, emptyPath, dimOrd, childIt, topNChildren, dimToChildOrdsResult);
}
}
@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
validateTopN(topN);
List<FacetResult> results = new ArrayList<>();
for (String dim : state.getDims()) {
FacetResult factResult = getFacetResultForDim(dim, topN);
if (factResult != null) {
results.add(factResult);
}
}
// Sort by highest count:
@ -522,4 +618,112 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
return results;
}
/**
 * Returns the {@code topNDims} dimensions with the highest values, each with up to {@code
 * topNChildren} children.
 *
 * <p>Dimensions are ranked by value, highest first; ties are broken by dimension name in
 * ascending order. Dimensions whose value is 0 are ignored. Dimensions for which no facet result
 * can be built are ignored as well, so the returned list never contains {@code null} entries
 * (the same contract {@code getAllDims} follows).
 *
 * @param topNDims maximum number of dimensions to return; must be positive
 * @param topNChildren maximum number of children per dimension; must be positive
 * @return facet results for the top dimensions, best dimension first
 * @throws IllegalArgumentException if {@code topNDims} or {@code topNChildren} is not positive
 */
@Override
public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
  if (topNDims <= 0 || topNChildren <= 0) {
    throw new IllegalArgumentException("topN must be > 0");
  }

  // Bounded queue whose top is the currently weakest dimension: lowest value first and, among
  // equal values, the lexicographically greatest name first.
  PriorityQueue<DimValueResult> pq =
      new PriorityQueue<>(topNDims) {
        @Override
        protected boolean lessThan(DimValueResult a, DimValueResult b) {
          if (a.value != b.value) {
            return a.value < b.value;
          }
          return a.dim.compareTo(b.dim) > 0;
        }
      };

  // Results computed while ranking, kept so the children of a dimension are not visited twice.
  HashMap<String, ChildOrdsResult> dimToChildOrdsResult = new HashMap<>();

  for (String dim : state.getDims()) {
    FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
    final int dimCount;
    if (dimConfig.hierarchical) {
      SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
      int dimOrd = dimTree.dimStartOrd;
      dimCount =
          getDimValue(
              dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, dimToChildOrdsResult);
    } else {
      OrdRange ordRange = state.getOrdRange(dim);
      int dimOrd = ordRange.start;
      PrimitiveIterator.OfInt childIt = ordRange.iterator();
      if (dimConfig.multiValued && dimConfig.requireDimCount) {
        // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
        // the dimension and we need to skip past it so the iterator is positioned on the first
        // child:
        childIt.next();
      }
      dimCount = getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, dimToChildOrdsResult);
    }

    if (dimCount == 0) {
      continue;
    }

    // A cached result without a queue cannot be turned into a FacetResult (getPathResult returns
    // null for it). Skip the dimension here so it neither evicts a real dimension from the
    // ranking nor shows up as a null entry in the output.
    ChildOrdsResult cached = dimToChildOrdsResult.get(dim);
    if (cached != null && cached.q == null) {
      continue;
    }

    if (pq.size() < topNDims) {
      pq.add(new DimValueResult(dim, dimCount));
    } else {
      // Queue is full: overwrite the weakest entry in place when this dimension beats it.
      DimValueResult bottomDim = pq.top();
      if (dimCount > bottomDim.value
          || (dimCount == bottomDim.value && dim.compareTo(bottomDim.dim) < 0)) {
        bottomDim.dim = dim;
        bottomDim.value = dimCount;
        pq.updateTop();
      }
    }
  }

  // The queue pops the weakest dimension first, so collect and then reverse to get best-first.
  List<FacetResult> results = new ArrayList<>(pq.size());
  while (pq.size() > 0) {
    DimValueResult dimValueResult = pq.pop();
    FacetResult facetResult =
        getFacetResultForDim(
            dimValueResult.dim, topNChildren, dimToChildOrdsResult.get(dimValueResult.dim));
    // Defensive: dimensions ranked without a cached result may still yield no facet result.
    if (facetResult != null) {
      results.add(facetResult);
    }
  }
  Collections.reverse(results);
  return results;
}
/**
 * Immutable holder for what visiting the children of one dimension produced: the dimension
 * count, the child count, and the queue holding the dimension's top children. The queue may be
 * {@code null}; {@code getPathResult} returns {@code null} in that case.
 */
private static class ChildOrdsResult {
  final int dimCount;
  final int childCount;
  final TopOrdAndIntQueue q;

  ChildOrdsResult(int dimTotal, int numChildren, TopOrdAndIntQueue topChildren) {
    dimCount = dimTotal;
    childCount = numChildren;
    q = topChildren;
  }
}
/**
 * Pairs a dimension name with its value so that dimensions can be ordered by value and then by
 * name. Both fields are deliberately mutable: {@code getTopDims} overwrites the entry at the top
 * of its priority queue in place instead of allocating a new one.
 */
private static class DimValueResult {
  String dim;
  int value;

  DimValueResult(String dimName, int dimValue) {
    dim = dimName;
    value = dimValue;
  }
}
}

View File

@ -435,6 +435,19 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
assertEquals(
"dim=a path=[] value=1 childCount=1\n bar (1)\n",
facets.getTopChildren(10, "a").toString());
// test getTopDims in ConcurrentSortedSetDocValuesFacetCounts
List<FacetResult> results = facets.getAllDims(10);
// test getTopDims(10, 10) and expect same results from getAllDims(10)
List<FacetResult> allTopDimsResults = facets.getTopDims(10, 10);
// test getTopDims(n, 10)
if (allTopDimsResults.size() > 0) {
for (int i = 1; i < results.size(); i++) {
assertEquals(results.subList(0, i), facets.getTopDims(i, 10));
}
}
} finally {
exec.shutdownNow();
}
@ -495,6 +508,18 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
assertEquals(
"dim=b path=[buzz] value=1 childCount=1\n baz (1)\n",
facets.getTopChildren(10, "b", "buzz").toString());
// test getTopDims in ConcurrentSortedSetDocValuesFacetCounts
List<FacetResult> results = facets.getAllDims(10);
// test getTopDims(10, 10) and expect same results from getAllDims(10)
List<FacetResult> allTopDimsResults = facets.getTopDims(10, 10);
// test getTopDims(n, 10)
if (allTopDimsResults.size() > 0) {
for (int i = 1; i < results.size(); i++) {
assertEquals(results.subList(0, i), facets.getTopDims(i, 10));
}
}
} finally {
exec.shutdownNow();
}