mirror of
https://github.com/apache/lucene.git
synced 2025-02-10 03:55:46 +00:00
LUCENE-10488: Optimize Facets#getTopDims in IntTaxonomyFacets (#779)
This commit is contained in:
parent
ef43242d77
commit
57f8cb2fd6
@ -19,6 +19,9 @@ package org.apache.lucene.facet.taxonomy;
|
|||||||
import com.carrotsearch.hppc.IntIntHashMap;
|
import com.carrotsearch.hppc.IntIntHashMap;
|
||||||
import com.carrotsearch.hppc.cursors.IntIntCursor;
|
import com.carrotsearch.hppc.cursors.IntIntCursor;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import org.apache.lucene.facet.FacetResult;
|
import org.apache.lucene.facet.FacetResult;
|
||||||
import org.apache.lucene.facet.FacetsCollector;
|
import org.apache.lucene.facet.FacetsCollector;
|
||||||
@ -27,6 +30,7 @@ import org.apache.lucene.facet.FacetsConfig;
|
|||||||
import org.apache.lucene.facet.FacetsConfig.DimConfig;
|
import org.apache.lucene.facet.FacetsConfig.DimConfig;
|
||||||
import org.apache.lucene.facet.LabelAndValue;
|
import org.apache.lucene.facet.LabelAndValue;
|
||||||
import org.apache.lucene.facet.TopOrdAndIntQueue;
|
import org.apache.lucene.facet.TopOrdAndIntQueue;
|
||||||
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
|
||||||
/** Base class for all taxonomy-based facets that aggregate to a per-ords int[]. */
|
/** Base class for all taxonomy-based facets that aggregate to a per-ords int[]. */
|
||||||
abstract class IntTaxonomyFacets extends TaxonomyFacets {
|
abstract class IntTaxonomyFacets extends TaxonomyFacets {
|
||||||
@ -40,6 +44,9 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
|
|||||||
/** Sparse ordinal values. */
|
/** Sparse ordinal values. */
|
||||||
final IntIntHashMap sparseValues;
|
final IntIntHashMap sparseValues;
|
||||||
|
|
||||||
|
/** Shared empty path, passed wherever getTopDims and getAllDims need a dimension-level path. */
|
||||||
|
private static final String[] emptyPath = new String[0];
|
||||||
|
|
||||||
/** Sole constructor. */
|
/** Sole constructor. */
|
||||||
IntTaxonomyFacets(
|
IntTaxonomyFacets(
|
||||||
String indexFieldName,
|
String indexFieldName,
|
||||||
@ -169,18 +176,56 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN));
|
ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN);
|
||||||
|
|
||||||
|
if (childOrdsResult.q == null || childOrdsResult.aggregatedValue == 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
LabelAndValue[] labelValues = getLabelValues(childOrdsResult.q, cp.length);
|
||||||
|
return new FacetResult(
|
||||||
|
dim, path, childOrdsResult.aggregatedValue, labelValues, childOrdsResult.childCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return label and values for top dimensions and children
|
||||||
|
*
|
||||||
|
* @param q the queue for the dimension's top children
|
||||||
|
* @param pathLength the length of a dimension's children paths
|
||||||
|
*/
|
||||||
|
private LabelAndValue[] getLabelValues(TopOrdAndIntQueue q, int pathLength) throws IOException {
|
||||||
|
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
|
||||||
|
int[] ordinals = new int[labelValues.length];
|
||||||
|
int[] values = new int[labelValues.length];
|
||||||
|
|
||||||
|
for (int i = labelValues.length - 1; i >= 0; i--) {
|
||||||
|
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
|
||||||
|
ordinals[i] = ordAndValue.ord;
|
||||||
|
values[i] = ordAndValue.value;
|
||||||
|
}
|
||||||
|
|
||||||
|
FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
|
||||||
|
for (int i = 0; i < labelValues.length; i++) {
|
||||||
|
labelValues[i] = new LabelAndValue(bulkPath[i].components[pathLength], values[i]);
|
||||||
|
}
|
||||||
|
return labelValues;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return ChildOrdsResult that contains results of dimCount, childCount, and the queue for the
|
||||||
|
* dimension's top children to populate FacetResult in getPathResult.
|
||||||
|
*/
|
||||||
|
private ChildOrdsResult getChildOrdsResult(DimConfig dimConfig, int dimOrd, int topN)
|
||||||
|
throws IOException {
|
||||||
|
TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN));
|
||||||
int bottomValue = 0;
|
int bottomValue = 0;
|
||||||
|
|
||||||
int aggregatedValue = 0;
|
int aggregatedValue = 0;
|
||||||
int childCount = 0;
|
int childCount = 0;
|
||||||
|
|
||||||
TopOrdAndIntQueue.OrdAndValue reuse = null;
|
TopOrdAndIntQueue.OrdAndValue reuse = null;
|
||||||
|
|
||||||
// TODO: would be faster if we had a "get the following children" API? then we
|
// TODO: would be faster if we had a "get the following children" API? then we
|
||||||
// can make a single pass over the hashmap
|
// can make a single pass over the hashmap
|
||||||
|
|
||||||
if (sparseValues != null) {
|
if (sparseValues != null) {
|
||||||
for (IntIntCursor c : sparseValues) {
|
for (IntIntCursor c : sparseValues) {
|
||||||
int value = c.value;
|
int value = c.value;
|
||||||
@ -222,15 +267,10 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ord = siblings[ord];
|
ord = siblings[ord];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (aggregatedValue == 0) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (dimConfig.multiValued) {
|
if (dimConfig.multiValued) {
|
||||||
if (dimConfig.requireDimCount) {
|
if (dimConfig.requireDimCount) {
|
||||||
aggregatedValue = getValue(dimOrd);
|
aggregatedValue = getValue(dimOrd);
|
||||||
@ -238,25 +278,151 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
|
|||||||
// Our sum'd value is not correct, in general:
|
// Our sum'd value is not correct, in general:
|
||||||
aggregatedValue = -1;
|
aggregatedValue = -1;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// Our sum'd dim value is accurate, so we keep it
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
|
return new ChildOrdsResult(aggregatedValue, childCount, q);
|
||||||
int[] ordinals = new int[labelValues.length];
|
}
|
||||||
int[] values = new int[labelValues.length];
|
|
||||||
|
|
||||||
for (int i = labelValues.length - 1; i >= 0; i--) {
|
/** Return value/count of a dimension. */
|
||||||
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
|
private int getDimValue(
|
||||||
ordinals[i] = ordAndValue.ord;
|
FacetsConfig.DimConfig dimConfig,
|
||||||
values[i] = ordAndValue.value;
|
String dim,
|
||||||
|
int dimOrd,
|
||||||
|
int topN,
|
||||||
|
HashMap<String, ChildOrdsResult> dimToChildOrdsResult)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
// if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at
|
||||||
|
// indexing time, return dimCount directly
|
||||||
|
if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) {
|
||||||
|
return getValue(dimOrd);
|
||||||
}
|
}
|
||||||
|
|
||||||
FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
|
// if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount
|
||||||
for (int i = 0; i < labelValues.length; i++) {
|
ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN);
|
||||||
labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
|
|
||||||
|
// if no early termination, store dim and childOrdsResult into a hashmap to avoid calling
|
||||||
|
// getChildOrdsResult again in getTopDims
|
||||||
|
dimToChildOrdsResult.put(dim, childOrdsResult);
|
||||||
|
return childOrdsResult.aggregatedValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
|
||||||
|
if (topNDims <= 0 || topNChildren <= 0) {
|
||||||
|
throw new IllegalArgumentException("topN must be > 0");
|
||||||
}
|
}
|
||||||
|
|
||||||
return new FacetResult(dim, path, aggregatedValue, labelValues, childCount);
|
// get children and siblings ordinal array from TaxonomyFacets
|
||||||
|
int[] children = getChildren();
|
||||||
|
int[] siblings = getSiblings();
|
||||||
|
|
||||||
|
// Create priority queue to store top dimensions and sort by their aggregated values/hits and
|
||||||
|
// string values.
|
||||||
|
PriorityQueue<DimValueResult> pq =
|
||||||
|
new PriorityQueue<>(topNDims) {
|
||||||
|
@Override
|
||||||
|
protected boolean lessThan(DimValueResult a, DimValueResult b) {
|
||||||
|
if (a.value > b.value) {
|
||||||
|
return false;
|
||||||
|
} else if (a.value < b.value) {
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return a.dim.compareTo(b.dim) > 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// create hashMap to store the ChildOrdsResult to avoid calling getChildOrdsResult for all dims
|
||||||
|
HashMap<String, ChildOrdsResult> dimToChildOrdsResult = new HashMap<>();
|
||||||
|
|
||||||
|
// iterate over children and siblings ordinals for all dims
|
||||||
|
int ord = children[TaxonomyReader.ROOT_ORDINAL];
|
||||||
|
while (ord != TaxonomyReader.INVALID_ORDINAL) {
|
||||||
|
String dim = taxoReader.getPath(ord).components[0];
|
||||||
|
FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
|
||||||
|
if (dimConfig.indexFieldName.equals(indexFieldName)) {
|
||||||
|
FacetLabel cp = new FacetLabel(dim, emptyPath);
|
||||||
|
int dimOrd = taxoReader.getOrdinal(cp);
|
||||||
|
// if dimOrd = -1, we skip this dim, else call getDimValue
|
||||||
|
if (dimOrd != -1) {
|
||||||
|
int dimCount = getDimValue(dimConfig, dim, dimOrd, topNChildren, dimToChildOrdsResult);
|
||||||
|
if (dimCount != 0) {
|
||||||
|
// use priority queue to store DimValueResult for topNDims
|
||||||
|
if (pq.size() < topNDims) {
|
||||||
|
pq.add(new DimValueResult(dim, dimOrd, dimCount));
|
||||||
|
} else {
|
||||||
|
if (dimCount > pq.top().value
|
||||||
|
|| (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
|
||||||
|
DimValueResult bottomDim = pq.top();
|
||||||
|
bottomDim.dim = dim;
|
||||||
|
bottomDim.value = dimCount;
|
||||||
|
pq.updateTop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ord = siblings[ord];
|
||||||
|
}
|
||||||
|
|
||||||
|
// use fixed-size array to reduce space usage
|
||||||
|
FacetResult[] results = new FacetResult[pq.size()];
|
||||||
|
|
||||||
|
while (pq.size() > 0) {
|
||||||
|
DimValueResult dimValueResult = pq.pop();
|
||||||
|
String dim = dimValueResult.dim;
|
||||||
|
ChildOrdsResult childOrdsResult;
|
||||||
|
// if the childOrdsResult was stored in the map, avoid calling getChildOrdsResult again
|
||||||
|
if (dimToChildOrdsResult.containsKey(dim)) {
|
||||||
|
childOrdsResult = dimToChildOrdsResult.get(dim);
|
||||||
|
} else {
|
||||||
|
FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
|
||||||
|
childOrdsResult = getChildOrdsResult(dimConfig, dimValueResult.dimOrd, topNChildren);
|
||||||
|
}
|
||||||
|
// FacetResult requires String[] path, and path is always empty for getTopDims.
|
||||||
|
// pathLength is always equal to 1 when FacetLabel is constructed with
|
||||||
|
// FacetLabel(dim, emptyPath), and therefore, 1 is passed in when calling getLabelValues
|
||||||
|
FacetResult facetResult =
|
||||||
|
new FacetResult(
|
||||||
|
dimValueResult.dim,
|
||||||
|
emptyPath,
|
||||||
|
dimValueResult.value,
|
||||||
|
getLabelValues(childOrdsResult.q, 1),
|
||||||
|
childOrdsResult.childCount);
|
||||||
|
results[pq.size()] = facetResult;
|
||||||
|
}
|
||||||
|
return Arrays.asList(results);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create DimValueResult to store the label, dim ordinal and dim count of a dim in priority queue
|
||||||
|
*/
|
||||||
|
private static class DimValueResult {
|
||||||
|
String dim;
|
||||||
|
int dimOrd;
|
||||||
|
int value;
|
||||||
|
|
||||||
|
DimValueResult(String dim, int dimOrd, int value) {
|
||||||
|
this.dim = dim;
|
||||||
|
this.dimOrd = dimOrd;
|
||||||
|
this.value = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top
|
||||||
|
* children
|
||||||
|
*/
|
||||||
|
private static class ChildOrdsResult {
|
||||||
|
final int aggregatedValue;
|
||||||
|
final int childCount;
|
||||||
|
final TopOrdAndIntQueue q;
|
||||||
|
|
||||||
|
ChildOrdsResult(int aggregatedValue, int childCount, TopOrdAndIntQueue q) {
|
||||||
|
this.aggregatedValue = aggregatedValue;
|
||||||
|
this.childCount = childCount;
|
||||||
|
this.q = q;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -128,6 +128,12 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
|
|||||||
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
|
"dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
|
||||||
facets.getTopChildren(10, "Author").toString());
|
facets.getTopChildren(10, "Author").toString());
|
||||||
|
|
||||||
|
// test getAllDims
|
||||||
|
List<FacetResult> results = facets.getAllDims(10);
|
||||||
|
// test getTopDims(10, 10) and expect same results from getAllDims(10)
|
||||||
|
List<FacetResult> allTopDimsResults = facets.getTopDims(10, 10);
|
||||||
|
assertEquals(results, allTopDimsResults);
|
||||||
|
|
||||||
// Now user drills down on Publish Date/2010:
|
// Now user drills down on Publish Date/2010:
|
||||||
DrillDownQuery q2 = new DrillDownQuery(config);
|
DrillDownQuery q2 = new DrillDownQuery(config);
|
||||||
q2.add("Publish Date", "2010");
|
q2.add("Publish Date", "2010");
|
||||||
@ -242,8 +248,11 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
|
|||||||
assertEquals(results, allDimsResults);
|
assertEquals(results, allDimsResults);
|
||||||
|
|
||||||
// test getTopDims(0, 1)
|
// test getTopDims(0, 1)
|
||||||
List<FacetResult> topDimsResults2 = facets.getTopDims(0, 1);
|
expectThrows(
|
||||||
assertEquals(0, topDimsResults2.size());
|
IllegalArgumentException.class,
|
||||||
|
() -> {
|
||||||
|
facets.getTopDims(0, 1);
|
||||||
|
});
|
||||||
|
|
||||||
// test getTopDims(1, 0) with topNChildren = 0
|
// test getTopDims(1, 0) with topNChildren = 0
|
||||||
expectThrows(
|
expectThrows(
|
||||||
@ -287,6 +296,11 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
|
|||||||
// Ask for top 10 labels for any dims that have counts:
|
// Ask for top 10 labels for any dims that have counts:
|
||||||
List<FacetResult> results = facets.getAllDims(10);
|
List<FacetResult> results = facets.getAllDims(10);
|
||||||
assertTrue(results.isEmpty());
|
assertTrue(results.isEmpty());
|
||||||
|
|
||||||
|
// test getTopDims(10, 10) and expect same results from getAllDims(10)
|
||||||
|
List<FacetResult> allTopDimsResults = facets.getTopDims(10, 10);
|
||||||
|
assertEquals(results, allTopDimsResults);
|
||||||
|
|
||||||
expectThrows(
|
expectThrows(
|
||||||
IllegalArgumentException.class,
|
IllegalArgumentException.class,
|
||||||
() -> {
|
() -> {
|
||||||
@ -642,15 +656,18 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
|
|||||||
assertEquals(r.numDocs(), result.value.intValue());
|
assertEquals(r.numDocs(), result.value.intValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
// test default implementation of getTopDims
|
// test override implementation of getTopDims
|
||||||
if (allDimsResult.size() > 0) {
|
if (allDimsResult.size() > 0) {
|
||||||
List<FacetResult> topNDimsResult = facets.getTopDims(1, 10);
|
List<FacetResult> topNDimsResult = facets.getTopDims(1, 10);
|
||||||
assertEquals(allDimsResult.get(0), topNDimsResult.get(0));
|
assertEquals(allDimsResult.get(0), topNDimsResult.get(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
// test getTopDims(0, 1)
|
// test getTopDims(0, 1)
|
||||||
List<FacetResult> topDimsResults2 = facets.getTopDims(0, 1);
|
expectThrows(
|
||||||
assertEquals(0, topDimsResults2.size());
|
IllegalArgumentException.class,
|
||||||
|
() -> {
|
||||||
|
facets.getTopDims(0, 1);
|
||||||
|
});
|
||||||
|
|
||||||
// test getTopDims(1, 0) with topNChildren = 0
|
// test getTopDims(1, 0) with topNChildren = 0
|
||||||
expectThrows(
|
expectThrows(
|
||||||
@ -695,10 +712,11 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
|
|||||||
assertEquals(
|
assertEquals(
|
||||||
"calling getFacetResults twice should return the .equals()=true result", res1, res2);
|
"calling getFacetResults twice should return the .equals()=true result", res1, res2);
|
||||||
|
|
||||||
// test default implementation of getTopDims
|
// test getTopDims(n, 10)
|
||||||
if (res1.size() > 0) {
|
if (res1.size() > 0) {
|
||||||
List<FacetResult> topNDimsResult = facets.getTopDims(1, 10);
|
for (int i = 1; i < res1.size(); i++) {
|
||||||
assertEquals(res1.get(0), topNDimsResult.get(0));
|
assertEquals(res1.subList(0, i), facets.getTopDims(i, 10));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
iw.close();
|
iw.close();
|
||||||
@ -995,11 +1013,12 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
|
|||||||
|
|
||||||
assertEquals(expected, actual);
|
assertEquals(expected, actual);
|
||||||
|
|
||||||
// test default implementation of getTopDims
|
// test getTopDims
|
||||||
|
if (actual.size() > 0) {
|
||||||
List<FacetResult> topNDimsResult = facets.getTopDims(actual.size(), 10);
|
List<FacetResult> topNDimsResult = facets.getTopDims(actual.size(), 10);
|
||||||
sortTies(topNDimsResult);
|
sortTies(topNDimsResult);
|
||||||
assertEquals(actual, topNDimsResult);
|
assertEquals(actual, topNDimsResult);
|
||||||
|
}
|
||||||
|
|
||||||
// Test facet labels for each matching test doc
|
// Test facet labels for each matching test doc
|
||||||
List<List<FacetLabel>> actualLabels = getAllTaxonomyFacetLabels(null, tr, fc);
|
List<List<FacetLabel>> actualLabels = getAllTaxonomyFacetLabels(null, tr, fc);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user