diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java
index f3d9ac18eea..b6c2fdcf22c 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java
@@ -19,6 +19,9 @@ package org.apache.lucene.facet.taxonomy;
 import com.carrotsearch.hppc.IntIntHashMap;
 import com.carrotsearch.hppc.cursors.IntIntCursor;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import org.apache.lucene.facet.FacetResult;
 import org.apache.lucene.facet.FacetsCollector;
@@ -27,6 +30,7 @@ import org.apache.lucene.facet.FacetsConfig;
 import org.apache.lucene.facet.FacetsConfig.DimConfig;
 import org.apache.lucene.facet.LabelAndValue;
 import org.apache.lucene.facet.TopOrdAndIntQueue;
+import org.apache.lucene.util.PriorityQueue;
 
 /** Base class for all taxonomy-based facets that aggregate to a per-ords int[]. */
 abstract class IntTaxonomyFacets extends TaxonomyFacets {
@@ -40,6 +44,9 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
   /** Sparse ordinal values. */
   final IntIntHashMap sparseValues;
 
+  /** Pass in emptyPath for getTopDims and getAllDims. */
+  private static final String[] emptyPath = new String[0];
+
   /** Sole constructor. */
   IntTaxonomyFacets(
       String indexFieldName,
@@ -169,18 +176,56 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
       return null;
     }
 
-    TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN));
+    ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN);
+    if (childOrdsResult.q == null || childOrdsResult.aggregatedValue == 0) {
+      return null;
+    }
+
+    LabelAndValue[] labelValues = getLabelValues(childOrdsResult.q, cp.length);
+    return new FacetResult(
+        dim, path, childOrdsResult.aggregatedValue, labelValues, childOrdsResult.childCount);
+  }
+
+  /**
+   * Return label and values for top dimensions and children
+   *
+   * @param q the queue for the dimension's top children
+   * @param pathLength the length of a dimension's children paths
+   */
+  private LabelAndValue[] getLabelValues(TopOrdAndIntQueue q, int pathLength) throws IOException {
+    LabelAndValue[] labelValues = new LabelAndValue[q.size()];
+    int[] ordinals = new int[labelValues.length];
+    int[] values = new int[labelValues.length];
+
+    for (int i = labelValues.length - 1; i >= 0; i--) {
+      TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
+      ordinals[i] = ordAndValue.ord;
+      values[i] = ordAndValue.value;
+    }
+
+    FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
+    for (int i = 0; i < labelValues.length; i++) {
+      labelValues[i] = new LabelAndValue(bulkPath[i].components[pathLength], values[i]);
+    }
+    return labelValues;
+  }
+
+  /**
+   * Return ChildOrdsResult that contains results of dimCount, childCount, and the queue for the
+   * dimension's top children to populate FacetResult in getPathResult.
+   */
+  private ChildOrdsResult getChildOrdsResult(DimConfig dimConfig, int dimOrd, int topN)
+      throws IOException {
 
+    TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN));
     int bottomValue = 0;
 
     int aggregatedValue = 0;
     int childCount = 0;
-
     TopOrdAndIntQueue.OrdAndValue reuse = null;
 
     // TODO: would be faster if we had a "get the following children" API? then we
     // can make a single pass over the hashmap
-
     if (sparseValues != null) {
       for (IntIntCursor c : sparseValues) {
         int value = c.value;
@@ -222,15 +267,10 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
             }
           }
         }
-
         ord = siblings[ord];
       }
     }
 
-    if (aggregatedValue == 0) {
-      return null;
-    }
-
     if (dimConfig.multiValued) {
       if (dimConfig.requireDimCount) {
         aggregatedValue = getValue(dimOrd);
@@ -238,25 +278,153 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets {
         // Our sum'd value is not correct, in general:
         aggregatedValue = -1;
       }
-    } else {
-      // Our sum'd dim value is accurate, so we keep it
     }
 
-    LabelAndValue[] labelValues = new LabelAndValue[q.size()];
-    int[] ordinals = new int[labelValues.length];
-    int[] values = new int[labelValues.length];
+    return new ChildOrdsResult(aggregatedValue, childCount, q);
+  }
 
-    for (int i = labelValues.length - 1; i >= 0; i--) {
-      TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
-      ordinals[i] = ordAndValue.ord;
-      values[i] = ordAndValue.value;
+  /** Return value/count of a dimension. */
+  private int getDimValue(
+      FacetsConfig.DimConfig dimConfig,
+      String dim,
+      int dimOrd,
+      int topN,
+      HashMap<String, ChildOrdsResult> dimToChildOrdsResult)
+      throws IOException {
+
+    // if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at
+    // indexing time, return dimCount directly
+    if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) {
+      return getValue(dimOrd);
     }
 
-    FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals);
-    for (int i = 0; i < labelValues.length; i++) {
-      labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], values[i]);
+    // if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount
+    ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN);
+
+    // if no early termination, store dim and childOrdsResult into a hashmap to avoid calling
+    // getChildOrdsResult again in getTopDims
+    dimToChildOrdsResult.put(dim, childOrdsResult);
+    return childOrdsResult.aggregatedValue;
+  }
+
+  @Override
+  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
+    if (topNDims <= 0 || topNChildren <= 0) {
+      throw new IllegalArgumentException("topN must be > 0");
     }
 
-    return new FacetResult(dim, path, aggregatedValue, labelValues, childCount);
+    // get children and siblings ordinal array from TaxonomyFacets
+    int[] children = getChildren();
+    int[] siblings = getSiblings();
+
+    // Create priority queue to store top dimensions and sort by their aggregated values/hits and
+    // string values.
+    PriorityQueue<DimValueResult> pq =
+        new PriorityQueue<>(topNDims) {
+          @Override
+          protected boolean lessThan(DimValueResult a, DimValueResult b) {
+            if (a.value > b.value) {
+              return false;
+            } else if (a.value < b.value) {
+              return true;
+            } else {
+              return a.dim.compareTo(b.dim) > 0;
+            }
+          }
+        };
+
+    // create hashMap to store the ChildOrdsResult to avoid calling getChildOrdsResult for all dims
+    HashMap<String, ChildOrdsResult> dimToChildOrdsResult = new HashMap<>();
+
+    // iterate over children and siblings ordinals for all dims
+    int ord = children[TaxonomyReader.ROOT_ORDINAL];
+    while (ord != TaxonomyReader.INVALID_ORDINAL) {
+      String dim = taxoReader.getPath(ord).components[0];
+      FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
+      if (dimConfig.indexFieldName.equals(indexFieldName)) {
+        FacetLabel cp = new FacetLabel(dim, emptyPath);
+        int dimOrd = taxoReader.getOrdinal(cp);
+        // if dimOrd = -1, we skip this dim, else call getDimValue
+        if (dimOrd != -1) {
+          int dimCount = getDimValue(dimConfig, dim, dimOrd, topNChildren, dimToChildOrdsResult);
+          if (dimCount != 0) {
+            // use priority queue to store DimValueResult for topNDims
+            if (pq.size() < topNDims) {
+              pq.add(new DimValueResult(dim, dimOrd, dimCount));
+            } else {
+              if (dimCount > pq.top().value
+                  || (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
+                DimValueResult bottomDim = pq.top();
+                bottomDim.dim = dim;
+                // the entry is recycled: dimOrd is read again below to fetch uncached children
+                bottomDim.dimOrd = dimOrd;
+                bottomDim.value = dimCount;
+                pq.updateTop();
+              }
+            }
+          }
+        }
+      }
+      ord = siblings[ord];
+    }
+
+    // use fixed-size array to reduce space usage
+    FacetResult[] results = new FacetResult[pq.size()];
+
+    while (pq.size() > 0) {
+      DimValueResult dimValueResult = pq.pop();
+      String dim = dimValueResult.dim;
+      ChildOrdsResult childOrdsResult;
+      // if the childOrdsResult was stored in the map, avoid calling getChildOrdsResult again
+      if (dimToChildOrdsResult.containsKey(dim)) {
+        childOrdsResult = dimToChildOrdsResult.get(dim);
+      } else {
+        FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
+        childOrdsResult = getChildOrdsResult(dimConfig, dimValueResult.dimOrd, topNChildren);
+      }
+      // FacetResult requires String[] path, and path is always empty for getTopDims.
+      // pathLength is always equal to 1 when FacetLabel is constructed with
+      // FacetLabel(dim, emptyPath), and therefore, 1 is passed in when calling getLabelValues
+      FacetResult facetResult =
+          new FacetResult(
+              dimValueResult.dim,
+              emptyPath,
+              dimValueResult.value,
+              getLabelValues(childOrdsResult.q, 1),
+              childOrdsResult.childCount);
+      results[pq.size()] = facetResult;
+    }
+    return Arrays.asList(results);
+  }
+
+  /**
+   * Create DimValueResult to store the label, dim ordinal and dim count of a dim in priority queue
+   */
+  private static class DimValueResult {
+    String dim;
+    int dimOrd;
+    int value;
+
+    DimValueResult(String dim, int dimOrd, int value) {
+      this.dim = dim;
+      this.dimOrd = dimOrd;
+      this.value = value;
+    }
+  }
+
+  /**
+   * Create ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top
+   * children
+   */
+  private static class ChildOrdsResult {
+    final int aggregatedValue;
+    final int childCount;
+    final TopOrdAndIntQueue q;
+
+    ChildOrdsResult(int aggregatedValue, int childCount, TopOrdAndIntQueue q) {
+      this.aggregatedValue = aggregatedValue;
+      this.childCount = childCount;
+      this.q = q;
+    }
   }
 }
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java
index ee5ac799d19..7ef2103ae56 100644
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java
@@ -128,6 +128,12 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
         "dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n",
         facets.getTopChildren(10, "Author").toString());
 
+    // test getAllDims
+    List<FacetResult> results = facets.getAllDims(10);
+    // test getTopDims(10, 10) and expect same results from getAllDims(10)
+    List<FacetResult> allTopDimsResults = facets.getTopDims(10, 10);
+    assertEquals(results, allTopDimsResults);
+
     // Now user drills down on Publish Date/2010:
     DrillDownQuery q2 = new DrillDownQuery(config);
     q2.add("Publish Date", "2010");
@@ -242,8 +248,11 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
     assertEquals(results, allDimsResults);
 
     // test getTopDims(0, 1)
-    List<FacetResult> topDimsResults2 = facets.getTopDims(0, 1);
-    assertEquals(0, topDimsResults2.size());
+    expectThrows(
+        IllegalArgumentException.class,
+        () -> {
+          facets.getTopDims(0, 1);
+        });
 
     // test getTopDims(1, 0) with topNChildren = 0
     expectThrows(
@@ -287,6 +296,11 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
     // Ask for top 10 labels for any dims that have counts:
     List<FacetResult> results = facets.getAllDims(10);
     assertTrue(results.isEmpty());
+
+    // test getTopDims(10, 10) and expect same results from getAllDims(10)
+    List<FacetResult> allTopDimsResults = facets.getTopDims(10, 10);
+    assertEquals(results, allTopDimsResults);
+
     expectThrows(
         IllegalArgumentException.class,
         () -> {
@@ -642,15 +656,18 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
       assertEquals(r.numDocs(), result.value.intValue());
     }
 
-    // test default implementation of getTopDims
+    // test override implementation of getTopDims
     if (allDimsResult.size() > 0) {
       List<FacetResult> topNDimsResult = facets.getTopDims(1, 10);
       assertEquals(allDimsResult.get(0), topNDimsResult.get(0));
     }
 
     // test getTopDims(0, 1)
-    List<FacetResult> topDimsResults2 = facets.getTopDims(0, 1);
-    assertEquals(0, topDimsResults2.size());
+    expectThrows(
+        IllegalArgumentException.class,
+        () -> {
+          facets.getTopDims(0, 1);
+        });
 
     // test getTopDims(1, 0) with topNChildren = 0
     expectThrows(
@@ -695,10 +712,11 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
     assertEquals(
         "calling getFacetResults twice should return the .equals()=true result", res1, res2);
 
-    // test default implementation of getTopDims
+    // test getTopDims(n, 10)
     if (res1.size() > 0) {
-      List<FacetResult> topNDimsResult = facets.getTopDims(1, 10);
-      assertEquals(res1.get(0), topNDimsResult.get(0));
+      for (int i = 1; i < res1.size(); i++) {
+        assertEquals(res1.subList(0, i), facets.getTopDims(i, 10));
+      }
     }
 
     iw.close();
@@ -995,11 +1013,12 @@ public class TestTaxonomyFacetCounts extends FacetTestCase {
 
       assertEquals(expected, actual);
 
-      // test default implementation of getTopDims
-
-      List<FacetResult> topNDimsResult = facets.getTopDims(actual.size(), 10);
-      sortTies(topNDimsResult);
-      assertEquals(actual, topNDimsResult);
+      // test getTopDims
+      if (actual.size() > 0) {
+        List<FacetResult> topNDimsResult = facets.getTopDims(actual.size(), 10);
+        sortTies(topNDimsResult);
+        assertEquals(actual, topNDimsResult);
+      }
 
       // Test facet labels for each matching test doc
       List<List<FacetLabel>> actualLabels = getAllTaxonomyFacetLabels(null, tr, fc);