From 7c33f04d3746f855479da5177df8f19b6fdbbe84 Mon Sep 17 00:00:00 2001 From: Yuting Gan <44444710+Yuti-G@users.noreply.github.com> Date: Mon, 28 Mar 2022 15:54:07 -0700 Subject: [PATCH] LUCENE-10325: Add getTopDims functionality to Facets (#747) --- .../java/org/apache/lucene/facet/Facets.java | 11 + .../SortedSetDocValuesFacetCounts.java | 289 +++++++++++++++--- .../lucene/facet/TestDrillSideways.java | 31 ++ .../facet/TestLongValueFacetCounts.java | 15 + .../facet/TestStringValueFacetCounts.java | 5 + .../facet/range/TestRangeFacetCounts.java | 15 + .../TestSortedSetDocValuesFacets.java | 241 ++++++++++++++- .../taxonomy/TestTaxonomyFacetCounts.java | 67 +++- .../TestTaxonomyFacetSumValueSource.java | 33 ++ 9 files changed, 660 insertions(+), 47 deletions(-) diff --git a/lucene/facet/src/java/org/apache/lucene/facet/Facets.java b/lucene/facet/src/java/org/apache/lucene/facet/Facets.java index 2ea14099a18..1c7572d9443 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/Facets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/Facets.java @@ -48,4 +48,15 @@ public abstract class Facets { * indexed, for example depending on the type of document. */ public abstract List getAllDims(int topN) throws IOException; + + /** + * Returns labels for topN dimensions and their topNChildren sorted by the number of + * hits/aggregated values that dimension matched; Results should be the same as calling getAllDims + * and then only using the first topNDims; Sub-classes may want to override this implementation + * with a more efficient one if they are able. 
+ */ + public List getTopDims(int topNDims, int topNChildren) throws IOException { + List allResults = getAllDims(topNChildren); + return allResults.subList(0, Math.min(topNDims, allResults.size())); + } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java index a1314cfb3fa..9de0cbaeb3e 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.PrimitiveIterator; import org.apache.lucene.facet.FacetResult; @@ -49,6 +50,7 @@ import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LongValues; +import org.apache.lucene.util.PriorityQueue; /** * Compute facets counts from previously indexed {@link SortedSetDocValuesFacetField}, without @@ -137,6 +139,10 @@ public class SortedSetDocValuesFacetCounts extends Facets { } } + /** + * Overloaded method to allow getPathResult be called without passing in the dimToChildOrdsResult + * parameter + */ private FacetResult getPathResult( FacetsConfig.DimConfig dimConfig, String dim, @@ -145,11 +151,55 @@ public class SortedSetDocValuesFacetCounts extends Facets { PrimitiveIterator.OfInt childOrds, int topN) throws IOException { + return getPathResult(dimConfig, dim, path, pathOrd, childOrds, topN, null); + } + + /** Returns path results for a dimension */ + private FacetResult getPathResult( + FacetsConfig.DimConfig dimConfig, + String dim, + String[] path, + int pathOrd, + PrimitiveIterator.OfInt childOrds, + int topN, + ChildOrdsResult 
dimToChildOrdsResult) + throws IOException { + + ChildOrdsResult childOrdsResult; + + // if getTopDims is called, get results from previously stored dimToChildOrdsResult, otherwise + // call getChildOrdsResult to get dimCount, childCount and the queue for the dimension's top + // children + if (dimToChildOrdsResult != null) { + childOrdsResult = dimToChildOrdsResult; + } else { + childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, pathOrd); + } + + if (childOrdsResult.q == null) { + return null; + } + + LabelAndValue[] labelValues = getLabelValuesFromTopOrdAndIntQueue(childOrdsResult.q); + + if (dimConfig.hierarchical == true) { + return new FacetResult( + dim, path, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount); + } else { + return new FacetResult( + dim, emptyPath, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount); + } + } + + /** + * Returns a ChildOrdsResult that contains the dimCount, childCount, and + * the queue for the dimension's top children to populate FacetResult in getPathResult. 
+ */ + private ChildOrdsResult getChildOrdsResult( + PrimitiveIterator.OfInt childOrds, int topN, FacetsConfig.DimConfig dimConfig, int pathOrd) { TopOrdAndIntQueue q = null; - int bottomCount = 0; - int dimCount = 0; int childCount = 0; @@ -178,20 +228,9 @@ public class SortedSetDocValuesFacetCounts extends Facets { } } - if (q == null) { - return null; - } - - LabelAndValue[] labelValues = new LabelAndValue[q.size()]; - for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); - assert ordAndValue != null; - final BytesRef term = dv.lookupOrd(ordAndValue.ord); - String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); - labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); - } - - if (dimConfig.hierarchical == false) { + if (dimConfig.hierarchical == true) { + dimCount = counts[pathOrd]; + } else { // see if dimCount is actually reliable or needs to be reset if (dimConfig.multiValued) { if (dimConfig.requireDimCount) { @@ -200,10 +239,47 @@ public class SortedSetDocValuesFacetCounts extends Facets { dimCount = -1; // dimCount is in accurate at this point, so set it to -1 } } - return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount); - } else { - return new FacetResult(dim, path, counts[pathOrd], labelValues, childCount); } + + return new ChildOrdsResult(dimCount, childCount, q); + } + + /** Returns label values for dims. 
*/ + private LabelAndValue[] getLabelValuesFromTopOrdAndIntQueue(TopOrdAndIntQueue q) + throws IOException { + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + for (int i = labelValues.length - 1; i >= 0; i--) { + TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; + final BytesRef term = dv.lookupOrd(ordAndValue.ord); + String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); + labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); + } + return labelValues; + } + + /** Returns value/count of a dimension. */ + private int getDimValue( + FacetsConfig.DimConfig dimConfig, + String dim, + int dimOrd, + PrimitiveIterator.OfInt childOrds, + int topN, + HashMap dimToChildOrdsResult) { + + // if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at + // indexing time, return dimCount directly + if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) { + return counts[dimOrd]; + } + + // if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount + ChildOrdsResult childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, dimOrd); + + // if no early termination, store dim and childOrdsResult into a hashmap to avoid calling + // getChildOrdsResult again in getPathResult + dimToChildOrdsResult.put(dim, childOrdsResult); + return childOrdsResult.dimCount; } private void countOneSegment( @@ -366,33 +442,53 @@ public class SortedSetDocValuesFacetCounts extends Facets { return counts[ord]; } + /** + * Overloaded method to allow getFacetResultForDim be called without passing in the + * dimToChildOrdsResult parameter + */ + private FacetResult getFacetResultForDim(String dim, int topNChildren) throws IOException { + return getFacetResultForDim(dim, topNChildren, null); + } + + /** Returns FacetResult for a dimension. 
*/ + private FacetResult getFacetResultForDim( + String dim, int topNChildren, ChildOrdsResult dimToChildOrdsResult) throws IOException { + + FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); + + if (dimConfig.hierarchical) { + DimTree dimTree = state.getDimTree(dim); + int dimOrd = dimTree.dimStartOrd; + return getPathResult( + dimConfig, + dim, + emptyPath, + dimOrd, + dimTree.iterator(), + topNChildren, + dimToChildOrdsResult); + } else { + OrdRange ordRange = state.getOrdRange(dim); + int dimOrd = ordRange.start; + PrimitiveIterator.OfInt childIt = ordRange.iterator(); + if (dimConfig.multiValued && dimConfig.requireDimCount) { + // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed + // the dimension and we need to skip past it so the iterator is positioned on the first + // child: + childIt.next(); + } + return getPathResult( + dimConfig, dim, emptyPath, dimOrd, childIt, topNChildren, dimToChildOrdsResult); + } + } + @Override public List getAllDims(int topN) throws IOException { - List results = new ArrayList<>(); for (String dim : state.getDims()) { - FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); - if (dimConfig.hierarchical) { - DimTree dimTree = state.getDimTree(dim); - int dimOrd = dimTree.dimStartOrd; - FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, dimTree.iterator(), topN); - if (fr != null) { - results.add(fr); - } - } else { - OrdRange ordRange = state.getOrdRange(dim); - int dimOrd = ordRange.start; - PrimitiveIterator.OfInt childIt = ordRange.iterator(); - if (dimConfig.multiValued && dimConfig.requireDimCount) { - // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed - // the dimension and we need to skip past it so the iterator is positioned on the first - // child: - childIt.next(); - } - FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, childIt, topN); - if (fr != null) { - results.add(fr); - } + 
FacetResult factResult = getFacetResultForDim(dim, topN); + if (factResult != null) { + results.add(factResult); } } @@ -411,7 +507,114 @@ public class SortedSetDocValuesFacetCounts extends Facets { } } }); - return results; } + + @Override + public List getTopDims(int topNDims, int topNChildren) throws IOException { + if (topNDims <= 0 || topNChildren <= 0) { + throw new IllegalArgumentException("topN must be > 0"); + } + + // Creates priority queue to store top dimensions and sort by their aggregated values/hits and + // string values. + PriorityQueue pq = + new PriorityQueue<>(topNDims) { + @Override + protected boolean lessThan(DimValueResult a, DimValueResult b) { + if (a.value > b.value) { + return false; + } else if (a.value < b.value) { + return true; + } else { + return a.dim.compareTo(b.dim) > 0; + } + } + }; + + HashMap dimToChildOrdsResult = new HashMap<>(); + int dimCount; + + for (String dim : state.getDims()) { + FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); + if (dimConfig.hierarchical) { + DimTree dimTree = state.getDimTree(dim); + int dimOrd = dimTree.dimStartOrd; + // get dim value + dimCount = + getDimValue( + dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, dimToChildOrdsResult); + } else { + OrdRange ordRange = state.getOrdRange(dim); + int dimOrd = ordRange.start; + PrimitiveIterator.OfInt childIt = ordRange.iterator(); + if (dimConfig.multiValued && dimConfig.requireDimCount) { + // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed + // the dimension and we need to skip past it so the iterator is positioned on the first + // child: + childIt.next(); + } + dimCount = getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, dimToChildOrdsResult); + } + + if (dimCount != 0) { + // use priority queue to store DimValueResult for topNDims + if (pq.size() < topNDims) { + pq.add(new DimValueResult(dim, dimCount)); + } else { + if (dimCount > pq.top().value + || (dimCount == 
pq.top().value && dim.compareTo(pq.top().dim) < 0)) { + DimValueResult bottomDim = pq.top(); + bottomDim.dim = dim; + bottomDim.value = dimCount; + pq.updateTop(); + } + } + } + } + + // get FacetResult for topNDims + int resultSize = pq.size(); + FacetResult[] results = new FacetResult[resultSize]; + + while (pq.size() > 0) { + DimValueResult dimValueResult = pq.pop(); + FacetResult facetResult = + getFacetResultForDim( + dimValueResult.dim, topNChildren, dimToChildOrdsResult.get(dimValueResult.dim)); + resultSize--; + results[resultSize] = facetResult; + } + return Arrays.asList(results); + } + + /** + * Creates ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top + * children + */ + private static class ChildOrdsResult { + final int dimCount; + final int childCount; + final TopOrdAndIntQueue q; + + ChildOrdsResult(int dimCount, int childCount, TopOrdAndIntQueue q) { + this.dimCount = dimCount; + this.childCount = childCount; + this.q = q; + } + } + + /** + * Creates DimValueResult to store the label and value of dim in order to sort by these two + * fields. 
+ */ + private static class DimValueResult { + String dim; + int value; + + DimValueResult(String dim, int value) { + this.dim = dim; + this.value = value; + } + } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java index 857fe3f9e3d..1fe741466e6 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java @@ -358,6 +358,19 @@ public class TestDrillSideways extends FacetTestCase { "dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n 2012 (1)\n", allResults.get(1).toString()); + // test default implementation of getTopDims + List topNDimsResult = r.facets.getTopDims(2, 1); + assertEquals(2, topNDimsResult.size()); + assertEquals( + "dim=Author path=[] value=5 childCount=4\n Lisa (2)\n", topNDimsResult.get(0).toString()); + assertEquals( + "dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n", + topNDimsResult.get(1).toString()); + + // test getTopDims(0, 1) + List topDimsResults2 = r.facets.getTopDims(0, 1); + assertEquals(0, topDimsResults2.size()); + // More interesting case: drill-down on two fields ddq = new DrillDownQuery(config); ddq.add("Author", "Lisa"); @@ -581,6 +594,17 @@ public class TestDrillSideways extends FacetTestCase { "dim=Publish Date path=[] value=3 childCount=2\n 2010 (2)\n 2012 (1)\n", allResults.get(1).toString()); + // test default implementation of getTopDims + List topNDimsResult = r.facets.getTopDims(1, 2); + assertEquals(1, topNDimsResult.size()); + assertEquals( + "dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Susan (1)\n", + topNDimsResult.get(0).toString()); + + // test getTopDims(10, 10) and expect same results from getAllDims(10) + List allDimsResults = r.facets.getTopDims(10, 10); + assertEquals(allResults, allDimsResults); + // More interesting case: drill-down on two fields ddq = new DrillDownQuery(config); 
ddq.add("Author", "Lisa"); @@ -1843,6 +1867,13 @@ public class TestDrillSideways extends FacetTestCase { "dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n", allResults.get(0).toString()); + // test default implementation of getTopDims + List topNDimsResult = facets.getTopDims(1, 2); + assertEquals(1, topNDimsResult.size()); + assertEquals( + "dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Susan (1)\n", + topNDimsResult.get(0).toString()); + // More interesting case: drill-down on two fields ddq = new DrillDownQuery(config); ddq.add("Author", "Lisa"); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java index b61da63510a..1e4d2be5eaa 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java @@ -155,6 +155,21 @@ public class TestLongValueFacetCounts extends LuceneTestCase { "dim=field path=[] value=101 childCount=6\n 0 (20)\n 1 (20)\n 2 (20)\n " + "3 (20)\n 4 (20)\n 9223372036854775807 (1)\n", result.get(0).toString()); + + // test default implementation of getTopDims + List getTopDimResult = facets.getTopDims(1, 1); + assertEquals(1, getTopDimResult.size()); + assertEquals( + "dim=field path=[] value=101 childCount=6\n 0 (20)\n", getTopDimResult.get(0).toString()); + + // test getTopDims(10, 10) and expect same results from getAllDims(10) + List allDimsResults = facets.getTopDims(10, 10); + assertEquals(result, allDimsResults); + + // test getTopDims(0, 1) + List topDimsResults2 = facets.getTopDims(0, 1); + assertEquals(0, topDimsResults2.size()); + r.close(); d.close(); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java index 765c6759260..474d39dc11b 100644 --- 
a/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestStringValueFacetCounts.java @@ -387,6 +387,11 @@ public class TestStringValueFacetCounts extends FacetTestCase { assertEquals(1, allDims.size()); assertEquals(facetResult, allDims.get(0)); + // test default implementation of getTopDims + List topNDimsResult = facets.getTopDims(2, topN); + assertEquals(1, topNDimsResult.size()); + assertEquals(facetResult, topNDimsResult.get(0)); + // This is a little strange, but we request all labels at this point so that when we // secondarily sort by label value in order to compare to the expected results, we have // all the values. See LUCENE-9991: diff --git a/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeFacetCounts.java index 758ca3cb673..d78160b2c22 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeFacetCounts.java @@ -243,6 +243,21 @@ public class TestRangeFacetCounts extends FacetTestCase { "dim=field path=[] value=22 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (1)\n", result.get(0).toString()); + // test getAllDims(1) + List test1Child = facets.getAllDims(1); + assertEquals(1, test1Child.size()); + assertEquals( + "dim=field path=[] value=22 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (1)\n", + test1Child.get(0).toString()); + + // test default implementation of getTopDims + List topNDimsResult = facets.getTopDims(1, 1); + assertEquals(test1Child, topNDimsResult); + + // test getTopDims(0, 1) + List topDimsResults2 = facets.getTopDims(0, 1); + assertEquals(0, topDimsResults2.size()); + r.close(); d.close(); } diff --git 
a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java index 355e73ddc1c..ee9a9c8ae24 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java @@ -104,6 +104,41 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { "dim=b path=[] value=2 childCount=2\n buzz (2)\n baz (1)\n", facets.getTopChildren(10, "b").toString()); + // test getAllDims + List results = facets.getAllDims(10); + assertEquals(2, results.size()); + assertEquals( + "dim=b path=[] value=2 childCount=2\n buzz (2)\n baz (1)\n", + results.get(0).toString()); + assertEquals( + "dim=a path=[] value=-1 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n", + results.get(1).toString()); + + // test getTopDims(10, 10) and expect same results from getAllDims(10) + List allDimsResults = facets.getTopDims(10, 10); + assertEquals(results, allDimsResults); + + // test getTopDims(2, 1) + List topDimsResults = facets.getTopDims(2, 1); + assertEquals(2, topDimsResults.size()); + assertEquals( + "dim=b path=[] value=2 childCount=2\n buzz (2)\n", topDimsResults.get(0).toString()); + assertEquals( + "dim=a path=[] value=-1 childCount=3\n foo (2)\n", topDimsResults.get(1).toString()); + + // test getAllDims + List results2 = facets.getAllDims(1); + assertEquals(2, results2.size()); + assertEquals( + "dim=b path=[] value=2 childCount=2\n buzz (2)\n", results2.get(0).toString()); + + // test getTopDims(1, 1) + List topDimsResults1 = facets.getTopDims(1, 1); + assertEquals(1, topDimsResults1.size()); + assertEquals( + "dim=b path=[] value=2 childCount=2\n buzz (2)\n", + topDimsResults1.get(0).toString()); + // DrillDown: DrillDownQuery q = new DrillDownQuery(config); q.add("a", "foo"); @@ -117,6 +152,149 @@ public class TestSortedSetDocValuesFacets 
extends FacetTestCase { } } + // test tricky combinations of the three configs: MultiValued, Hierarchical, and RequireDimCount of + // a dim + public void testCombinationsOfConfig() throws Exception { + FacetsConfig config = new FacetsConfig(); + + // case 1: dimension "a" is hierarchical and non-multiValued + // expect returns counts[pathOrd] + config.setMultiValued("a", false); + config.setHierarchical("a", true); + + // case 2: dimension "b" is hierarchical and multiValued and setRequireDimCount = true + // expect returns counts[pathOrd] + config.setMultiValued("b", true); + config.setHierarchical("b", true); + config.setRequireDimCount("b", true); + + // case 3: dimension "c" is hierarchical and multiValued and setRequireDimCount != true + // expect always returns counts[pathOrd] for Hierarchical = true + config.setMultiValued("c", true); + config.setHierarchical("c", true); + + // case 4: dimension "d" is non-hierarchical but multiValued and setRequireDimCount = true + // expect returns counts[pathOrd] + config.setMultiValued("d", true); + config.setHierarchical("d", false); + config.setRequireDimCount("d", true); + + // case 5: dimension "e" is non-hierarchical and multiValued and setRequireDimCount = false + // expect returns -1, this is the only case that we reset dimCount to -1 + config.setMultiValued("e", true); + config.setHierarchical("e", false); + config.setRequireDimCount("e", false); + + // case 6: dimension "f" is non-hierarchical and non-multiValued and expect returns + // counts[pathOrd] + config.setMultiValued("f", false); + config.setHierarchical("f", false); + + // case 7: expect returns counts[pathOrd] for dims with setHierarchical = true + config.setHierarchical("g", true); + + // case 8: expect returns a regular dim count for dims with setHierarchical = false + config.setHierarchical("g-2", false); + + // case 9: expect returns counts[pathOrd] for multiValued dims that requireDimCount + config.setRequireDimCount("h", true); +
config.setMultiValued("h", true); + + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo")); + doc.add(new SortedSetDocValuesFacetField("b", "bar")); + doc.add(new SortedSetDocValuesFacetField("c", "zoo")); + doc.add(new SortedSetDocValuesFacetField("d", "baz")); + doc.add(new SortedSetDocValuesFacetField("e", "buzz")); + doc.add(new SortedSetDocValuesFacetField("f", "buzze")); + doc.add(new SortedSetDocValuesFacetField("g", "buzzel")); + doc.add(new SortedSetDocValuesFacetField("g-2", "buzzell")); + doc.add(new SortedSetDocValuesFacetField("h", "buzzele")); + writer.addDocument(config.build(doc)); + + // NRT open + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = + new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + + ExecutorService exec = randomExecutorServiceOrNull(); + try { + Facets facets = getAllFacets(searcher, state, exec); + assertEquals( + "dim=a path=[] value=1 childCount=1\n foo (1)\n", + facets.getTopChildren(10, "a").toString()); + // value for dim b should be 1 since it's multivalued but _does_ require dim counts: + assertEquals( + "dim=b path=[] value=1 childCount=1\n bar (1)\n", + facets.getTopChildren(10, "b").toString()); + assertEquals( + "dim=c path=[] value=1 childCount=1\n zoo (1)\n", + facets.getTopChildren(10, "c").toString()); + assertEquals( + "dim=d path=[] value=1 childCount=1\n baz (1)\n", + facets.getTopChildren(10, "d").toString()); + // value for dim e should be -1 since it's multivalued but doesn't require dim counts: + assertEquals( + "dim=e path=[] value=-1 childCount=1\n buzz (1)\n", + facets.getTopChildren(10, "e").toString()); + assertEquals( + "dim=f path=[] value=1 childCount=1\n buzze (1)\n", + facets.getTopChildren(10, "f").toString()); + assertEquals( 
+ "dim=g path=[] value=1 childCount=1\n buzzel (1)\n", + facets.getTopChildren(10, "g").toString()); + assertEquals( + "dim=g-2 path=[] value=1 childCount=1\n buzzell (1)\n", + facets.getTopChildren(10, "g-2").toString()); + assertEquals( + "dim=h path=[] value=1 childCount=1\n buzzele (1)\n", + facets.getTopChildren(10, "h").toString()); + + // test getAllDims + List results = facets.getAllDims(10); + assertEquals(9, results.size()); + assertEquals( + "dim=a path=[] value=1 childCount=1\n foo (1)\n", results.get(0).toString()); + assertEquals( + "dim=b path=[] value=1 childCount=1\n bar (1)\n", results.get(1).toString()); + assertEquals( + "dim=c path=[] value=1 childCount=1\n zoo (1)\n", results.get(2).toString()); + assertEquals( + "dim=d path=[] value=1 childCount=1\n baz (1)\n", results.get(3).toString()); + assertEquals( + "dim=f path=[] value=1 childCount=1\n buzze (1)\n", results.get(4).toString()); + assertEquals( + "dim=g path=[] value=1 childCount=1\n buzzel (1)\n", results.get(5).toString()); + assertEquals( + "dim=g-2 path=[] value=1 childCount=1\n buzzell (1)\n", results.get(6).toString()); + assertEquals( + "dim=h path=[] value=1 childCount=1\n buzzele (1)\n", results.get(7).toString()); + assertEquals( + "dim=e path=[] value=-1 childCount=1\n buzz (1)\n", results.get(8).toString()); + + // test getTopDims(10, 10) and expect same results from getAllDims(10) + List allTopDimsResults = facets.getTopDims(10, 10); + assertEquals(results, allTopDimsResults); + + // test getTopDims(n, 10) + if (allTopDimsResults.size() > 0) { + for (int i = 1; i < results.size(); i++) { + assertEquals(results.subList(0, i), facets.getTopDims(i, 10)); + } + } + + } finally { + if (exec != null) exec.shutdownNow(); + } + } + } + } + public void testBasicHierarchical() throws Exception { FacetsConfig config = new FacetsConfig(); config.setMultiValued("a", true); @@ -702,6 +880,11 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { doc.add(new 
SortedSetDocValuesFacetField("a", "foo3")); doc.add(new SortedSetDocValuesFacetField("b", "bar2")); doc.add(new SortedSetDocValuesFacetField("c", "baz1")); + doc.add(new SortedSetDocValuesFacetField("d", "biz1")); + writer.addDocument(config.build(doc)); + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("d", "biz2")); writer.addDocument(config.build(doc)); // NRT open @@ -719,7 +902,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { // Ask for top 10 labels for any dims that have counts: List results = facets.getAllDims(10); - assertEquals(3, results.size()); + assertEquals(4, results.size()); assertEquals( "dim=a path=[] value=3 childCount=3\n foo1 (1)\n foo2 (1)\n foo3 (1)\n", results.get(0).toString()); @@ -727,7 +910,42 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { "dim=b path=[] value=2 childCount=2\n bar1 (1)\n bar2 (1)\n", results.get(1).toString()); assertEquals( - "dim=c path=[] value=1 childCount=1\n baz1 (1)\n", results.get(2).toString()); + "dim=d path=[] value=2 childCount=2\n biz1 (1)\n biz2 (1)\n", + results.get(2).toString()); + assertEquals( + "dim=c path=[] value=1 childCount=1\n baz1 (1)\n", results.get(3).toString()); + + // test getAllDims with topN = 1, sort by dim names when values are equal + List top1results = facets.getAllDims(1); + assertEquals(4, results.size()); + assertEquals( + "dim=a path=[] value=3 childCount=3\n foo1 (1)\n", top1results.get(0).toString()); + assertEquals( + "dim=b path=[] value=2 childCount=2\n bar1 (1)\n", top1results.get(1).toString()); + assertEquals( + "dim=d path=[] value=2 childCount=2\n biz1 (1)\n", top1results.get(2).toString()); + assertEquals( + "dim=c path=[] value=1 childCount=1\n baz1 (1)\n", top1results.get(3).toString()); + + // test getTopDims(1, 1) + List topDimsResults1 = facets.getTopDims(1, 1); + assertEquals(1, topDimsResults1.size()); + assertEquals( + "dim=a path=[] value=3 childCount=3\n foo1 (1)\n", + 
topDimsResults1.get(0).toString()); + + // test top 2 dims that have the same counts, expect to sort by dim names + List topDimsResults2 = facets.getTopDims(3, 2); + assertEquals(3, topDimsResults2.size()); + assertEquals( + "dim=a path=[] value=3 childCount=3\n foo1 (1)\n foo2 (1)\n", + topDimsResults2.get(0).toString()); + assertEquals( + "dim=b path=[] value=2 childCount=2\n bar1 (1)\n bar2 (1)\n", + topDimsResults2.get(1).toString()); + assertEquals( + "dim=d path=[] value=2 childCount=2\n biz1 (1)\n biz2 (1)\n", + topDimsResults2.get(2).toString()); Collection resources = state.getChildResources(); assertTrue(state.toString().contains(FacetsConfig.DEFAULT_INDEX_FIELD_NAME)); @@ -795,6 +1013,12 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { assertEquals( "dim=e path=[] value=1 childCount=1\n biz (1)\n", results.get(1).toString()); + // test getTopDims(1, 1) + List topDimsResults1 = facets.getTopDims(1, 1); + assertEquals(1, topDimsResults1.size()); + assertEquals( + "dim=d path=[] value=2 childCount=1\n foo (2)\n", results.get(0).toString()); + Collection resources = state.getChildResources(); assertTrue(state.toString().contains(FacetsConfig.DEFAULT_INDEX_FIELD_NAME)); if (searcher.getIndexReader().leaves().size() > 1) { @@ -1012,6 +1236,12 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { // sortTies(actual); assertEquals(expected, actual); + + // test getTopDims(1, 10) + if (actual.size() > 0) { + List topDimsResults1 = facets.getTopDims(1, 10); + assertEquals(actual.get(0), topDimsResults1.get(0)); + } } } finally { if (exec != null) exec.shutdownNow(); @@ -1221,6 +1451,13 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { assertEquals(expectedAllDims, actualAllDims); + // test getTopDims(n, 10) + if (actualAllDims.size() > 0) { + for (int i = 1; i < actualAllDims.size(); i++) { + assertEquals(actualAllDims.subList(0, i), facets.getTopDims(i, 10)); + } + } + // Dfs through top children for 
(FacetResult dimResult : actualAllDims) { if (config.getDimConfig(dimResult.dim).hierarchical) { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java index 82a1be52c74..07faa52202d 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java @@ -163,6 +163,7 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { Document doc = new Document(); doc.add(new FacetField("a", "foo1")); + doc.add(new FacetField("b", "aar1")); writer.addDocument(config.build(taxoWriter, doc)); if (random().nextBoolean()) { @@ -201,9 +202,41 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { "dim=a path=[] value=3 childCount=3\n foo1 (1)\n foo2 (1)\n foo3 (1)\n", results.get(0).toString()); assertEquals( - "dim=b path=[] value=2 childCount=2\n bar1 (1)\n bar2 (1)\n", results.get(1).toString()); + "dim=b path=[] value=3 childCount=3\n aar1 (1)\n bar1 (1)\n bar2 (1)\n", + results.get(1).toString()); assertEquals("dim=c path=[] value=1 childCount=1\n baz1 (1)\n", results.get(2).toString()); + // test getAllDims with topN = 1, sort by dim names when values are equal + List top1results = facets.getAllDims(1); + + assertEquals(3, results.size()); + assertEquals("dim=a path=[] value=3 childCount=3\n foo3 (1)\n", top1results.get(0).toString()); + assertEquals("dim=b path=[] value=3 childCount=3\n bar2 (1)\n", top1results.get(1).toString()); + assertEquals("dim=c path=[] value=1 childCount=1\n baz1 (1)\n", top1results.get(2).toString()); + + // test default implementation of getTopDims + List topNDimsResult = facets.getTopDims(2, 1); + assertEquals(2, topNDimsResult.size()); + assertEquals( + "dim=a path=[] value=3 childCount=3\n foo3 (1)\n", topNDimsResult.get(0).toString()); + assertEquals( + "dim=b path=[] value=3 
childCount=3\n bar2 (1)\n", topNDimsResult.get(1).toString()); + + // test getTopDims(10, 10) and expect same results from getAllDims(10) + List allDimsResults = facets.getTopDims(10, 10); + assertEquals(results, allDimsResults); + + // test getTopDims(0, 1) + List topDimsResults2 = facets.getTopDims(0, 1); + assertEquals(0, topDimsResults2.size()); + + // test getTopDims(1, 0) with topNChildren = 0 + expectThrows( + IllegalArgumentException.class, + () -> { + facets.getTopDims(1, 0); + }); + writer.close(); IOUtils.close(taxoWriter, searcher.getIndexReader(), taxoReader, taxoDir, dir); } @@ -590,10 +623,28 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { Facets facets = getAllFacets(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, newSearcher(r), taxoReader, config); - for (FacetResult result : facets.getAllDims(10)) { + List allDimsResult = facets.getAllDims(10); + for (FacetResult result : allDimsResult) { assertEquals(r.numDocs(), result.value.intValue()); } + // test default implementation of getTopDims + if (allDimsResult.size() > 0) { + List topNDimsResult = facets.getTopDims(1, 10); + assertEquals(allDimsResult.get(0), topNDimsResult.get(0)); + } + + // test getTopDims(0, 1) + List topDimsResults2 = facets.getTopDims(0, 1); + assertEquals(0, topDimsResults2.size()); + + // test getTopDims(1, 0) with topNChildren = 0 + expectThrows( + IllegalArgumentException.class, + () -> { + facets.getTopDims(1, 0); + }); + iw.close(); IOUtils.close(taxoWriter, taxoReader, taxoDir, r, indexDir); } @@ -623,6 +674,12 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { assertEquals( "calling getFacetResults twice should return the .equals()=true result", res1, res2); + // test default implementation of getTopDims + if (res1.size() > 0) { + List topNDimsResult = facets.getTopDims(1, 10); + assertEquals(res1.get(0), topNDimsResult.get(0)); + } + iw.close(); IOUtils.close(taxoWriter, taxoReader, taxoDir, r, indexDir); } @@ -809,6 +866,12 @@ public class 
TestTaxonomyFacetCounts extends FacetTestCase { assertEquals(expected, actual); + // test default implementation of getTopDims + + List topNDimsResult = facets.getTopDims(actual.size(), 10); + sortTies(topNDimsResult); + assertEquals(actual, topNDimsResult); + // Test facet labels for each matching test doc List> actualLabels = getAllTaxonomyFacetLabels(null, tr, fc); assertEquals(expectedLabels.size(), actualLabels.size()); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetSumValueSource.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetSumValueSource.java index 654a95bc2b2..cdb624b1cd6 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetSumValueSource.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetSumValueSource.java @@ -195,6 +195,29 @@ public class TestTaxonomyFacetSumValueSource extends FacetTestCase { assertEquals( "dim=c path=[] value=30.0 childCount=1\n baz1 (30.0)\n", results.get(2).toString()); + // test default implementation of getTopDims + List topNDimsResult = facets.getTopDims(2, 1); + assertEquals(2, topNDimsResult.size()); + assertEquals( + "dim=a path=[] value=60.0 childCount=3\n foo3 (30.0)\n", topNDimsResult.get(0).toString()); + assertEquals( + "dim=b path=[] value=50.0 childCount=2\n bar2 (30.0)\n", topNDimsResult.get(1).toString()); + + // test getTopDims(10, 10) and expect same results from getAllDims(10) + List allDimsResults = facets.getTopDims(10, 10); + assertEquals(results, allDimsResults); + + // test getTopDims(0, 1) + List topDimsResults2 = facets.getTopDims(0, 1); + assertEquals(0, topDimsResults2.size()); + + // test getTopDims(1, 0) with topNChildren = 0 + expectThrows( + IllegalArgumentException.class, + () -> { + facets.getTopDims(1, 0); + }); + IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir); } @@ -237,6 +260,10 @@ public class TestTaxonomyFacetSumValueSource extends 
FacetTestCase { List results = facets.getAllDims(10); assertTrue(results.isEmpty()); + // test default implementation of getTopDims + List topDimsResults = facets.getTopDims(10, 10); + assertTrue(topDimsResults.isEmpty()); + expectThrows( IllegalArgumentException.class, () -> { @@ -510,6 +537,12 @@ public class TestTaxonomyFacetSumValueSource extends FacetTestCase { List actual = facets.getAllDims(10); + // test default implementation of getTopDims + if (actual.size() > 0) { + List topDimsResults1 = facets.getTopDims(1, 10); + assertEquals(actual.get(0), topDimsResults1.get(0)); + } + // Messy: fixup ties sortTies(actual);