From 8db1e41fc072920dc1c6554f5859d4dd623dace6 Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Sun, 29 May 2022 01:26:51 -0700 Subject: [PATCH] LUCENE-10585: Scrub copy/paste code in the facets module and attempt to simplify a bit (#915) --- lucene/CHANGES.txt | 4 +- .../java/org/apache/lucene/facet/Facets.java | 10 +- .../AbstractSortedSetDocValueFacetCounts.java | 349 ++++++++++++++++ ...ncurrentSortedSetDocValuesFacetCounts.java | 393 +----------------- .../SortedSetDocValuesFacetCounts.java | 392 +---------------- .../facet/taxonomy/FloatTaxonomyFacets.java | 183 ++++---- .../facet/taxonomy/IntTaxonomyFacets.java | 216 +++++----- .../lucene/facet/taxonomy/TaxonomyFacets.java | 3 +- 8 files changed, 544 insertions(+), 1006 deletions(-) create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4858e3082e7..603edacc0b8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -75,7 +75,9 @@ New Features Improvements --------------------- -(No changes) + +* LUCENE-10585: Facet module code cleanup (copy/paste scrubbing, simplification and some very minor + optimization tweaks). (Greg Miller) Optimizations --------------------- diff --git a/lucene/facet/src/java/org/apache/lucene/facet/Facets.java b/lucene/facet/src/java/org/apache/lucene/facet/Facets.java index 7ceb02518a7..a11dd6e8a64 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/Facets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/Facets.java @@ -51,9 +51,13 @@ public abstract class Facets { /** * Returns labels for topN dimensions and their topNChildren sorted by the number of - * hits/aggregated values that dimension matched; Results should be the same as calling getAllDims - * and then only using the first topNDims; Sub-classes may want to override this implementation - * with a more efficient one if they are able. + * hits/aggregated values that dimension matched. Results should be the same as calling getAllDims + * and then only using the first topNDims. Note that dims should be configured as requiring dim + * counts if using this functionality to ensure accurate counts are available (see: {@link + * FacetsConfig#setRequireDimCount(String, boolean)}). + * + *

Sub-classes may want to override this implementation with a more efficient one if they are + * able. */ public List getTopDims(int topNDims, int topNChildren) throws IOException { List allResults = getAllDims(topNChildren); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java new file mode 100644 index 00000000000..d0bbf7699bb --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/AbstractSortedSetDocValueFacetCounts.java @@ -0,0 +1,349 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet.sortedset; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.PrimitiveIterator; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.Facets; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.FacetsConfig.DimConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.TopOrdAndIntQueue; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.DimTree; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PriorityQueue; + +/** Base class for SSDV faceting implementations. */ +abstract class AbstractSortedSetDocValueFacetCounts extends Facets { + + private static final Comparator FACET_RESULT_COMPARATOR = + new Comparator<>() { + @Override + public int compare(FacetResult a, FacetResult b) { + if (a.value.intValue() > b.value.intValue()) { + return -1; + } else if (b.value.intValue() > a.value.intValue()) { + return 1; + } else { + return a.dim.compareTo(b.dim); + } + } + }; + + final SortedSetDocValuesReaderState state; + final FacetsConfig stateConfig; + final SortedSetDocValues dv; + final String field; + + AbstractSortedSetDocValueFacetCounts(SortedSetDocValuesReaderState state) throws IOException { + this.state = state; + this.field = state.getField(); + this.stateConfig = state.getFacetsConfig(); + this.dv = state.getDocValues(); + } + + @Override + public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { + validateTopN(topN); + TopChildrenForPath topChildrenForPath = getTopChildrenForPath(topN, dim, path); + return createFacetResult(topChildrenForPath, dim, path); + } + + @Override + public Number getSpecificValue(String dim, String... path) throws IOException { + if (path.length != 1) { + throw new IllegalArgumentException("path must be length=1"); + } + int ord = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path))); + if (ord < 0) { + return -1; + } + + return getCount(ord); + } + + @Override + public List getAllDims(int topN) throws IOException { + validateTopN(topN); + List results = new ArrayList<>(); + for (String dim : state.getDims()) { + TopChildrenForPath topChildrenForPath = getTopChildrenForPath(topN, dim); + FacetResult facetResult = createFacetResult(topChildrenForPath, dim); + if (facetResult != null) { + results.add(facetResult); + } + } + + // Sort by highest count: + results.sort(FACET_RESULT_COMPARATOR); + return results; + } + + @Override + public List getTopDims(int topNDims, int topNChildren) throws IOException { + validateTopN(topNDims); + validateTopN(topNChildren); + + // Creates priority queue to store top dimensions and sort by their aggregated values/hits and + // string values. + PriorityQueue pq = + new PriorityQueue<>(topNDims) { + @Override + protected boolean lessThan(DimValue a, DimValue b) { + if (a.value > b.value) { + return false; + } else if (a.value < b.value) { + return true; + } else { + return a.dim.compareTo(b.dim) > 0; + } + } + }; + + // Keep track of intermediate results, if we compute them, so we can reuse them later: + Map intermediateResults = null; + + for (String dim : state.getDims()) { + DimConfig dimConfig = stateConfig.getDimConfig(dim); + int dimCount; + if (dimConfig.hierarchical) { + // For hierarchical dims, we directly index each level of the ancestry path (i.e., we + // "rollup" at indexing time), meaning we can directly access accurate dim counts without + // needing to rollup the descendant paths: + int dimOrd = state.getDimTree(dim).dimStartOrd; + dimCount = getCount(dimOrd); + } else { + OrdRange ordRange = state.getOrdRange(dim); + int dimOrd = ordRange.start; + if (dimConfig.multiValued) { + if (dimConfig.requireDimCount) { + // If a dim is configured as multi-valued and requires dim count, we index dim counts + // directly, so we can access accurate counts without needing to rollup children: + dimCount = getCount(dimOrd); + } else { + // If a dim is configured as multi-valued but _not_ requiring dim count, we have no + // way to get accurate counts. We use -1 to indicate this: + dimCount = -1; + } + } else { + // If a dim is single-valued, we must aggregate child counts to get accurate dim counts. + // We don't index the dim counts directly: + // TODO: If getTopDims becomes a common use-case, we could consider always indexing dim + // counts to optimize this path. + PrimitiveIterator.OfInt childIt = ordRange.iterator(); + TopChildrenForPath topChildrenForPath = + computeTopChildren(childIt, topNChildren, dimConfig, dimOrd); + if (intermediateResults == null) { + intermediateResults = new HashMap<>(); + } + intermediateResults.put(dim, topChildrenForPath); + dimCount = topChildrenForPath.pathCount; + } + } + + if (dimCount != 0) { + if (pq.size() < topNDims) { + pq.add(new DimValue(dim, dimCount)); + } else { + if (dimCount > pq.top().value + || (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { + DimValue bottomDim = pq.top(); + bottomDim.dim = dim; + bottomDim.value = dimCount; + pq.updateTop(); + } + } + } + } + + int resultSize = pq.size(); + FacetResult[] results = new FacetResult[resultSize]; + + while (pq.size() > 0) { + DimValue dimValue = pq.pop(); + assert dimValue != null; + TopChildrenForPath topChildrenForPath = null; + if (intermediateResults != null) { + topChildrenForPath = intermediateResults.get(dimValue.dim); + } + if (topChildrenForPath == null) { + topChildrenForPath = getTopChildrenForPath(topNChildren, dimValue.dim); + } + FacetResult facetResult = createFacetResult(topChildrenForPath, dimValue.dim); + // should not be null since only dims with non-zero values were considered earlier + assert facetResult != null; + resultSize--; + results[resultSize] = facetResult; + } + return Arrays.asList(results); + } + + /** Retrieve the count for a specified ordinal. */ + abstract int getCount(int ord); + + /** + * Compute the top-n children for the given path and iterator of all immediate children of the + * path. This returns an intermediate result that does the minimal required work, avoiding the + * cost of looking up string labels, etc. + */ + TopChildrenForPath computeTopChildren( + PrimitiveIterator.OfInt childOrds, int topN, DimConfig dimConfig, int pathOrd) { + TopOrdAndIntQueue q = null; + int bottomCount = 0; + int pathCount = 0; + int childCount = 0; + + TopOrdAndIntQueue.OrdAndValue reuse = null; + while (childOrds.hasNext()) { + int ord = childOrds.next(); + int count = getCount(ord); + if (count > 0) { + pathCount += count; + childCount++; + if (count > bottomCount) { + if (reuse == null) { + reuse = new TopOrdAndIntQueue.OrdAndValue(); + } + reuse.ord = ord; + reuse.value = count; + if (q == null) { + // Lazy init, so we don't create this for the + // sparse case unnecessarily + q = new TopOrdAndIntQueue(topN); + } + reuse = q.insertWithOverflow(reuse); + if (q.size() == topN) { + bottomCount = q.top().value; + } + } + } + } + + if (dimConfig.hierarchical) { + pathCount = getCount(pathOrd); + } else { + // see if pathCount is actually reliable or needs to be reset + if (dimConfig.multiValued) { + if (dimConfig.requireDimCount) { + pathCount = getCount(pathOrd); + } else { + pathCount = -1; // pathCount is inaccurate at this point, so set it to -1 + } + } + } + + return new TopChildrenForPath(pathCount, childCount, q); + } + + /** + * Determine the top-n children for a specified dimension + path. Results are in an intermediate + * form. + */ + TopChildrenForPath getTopChildrenForPath(int topN, String dim, String... path) + throws IOException { + FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); + + // Determine the path ord and resolve an iterator to its immediate children. The logic for this + // depends on whether-or-not the dimension is configured as hierarchical: + final int pathOrd; + final PrimitiveIterator.OfInt childIterator; + if (dimConfig.hierarchical) { + DimTree dimTree = state.getDimTree(dim); + if (path.length > 0) { + pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path))); + } else { + // If there's no path, this is a little more efficient to just look up the dim: + pathOrd = dimTree.dimStartOrd; + } + if (pathOrd < 0) { + // path was never indexed + return null; + } + childIterator = dimTree.iterator(pathOrd); + } else { + if (path.length > 0) { + throw new IllegalArgumentException( + "Field is not configured as hierarchical, path should be 0 length"); + } + OrdRange ordRange = state.getOrdRange(dim); + if (ordRange == null) { + // means dimension was never indexed + return null; + } + pathOrd = ordRange.start; + childIterator = ordRange.iterator(); + if (dimConfig.multiValued && dimConfig.requireDimCount) { + // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed + // the dimension and we need to skip past it so the iterator is positioned on the first + // child: + childIterator.next(); + } + } + + // Compute the actual results: + return computeTopChildren(childIterator, topN, dimConfig, pathOrd); + } + + /** + * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work + * of resolving ordinals -> labels, etc. Will return null if there are no children. + */ + FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... path) + throws IOException { + // If the intermediate result is null or there are no children, we return null: + if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { + return null; + } + + TopOrdAndIntQueue q = topChildrenForPath.q; + assert q != null; + + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + for (int i = labelValues.length - 1; i >= 0; i--) { + TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; + final BytesRef term = dv.lookupOrd(ordAndValue.ord); + String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); + labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); + } + + return new FacetResult( + dim, path, topChildrenForPath.pathCount, labelValues, topChildrenForPath.childCount); + } + + /** Intermediate result to store top children for a given path before resolving labels, etc. */ + record TopChildrenForPath(int pathCount, int childCount, TopOrdAndIntQueue q) {} + + static final class DimValue { + String dim; + int value; + + DimValue(String dim, int value) { + this.dim = dim; + this.value = value; + } + } +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java index 59cef70ebfb..346851e2f1b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java @@ -19,25 +19,15 @@ package org.apache.lucene.facet.sortedset; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; import java.util.List; -import java.util.PrimitiveIterator; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicIntegerArray; -import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetUtils; -import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsCollector.MatchingDocs; -import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.facet.LabelAndValue; -import org.apache.lucene.facet.TopOrdAndIntQueue; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; @@ -52,27 +42,19 @@ import org.apache.lucene.search.ConjunctionUtils; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LongValues; -import org.apache.lucene.util.PriorityQueue; /** * Like {@link SortedSetDocValuesFacetCounts}, but aggregates counts concurrently across segments. * * @lucene.experimental */ -public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { +public class ConcurrentSortedSetDocValuesFacetCounts extends AbstractSortedSetDocValueFacetCounts { final ExecutorService exec; - final SortedSetDocValuesReaderState state; - final FacetsConfig stateConfig; - final SortedSetDocValues dv; - final String field; final AtomicIntegerArray counts; - private static final String[] emptyPath = new String[0]; - /** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. */ public ConcurrentSortedSetDocValuesFacetCounts( SortedSetDocValuesReaderState state, ExecutorService exec) @@ -84,11 +66,8 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { public ConcurrentSortedSetDocValuesFacetCounts( SortedSetDocValuesReaderState state, FacetsCollector hits, ExecutorService exec) throws IOException, InterruptedException { - this.state = state; - this.field = state.getField(); - this.stateConfig = state.getFacetsConfig(); + super(state); this.exec = exec; - dv = state.getDocValues(); counts = new AtomicIntegerArray(state.getSize()); if (hits == null) { // browse only @@ -99,181 +78,8 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { } @Override - public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { - validateTopN(topN); - FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); - - if (dimConfig.hierarchical) { - int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path))); - if (pathOrd < 0) { - // path was never indexed - return null; - } - SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim); - return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN); - } else { - if (path.length > 0) { - throw new IllegalArgumentException( - "Field is not configured as hierarchical, path should be 0 length"); - } - OrdRange ordRange = state.getOrdRange(dim); - if (ordRange == null) { - // means dimension was never indexed - return null; - } - int dimOrd = ordRange.start; - PrimitiveIterator.OfInt childIt = ordRange.iterator(); - if (dimConfig.multiValued && dimConfig.requireDimCount) { - // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed - // the dimension and we need to skip past it so the iterator is positioned on the first - // child: - childIt.next(); - } - return getPathResult(dimConfig, dim, null, dimOrd, childIt, topN); - } - } - - /** - * Overloaded method to allow getPathResult be called without passing in the dimToChildOrdsResult - * parameter - */ - private FacetResult getPathResult( - FacetsConfig.DimConfig dimConfig, - String dim, - String[] path, - int pathOrd, - PrimitiveIterator.OfInt childOrds, - int topN) - throws IOException { - return getPathResult(dimConfig, dim, path, pathOrd, childOrds, topN, null); - } - - /** Returns path results for a dimension */ - private FacetResult getPathResult( - FacetsConfig.DimConfig dimConfig, - String dim, - String[] path, - int pathOrd, - PrimitiveIterator.OfInt childOrds, - int topN, - ChildOrdsResult dimToChildOrdsResult) - throws IOException { - - ChildOrdsResult childOrdsResult; - - // if getTopDims is called, get results from previously stored dimToChildOrdsResult, otherwise - // call getChildOrdsResult to get dimCount, childCount and the queue for the dimension's top - // children - if (dimToChildOrdsResult != null) { - childOrdsResult = dimToChildOrdsResult; - } else { - childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, pathOrd); - } - - if (childOrdsResult.q == null) { - return null; - } - - LabelAndValue[] labelValues = getLabelValuesFromTopOrdAndIntQueue(childOrdsResult.q); - - if (dimConfig.hierarchical == true) { - return new FacetResult( - dim, path, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount); - } else { - return new FacetResult( - dim, emptyPath, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount); - } - } - - /** - * Returns ChildOrdsResult that contains results of dimCount, childCount, and the queue for the - * dimension's top children to populate FacetResult in getPathResult. - */ - private ChildOrdsResult getChildOrdsResult( - PrimitiveIterator.OfInt childOrds, int topN, FacetsConfig.DimConfig dimConfig, int pathOrd) { - - TopOrdAndIntQueue q = null; - int bottomCount = 0; - int dimCount = 0; - int childCount = 0; - - TopOrdAndIntQueue.OrdAndValue reuse = null; - while (childOrds.hasNext()) { - int ord = childOrds.next(); - if (counts.get(ord) > 0) { - dimCount += counts.get(ord); - childCount++; - if (counts.get(ord) > bottomCount) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = counts.get(ord); - if (q == null) { - // Lazy init, so we don't create this for the - // sparse case unnecessarily - q = new TopOrdAndIntQueue(topN); - } - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomCount = q.top().value; - } - } - } - } - - if (dimConfig.hierarchical == true) { - dimCount = counts.get(pathOrd); - } else { - // see if dimCount is actually reliable or needs to be reset - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - dimCount = counts.get(pathOrd); - } else { - dimCount = -1; // dimCount is in accurate at this point, so set it to -1 - } - } - } - - return new ChildOrdsResult(dimCount, childCount, q); - } - - /** Returns label values for dims. */ - private LabelAndValue[] getLabelValuesFromTopOrdAndIntQueue(TopOrdAndIntQueue q) - throws IOException { - LabelAndValue[] labelValues = new LabelAndValue[q.size()]; - for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); - assert ordAndValue != null; - final BytesRef term = dv.lookupOrd(ordAndValue.ord); - String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); - labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); - } - return labelValues; - } - - /** Returns value/count of a dimension. */ - private int getDimValue( - FacetsConfig.DimConfig dimConfig, - String dim, - int dimOrd, - PrimitiveIterator.OfInt childOrds, - int topN, - HashMap dimToChildOrdsResult) { - - // if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at - // indexing time, return dimCount directly - if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) { - return counts.get(dimOrd); - } - - // if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount - ChildOrdsResult childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, dimOrd); - - // if no early termination, store dim and childOrdsResult into a hashmap to avoid calling - // getChildOrdsResult again in getPathResult - dimToChildOrdsResult.put(dim, childOrdsResult); - return childOrdsResult.dimCount; + int getCount(int ord) { + return counts.get(ord); } private class CountOneSegment implements Callable { @@ -535,195 +341,4 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { } } } - - @Override - public Number getSpecificValue(String dim, String... path) throws IOException { - if (path.length != 1) { - throw new IllegalArgumentException("path must be length=1"); - } - int ord = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path))); - if (ord < 0) { - return -1; - } - - return counts.get(ord); - } - - /** - * Overloaded method to allow getFacetResultForDim be called without passing in the - * dimToChildOrdsResult parameter - */ - private FacetResult getFacetResultForDim(String dim, int topNChildren) throws IOException { - return getFacetResultForDim(dim, topNChildren, null); - } - - /** Returns FacetResult for a dimension. */ - private FacetResult getFacetResultForDim( - String dim, int topNChildren, ChildOrdsResult dimToChildOrdsResult) throws IOException { - - FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); - - if (dimConfig.hierarchical) { - SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim); - int dimOrd = dimTree.dimStartOrd; - return getPathResult( - dimConfig, - dim, - emptyPath, - dimOrd, - dimTree.iterator(), - topNChildren, - dimToChildOrdsResult); - } else { - OrdRange ordRange = state.getOrdRange(dim); - int dimOrd = ordRange.start; - PrimitiveIterator.OfInt childIt = ordRange.iterator(); - if (dimConfig.multiValued && dimConfig.requireDimCount) { - // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed - // the dimension and we need to skip past it so the iterator is positioned on the first - // child: - childIt.next(); - } - return getPathResult( - dimConfig, dim, emptyPath, dimOrd, childIt, topNChildren, dimToChildOrdsResult); - } - } - - @Override - public List getAllDims(int topN) throws IOException { - validateTopN(topN); - List results = new ArrayList<>(); - for (String dim : state.getDims()) { - FacetResult factResult = getFacetResultForDim(dim, topN); - if (factResult != null) { - results.add(factResult); - } - } - - // Sort by highest count: - Collections.sort( - results, - new Comparator() { - @Override - public int compare(FacetResult a, FacetResult b) { - if (a.value.intValue() > b.value.intValue()) { - return -1; - } else if (b.value.intValue() > a.value.intValue()) { - return 1; - } else { - return a.dim.compareTo(b.dim); - } - } - }); - - return results; - } - - @Override - public List getTopDims(int topNDims, int topNChildren) throws IOException { - if (topNDims <= 0 || topNChildren <= 0) { - throw new IllegalArgumentException("topN must be > 0"); - } - - // Creates priority queue to store top dimensions and sort by their aggregated values/hits and - // string values. - PriorityQueue pq = - new PriorityQueue<>(topNDims) { - @Override - protected boolean lessThan(DimValueResult a, DimValueResult b) { - if (a.value > b.value) { - return false; - } else if (a.value < b.value) { - return true; - } else { - return a.dim.compareTo(b.dim) > 0; - } - } - }; - - HashMap dimToChildOrdsResult = new HashMap<>(); - int dimCount; - - for (String dim : state.getDims()) { - FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); - if (dimConfig.hierarchical) { - SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim); - int dimOrd = dimTree.dimStartOrd; - // get dim value - dimCount = - getDimValue( - dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, dimToChildOrdsResult); - } else { - OrdRange ordRange = state.getOrdRange(dim); - int dimOrd = ordRange.start; - PrimitiveIterator.OfInt childIt = ordRange.iterator(); - if (dimConfig.multiValued && dimConfig.requireDimCount) { - // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed - // the dimension and we need to skip past it so the iterator is positioned on the first - // child: - childIt.next(); - } - dimCount = getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, dimToChildOrdsResult); - } - - if (dimCount != 0) { - // use priority queue to store DimValueResult for topNDims - if (pq.size() < topNDims) { - pq.add(new DimValueResult(dim, dimCount)); - } else { - if (dimCount > pq.top().value - || (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { - DimValueResult bottomDim = pq.top(); - bottomDim.dim = dim; - bottomDim.value = dimCount; - pq.updateTop(); - } - } - } - } - - // get FacetResult for topNDims - int resultSize = pq.size(); - FacetResult[] results = new FacetResult[resultSize]; - - while (pq.size() > 0) { - DimValueResult dimValueResult = pq.pop(); - FacetResult facetResult = - getFacetResultForDim( - dimValueResult.dim, topNChildren, dimToChildOrdsResult.get(dimValueResult.dim)); - resultSize--; - results[resultSize] = facetResult; - } - return Arrays.asList(results); - } - - /** - * Creates ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top - * children - */ - private static class ChildOrdsResult { - final int dimCount; - final int childCount; - final TopOrdAndIntQueue q; - - ChildOrdsResult(int dimCount, int childCount, TopOrdAndIntQueue q) { - this.dimCount = dimCount; - this.childCount = childCount; - this.q = q; - } - } - - /** - * Creates DimValueResult to store the label and value of dim in order to sort by these two - * fields. - */ - private static class DimValueResult { - String dim; - int value; - - DimValueResult(String dim, int value) { - this.dim = dim; - this.value = value; - } - } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java index 53e11b47e0a..3c626e90bc5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java @@ -17,23 +17,12 @@ package org.apache.lucene.facet.sortedset; import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; import java.util.List; -import java.util.PrimitiveIterator; -import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetUtils; -import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsCollector.MatchingDocs; import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.facet.LabelAndValue; -import org.apache.lucene.facet.TopOrdAndIntQueue; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.DimTree; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; @@ -48,9 +37,7 @@ import org.apache.lucene.search.ConjunctionUtils; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LongValues; -import org.apache.lucene.util.PriorityQueue; /** * Compute facets counts from previously indexed {@link SortedSetDocValuesFacetField}, without @@ -70,16 +57,9 @@ import org.apache.lucene.util.PriorityQueue; * * @lucene.experimental */ -public class SortedSetDocValuesFacetCounts extends Facets { - - final SortedSetDocValuesReaderState state; - final FacetsConfig stateConfig; - final SortedSetDocValues dv; - final String field; +public class SortedSetDocValuesFacetCounts extends AbstractSortedSetDocValueFacetCounts { final int[] counts; - private static final String[] emptyPath = new String[0]; - /** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. */ public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state) throws IOException { this(state, null); @@ -88,10 +68,7 @@ public class SortedSetDocValuesFacetCounts extends Facets { /** Counts all facet dimensions across the provided hits. */ public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state, FacetsCollector hits) throws IOException { - this.state = state; - this.field = state.getField(); - this.stateConfig = state.getFacetsConfig(); - this.dv = state.getDocValues(); + super(state); this.counts = new int[state.getSize()]; if (hits == null) { // browse only @@ -102,180 +79,8 @@ public class SortedSetDocValuesFacetCounts extends Facets { } @Override - public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { - validateTopN(topN); - FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); - - if (dimConfig.hierarchical) { - int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path))); - if (pathOrd < 0) { - // path was never indexed - return null; - } - DimTree dimTree = state.getDimTree(dim); - return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN); - } else { - if (path.length > 0) { - throw new IllegalArgumentException( - "Field is not configured as hierarchical, path should be 0 length"); - } - OrdRange ordRange = state.getOrdRange(dim); - if (ordRange == null) { - // means dimension was never indexed - return null; - } - int dimOrd = ordRange.start; - PrimitiveIterator.OfInt childIt = ordRange.iterator(); - if (dimConfig.multiValued && dimConfig.requireDimCount) { - // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed - // the dimension and we need to skip past it so the iterator is positioned on the first - // child: - childIt.next(); - } - return getPathResult(dimConfig, dim, null, dimOrd, childIt, topN); - } - } - - /** - * Overloaded method to allow getPathResult be called without passing in the dimToChildOrdsResult - * parameter - */ - private FacetResult getPathResult( - FacetsConfig.DimConfig dimConfig, - String dim, - String[] path, - int pathOrd, - PrimitiveIterator.OfInt childOrds, - int topN) - throws IOException { - return getPathResult(dimConfig, dim, path, pathOrd, childOrds, topN, null); - } - - /** Returns path results for a dimension */ - private FacetResult getPathResult( - FacetsConfig.DimConfig dimConfig, - String dim, - String[] path, - int pathOrd, - PrimitiveIterator.OfInt childOrds, - int topN, - ChildOrdsResult dimToChildOrdsResult) - throws IOException { - - ChildOrdsResult childOrdsResult; - - // if getTopDims is called, get results from previously stored dimToChildOrdsResult, otherwise - // call getChildOrdsResult to get dimCount, childCount and the queue for the dimension's top - // children - if (dimToChildOrdsResult != null) { - childOrdsResult = dimToChildOrdsResult; - } else { - childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, pathOrd); - } - - if (childOrdsResult.q == null) { - return null; - } - - LabelAndValue[] labelValues = getLabelValuesFromTopOrdAndIntQueue(childOrdsResult.q); - - if (dimConfig.hierarchical == true) { - return new FacetResult( - dim, path, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount); - } else { - return new FacetResult( - dim, emptyPath, childOrdsResult.dimCount, labelValues, childOrdsResult.childCount); - } - } - - /** - * Returns SortedSetDocValuesChildOrdsResult that contains results of dimCount, childCount, and - * the queue for the dimension's top children to populate FacetResult in getPathResult. - */ - private ChildOrdsResult getChildOrdsResult( - PrimitiveIterator.OfInt childOrds, int topN, FacetsConfig.DimConfig dimConfig, int pathOrd) { - TopOrdAndIntQueue q = null; - int bottomCount = 0; - int dimCount = 0; - int childCount = 0; - - TopOrdAndIntQueue.OrdAndValue reuse = null; - while (childOrds.hasNext()) { - int ord = childOrds.next(); - if (counts[ord] > 0) { - dimCount += counts[ord]; - childCount++; - if (counts[ord] > bottomCount) { - if (reuse == null) { - reuse = new TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = counts[ord]; - if (q == null) { - // Lazy init, so we don't create this for the - // sparse case unnecessarily - q = new TopOrdAndIntQueue(topN); - } - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomCount = q.top().value; - } - } - } - } - - if (dimConfig.hierarchical == true) { - dimCount = counts[pathOrd]; - } else { - // see if dimCount is actually reliable or needs to be reset - if (dimConfig.multiValued) { - if (dimConfig.requireDimCount) { - dimCount = counts[pathOrd]; - } else { - dimCount = -1; // dimCount is in accurate at this point, so set it to -1 - } - } - } - - return new ChildOrdsResult(dimCount, childCount, q); - } - - /** Returns label values for dims. */ - private LabelAndValue[] getLabelValuesFromTopOrdAndIntQueue(TopOrdAndIntQueue q) - throws IOException { - LabelAndValue[] labelValues = new LabelAndValue[q.size()]; - for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); - assert ordAndValue != null; - final BytesRef term = dv.lookupOrd(ordAndValue.ord); - String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); - labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); - } - return labelValues; - } - - /** Returns value/count of a dimension. */ - private int getDimValue( - FacetsConfig.DimConfig dimConfig, - String dim, - int dimOrd, - PrimitiveIterator.OfInt childOrds, - int topN, - HashMap dimToChildOrdsResult) { - - // if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at - // indexing time, return dimCount directly - if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) { - return counts[dimOrd]; - } - - // if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount - ChildOrdsResult childOrdsResult = getChildOrdsResult(childOrds, topN, dimConfig, dimOrd); - - // if no early termination, store dim and childOrdsResult into a hashmap to avoid calling - // getChildOrdsResult again in getPathResult - dimToChildOrdsResult.put(dim, childOrdsResult); - return childOrdsResult.dimCount; + int getCount(int ord) { + return counts[ord]; } // Variant of countOneSegment, that has No Hits or Live Docs @@ -507,193 +312,4 @@ public class SortedSetDocValuesFacetCounts extends Facets { } } } - - @Override - public Number getSpecificValue(String dim, String... path) throws IOException { - if (path.length != 1) { - throw new IllegalArgumentException("path must be length=1"); - } - int ord = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path))); - if (ord < 0) { - return -1; - } - - return counts[ord]; - } - - /** - * Overloaded method to allow getFacetResultForDim be called without passing in the - * dimToChildOrdsResult parameter - */ - private FacetResult getFacetResultForDim(String dim, int topNChildren) throws IOException { - return getFacetResultForDim(dim, topNChildren, null); - } - - /** Returns FacetResult for a dimension. */ - private FacetResult getFacetResultForDim( - String dim, int topNChildren, ChildOrdsResult dimToChildOrdsResult) throws IOException { - - FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); - - if (dimConfig.hierarchical) { - DimTree dimTree = state.getDimTree(dim); - int dimOrd = dimTree.dimStartOrd; - return getPathResult( - dimConfig, - dim, - emptyPath, - dimOrd, - dimTree.iterator(), - topNChildren, - dimToChildOrdsResult); - } else { - OrdRange ordRange = state.getOrdRange(dim); - int dimOrd = ordRange.start; - PrimitiveIterator.OfInt childIt = ordRange.iterator(); - if (dimConfig.multiValued && dimConfig.requireDimCount) { - // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed - // the dimension and we need to skip past it so the iterator is positioned on the first - // child: - childIt.next(); - } - return getPathResult( - dimConfig, dim, emptyPath, dimOrd, childIt, topNChildren, dimToChildOrdsResult); - } - } - - @Override - public List getAllDims(int topN) throws IOException { - validateTopN(topN); - List results = new ArrayList<>(); - for (String dim : state.getDims()) { - FacetResult factResult = getFacetResultForDim(dim, topN); - if (factResult != null) { - results.add(factResult); - } - } - - // Sort by highest count: - Collections.sort( - results, - new Comparator() { - @Override - public int compare(FacetResult a, FacetResult b) { - if (a.value.intValue() > b.value.intValue()) { - return -1; - } else if (b.value.intValue() > a.value.intValue()) { - return 1; - } else { - return a.dim.compareTo(b.dim); - } - } - }); - return results; - } - - @Override - public List getTopDims(int topNDims, int topNChildren) throws IOException { - validateTopN(topNDims); - validateTopN(topNChildren); - - // Creates priority queue to store top dimensions and sort by their aggregated values/hits and - // string values. - PriorityQueue pq = - new PriorityQueue<>(topNDims) { - @Override - protected boolean lessThan(DimValueResult a, DimValueResult b) { - if (a.value > b.value) { - return false; - } else if (a.value < b.value) { - return true; - } else { - return a.dim.compareTo(b.dim) > 0; - } - } - }; - - HashMap dimToChildOrdsResult = new HashMap<>(); - int dimCount; - - for (String dim : state.getDims()) { - FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim); - if (dimConfig.hierarchical) { - DimTree dimTree = state.getDimTree(dim); - int dimOrd = dimTree.dimStartOrd; - // get dim value - dimCount = - getDimValue( - dimConfig, dim, dimOrd, dimTree.iterator(), topNChildren, dimToChildOrdsResult); - } else { - OrdRange ordRange = state.getOrdRange(dim); - int dimOrd = ordRange.start; - PrimitiveIterator.OfInt childIt = ordRange.iterator(); - if (dimConfig.multiValued && dimConfig.requireDimCount) { - // If the dim is multi-valued and requires dim counts, we know we've explicitly indexed - // the dimension and we need to skip past it so the iterator is positioned on the first - // child: - childIt.next(); - } - dimCount = getDimValue(dimConfig, dim, dimOrd, childIt, topNChildren, dimToChildOrdsResult); - } - - if (dimCount != 0) { - // use priority queue to store DimValueResult for topNDims - if (pq.size() < topNDims) { - pq.add(new DimValueResult(dim, dimCount)); - } else { - if (dimCount > pq.top().value - || (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { - DimValueResult bottomDim = pq.top(); - bottomDim.dim = dim; - bottomDim.value = dimCount; - pq.updateTop(); - } - } - } - } - - // get FacetResult for topNDims - int resultSize = pq.size(); - FacetResult[] results = new FacetResult[resultSize]; - - while (pq.size() > 0) { - DimValueResult dimValueResult = pq.pop(); - FacetResult facetResult = - getFacetResultForDim( - dimValueResult.dim, topNChildren, dimToChildOrdsResult.get(dimValueResult.dim)); - resultSize--; - results[resultSize] = facetResult; - } - return Arrays.asList(results); - } - - /** - * Creates ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top - * children - */ - private static class ChildOrdsResult { - final int dimCount; - final int childCount; - final TopOrdAndIntQueue q; - - ChildOrdsResult(int dimCount, int childCount, TopOrdAndIntQueue q) { - this.dimCount = dimCount; - this.childCount = childCount; - this.q = q; - } - } - - /** - * Creates DimValueResult to store the label and value of dim in order to sort by these two - * fields. - */ - private static class DimValueResult { - String dim; - int value; - - DimValueResult(String dim, int value) { - this.dim = dim; - this.value = value; - } - } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java index 342bb77714b..8c9853d1641 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java @@ -39,9 +39,6 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets { /** Per-ordinal value. */ final float[] values; - /** Pass in emptyPath for getTopDims and getAllDims. */ - private static final String[] emptyPath = new String[0]; - /** Sole constructor. */ FloatTaxonomyFacets( String indexFieldName, @@ -114,21 +111,15 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets { return null; } - ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN); - if (childOrdsResult.aggregatedValue == 0) { - return null; - } - - LabelAndValue[] labelValues = getLabelValues(childOrdsResult.q, cp.length); - return new FacetResult( - dim, path, childOrdsResult.aggregatedValue, labelValues, childOrdsResult.childCount); + TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN); + return createFacetResult(topChildrenForPath, dim, path); } /** - * Return ChildOrdsResult that contains results of aggregatedValue, childCount, and the queue for - * the dimension's top children to populate FacetResult in getPathResult. + * Determine the top-n children for a specified dimension + path. Results are in an intermediate + * form. */ - private ChildOrdsResult getChildOrdsResult(DimConfig dimConfig, int dimOrd, int topN) + private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) throws IOException { TopOrdAndFloatQueue q = new TopOrdAndFloatQueue(Math.min(taxoReader.getSize(), topN)); @@ -137,7 +128,7 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets { int[] children = getChildren(); int[] siblings = getSiblings(); - int ord = children[dimOrd]; + int ord = children[pathOrd]; float aggregatedValue = 0; int childCount = 0; @@ -164,61 +155,50 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets { if (dimConfig.multiValued) { if (dimConfig.requireDimCount) { - aggregatedValue = values[dimOrd]; + aggregatedValue = values[pathOrd]; } else { // Our sum'd count is not correct, in general: aggregatedValue = -1; } } - return new ChildOrdsResult(aggregatedValue, childCount, q); + return new TopChildrenForPath(aggregatedValue, childCount, q); } /** - * Return label and values for top dimensions and children - * - * @param q the queue for the dimension's top children - * @param pathLength the length of a dimension's children paths + * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work + * of resolving ordinals -> labels, etc. Will return null if there are no children. */ - private LabelAndValue[] getLabelValues(TopOrdAndFloatQueue q, int pathLength) throws IOException { + FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... path) + throws IOException { + // If the intermediate result is null or there are no children, we return null: + if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { + return null; + } + + TopOrdAndFloatQueue q = topChildrenForPath.childQueue; + assert q != null; + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; int[] ordinals = new int[labelValues.length]; float[] values = new float[labelValues.length]; for (int i = labelValues.length - 1; i >= 0; i--) { TopOrdAndFloatQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; ordinals[i] = ordAndValue.ord; values[i] = ordAndValue.value; } FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + // The path component we're interested in is the one immediately after the provided path. We + // add 1 here to also account for the dim: + int childComponentIdx = path.length + 1; for (int i = 0; i < labelValues.length; i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[pathLength], values[i]); - } - return labelValues; - } - - /** Return value of a dimension. */ - private float getDimValue( - FacetsConfig.DimConfig dimConfig, - String dim, - int dimOrd, - int topN, - HashMap dimToChildOrdsResult) - throws IOException { - - // if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at - // indexing time, return dimCount directly - if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) { - return values[dimOrd]; + labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); } - // if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount - ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN); - - // if no early termination, store dim and childOrdsResult into a hashmap to avoid calling - // getChildOrdsResult again in getTopDims - dimToChildOrdsResult.put(dim, childOrdsResult); - return childOrdsResult.aggregatedValue; + return new FacetResult( + dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); } @Override @@ -232,10 +212,10 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets { // Create priority queue to store top dimensions and sort by their aggregated values/hits and // string values. - PriorityQueue pq = + PriorityQueue pq = new PriorityQueue<>(topNDims) { @Override - protected boolean lessThan(DimValueResult a, DimValueResult b) { + protected boolean lessThan(DimValue a, DimValue b) { if (a.value > b.value) { return false; } else if (a.value < b.value) { @@ -246,8 +226,8 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets { } }; - // create hashMap to store the ChildOrdsResult to avoid calling getChildOrdsResult for all dims - HashMap dimToChildOrdsResult = new HashMap<>(); + // Keep track of intermediate results, if we compute them, so we can reuse them later: + Map intermediateResults = null; // iterate over children and siblings ordinals for all dims int ord = children[TaxonomyReader.ROOT_ORDINAL]; @@ -255,22 +235,42 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets { String dim = taxoReader.getPath(ord).components[0]; FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); if (dimConfig.indexFieldName.equals(indexFieldName)) { - FacetLabel cp = new FacetLabel(dim, emptyPath); + FacetLabel cp = new FacetLabel(dim); int dimOrd = taxoReader.getOrdinal(cp); - float dimCount = 0; - // if dimOrd = -1, we skip this dim, else call getDimValue if (dimOrd != -1) { - dimCount = getDimValue(dimConfig, dim, dimOrd, topNChildren, dimToChildOrdsResult); - if (dimCount != 0) { - // use priority queue to store DimValueResult for topNDims - if (pq.size() < topNDims) { - pq.add(new DimValueResult(dim, dimOrd, dimCount)); + float dimValue; + if (dimConfig.multiValued) { + if (dimConfig.requireDimCount) { + // If the dim is configured as multi-valued and requires dim counts, we can access + // an accurate count for the dim computed at indexing time: + dimValue = values[dimOrd]; } else { - if (dimCount > pq.top().value - || (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { - DimValueResult bottomDim = pq.top(); + // If the dim is configured as multi-valued but not requiring dim counts, we cannot + // compute an accurate dim count, and use -1 as a place-holder: + dimValue = -1; + } + } else { + // Single-valued dims require aggregating descendant paths to get accurate dim counts + // since we don't directly access ancestry paths: + // TODO: We could consider indexing dim counts directly if getTopDims is a common + // use-case. + TopChildrenForPath topChildrenForPath = + getTopChildrenForPath(dimConfig, dimOrd, topNChildren); + if (intermediateResults == null) { + intermediateResults = new HashMap<>(); + } + intermediateResults.put(dim, topChildrenForPath); + dimValue = topChildrenForPath.pathValue; + } + if (dimValue != 0) { + if (pq.size() < topNDims) { + pq.add(new DimValue(dim, dimOrd, dimValue)); + } else { + if (dimValue > pq.top().value + || (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { + DimValue bottomDim = pq.top(); bottomDim.dim = dim; - bottomDim.value = dimCount; + bottomDim.value = dimValue; pq.updateTop(); } } @@ -280,63 +280,40 @@ abstract class FloatTaxonomyFacets extends TaxonomyFacets { ord = siblings[ord]; } - // use fixed-size array to reduce space usage FacetResult[] results = new FacetResult[pq.size()]; while (pq.size() > 0) { - DimValueResult dimValueResult = pq.pop(); - String dim = dimValueResult.dim; - ChildOrdsResult childOrdsResult; - // if the childOrdsResult was stored in the map, avoid calling getChildOrdsResult again - if (dimToChildOrdsResult.containsKey(dim)) { - childOrdsResult = dimToChildOrdsResult.get(dim); - } else { - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - childOrdsResult = getChildOrdsResult(dimConfig, dimValueResult.dimOrd, topNChildren); + DimValue dimValue = pq.pop(); + assert dimValue != null; + String dim = dimValue.dim; + TopChildrenForPath topChildrenForPath = null; + if (intermediateResults != null) { + topChildrenForPath = intermediateResults.get(dim); } - // FacetResult requires String[] path, and path is always empty for getTopDims. - // pathLength is always equal to 1 when FacetLabel is constructed with - // FacetLabel(dim, emptyPath), and therefore, 1 is passed in when calling getLabelValues - FacetResult facetResult = - new FacetResult( - dimValueResult.dim, - emptyPath, - dimValueResult.value, - getLabelValues(childOrdsResult.q, 1), - childOrdsResult.childCount); + if (topChildrenForPath == null) { + FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); + topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren); + } + FacetResult facetResult = createFacetResult(topChildrenForPath, dim); + assert facetResult != null; results[pq.size()] = facetResult; } return Arrays.asList(results); } - /** - * Create DimValueResult to store the label, dim ordinal and dim count of a dim in priority queue - */ - private static class DimValueResult { + private static class DimValue { String dim; int dimOrd; float value; - DimValueResult(String dim, int dimOrd, float value) { + DimValue(String dim, int dimOrd, float value) { this.dim = dim; this.dimOrd = dimOrd; this.value = value; } } - /** - * Create ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top - * children - */ - private static class ChildOrdsResult { - final float aggregatedValue; - final int childCount; - final TopOrdAndFloatQueue q; - - ChildOrdsResult(float aggregatedValue, int childCount, TopOrdAndFloatQueue q) { - this.aggregatedValue = aggregatedValue; - this.childCount = childCount; - this.q = q; - } - } + /** Intermediate result to store top children for a given path before resolving labels, etc. */ + private record TopChildrenForPath( + float pathValue, int childCount, TopOrdAndFloatQueue childQueue) {} } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java index b6c2fdcf22c..3992cdc3357 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java @@ -44,9 +44,6 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { /** Sparse ordinal values. */ final IntIntHashMap sparseValues; - /** Pass in emptyPath for getTopDims and getAllDims. */ - private static final String[] emptyPath = new String[0]; - /** Sole constructor. */ IntTaxonomyFacets( String indexFieldName, @@ -176,46 +173,15 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { return null; } - ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN); - - if (childOrdsResult.q == null || childOrdsResult.aggregatedValue == 0) { - return null; - } - - LabelAndValue[] labelValues = getLabelValues(childOrdsResult.q, cp.length); - return new FacetResult( - dim, path, childOrdsResult.aggregatedValue, labelValues, childOrdsResult.childCount); + TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN); + return createFacetResult(topChildrenForPath, dim, path); } /** - * Return label and values for top dimensions and children - * - * @param q the queue for the dimension's top children - * @param pathLength the length of a dimension's children paths + * Determine the top-n children for a specified dimension + path. Results are in an intermediate + * form. */ - private LabelAndValue[] getLabelValues(TopOrdAndIntQueue q, int pathLength) throws IOException { - LabelAndValue[] labelValues = new LabelAndValue[q.size()]; - int[] ordinals = new int[labelValues.length]; - int[] values = new int[labelValues.length]; - - for (int i = labelValues.length - 1; i >= 0; i--) { - TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); - ordinals[i] = ordAndValue.ord; - values[i] = ordAndValue.value; - } - - FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); - for (int i = 0; i < labelValues.length; i++) { - labelValues[i] = new LabelAndValue(bulkPath[i].components[pathLength], values[i]); - } - return labelValues; - } - - /** - * Return ChildOrdsResult that contains results of dimCount, childCount, and the queue for the - * dimension's top children to populate FacetResult in getPathResult. - */ - private ChildOrdsResult getChildOrdsResult(DimConfig dimConfig, int dimOrd, int topN) + private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) throws IOException { TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN)); int bottomValue = 0; @@ -230,7 +196,7 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { for (IntIntCursor c : sparseValues) { int value = c.value; int ord = c.key; - if (parents[ord] == dimOrd && value > 0) { + if (parents[ord] == pathOrd && value > 0) { aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); childCount++; if (value > bottomValue) { @@ -249,7 +215,7 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { } else { int[] children = getChildren(); int[] siblings = getSiblings(); - int ord = children[dimOrd]; + int ord = children[pathOrd]; while (ord != TaxonomyReader.INVALID_ORDINAL) { int value = values[ord]; if (value > 0) { @@ -273,38 +239,14 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { if (dimConfig.multiValued) { if (dimConfig.requireDimCount) { - aggregatedValue = getValue(dimOrd); + aggregatedValue = getValue(pathOrd); } else { // Our sum'd value is not correct, in general: aggregatedValue = -1; } } - return new ChildOrdsResult(aggregatedValue, childCount, q); - } - - /** Return value/count of a dimension. */ - private int getDimValue( - FacetsConfig.DimConfig dimConfig, - String dim, - int dimOrd, - int topN, - HashMap dimToChildOrdsResult) - throws IOException { - - // if dimConfig.hierarchical == true || dim is multiValued and dim count has been aggregated at - // indexing time, return dimCount directly - if (dimConfig.hierarchical == true || (dimConfig.multiValued && dimConfig.requireDimCount)) { - return getValue(dimOrd); - } - - // if dimCount was not aggregated at indexing time, iterate over childOrds to get dimCount - ChildOrdsResult childOrdsResult = getChildOrdsResult(dimConfig, dimOrd, topN); - - // if no early termination, store dim and childOrdsResult into a hashmap to avoid calling - // getChildOrdsResult again in getTopDims - dimToChildOrdsResult.put(dim, childOrdsResult); - return childOrdsResult.aggregatedValue; + return new TopChildrenForPath(aggregatedValue, childCount, q); } @Override @@ -319,10 +261,10 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { // Create priority queue to store top dimensions and sort by their aggregated values/hits and // string values. - PriorityQueue pq = + PriorityQueue pq = new PriorityQueue<>(topNDims) { @Override - protected boolean lessThan(DimValueResult a, DimValueResult b) { + protected boolean lessThan(DimValue a, DimValue b) { if (a.value > b.value) { return false; } else if (a.value < b.value) { @@ -333,8 +275,8 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { } }; - // create hashMap to store the ChildOrdsResult to avoid calling getChildOrdsResult for all dims - HashMap dimToChildOrdsResult = new HashMap<>(); + // Keep track of intermediate results, if we compute them, so we can reuse them later: + Map intermediateResults = null; // iterate over children and siblings ordinals for all dims int ord = children[TaxonomyReader.ROOT_ORDINAL]; @@ -342,21 +284,42 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { String dim = taxoReader.getPath(ord).components[0]; FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); if (dimConfig.indexFieldName.equals(indexFieldName)) { - FacetLabel cp = new FacetLabel(dim, emptyPath); + FacetLabel cp = new FacetLabel(dim); int dimOrd = taxoReader.getOrdinal(cp); - // if dimOrd = -1, we skip this dim, else call getDimValue if (dimOrd != -1) { - int dimCount = getDimValue(dimConfig, dim, dimOrd, topNChildren, dimToChildOrdsResult); - if (dimCount != 0) { - // use priority queue to store DimValueResult for topNDims - if (pq.size() < topNDims) { - pq.add(new DimValueResult(dim, dimOrd, dimCount)); + int dimValue; + if (dimConfig.multiValued) { + if (dimConfig.requireDimCount) { + // If the dim is configured as multi-valued and requires dim counts, we can access + // an accurate count for the dim computed at indexing time: + dimValue = getValue(dimOrd); } else { - if (dimCount > pq.top().value - || (dimCount == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { - DimValueResult bottomDim = pq.top(); + // If the dim is configured as multi-valued but not requiring dim counts, we cannot + // compute an accurate dim count, and use -1 as a place-holder: + dimValue = -1; + } + } else { + // Single-valued dims require aggregating descendant paths to get accurate dim counts + // since we don't directly access ancestry paths: + // TODO: We could consider indexing dim counts directly if getTopDims is a common + // use-case. + TopChildrenForPath topChildrenForPath = + getTopChildrenForPath(dimConfig, dimOrd, topNChildren); + if (intermediateResults == null) { + intermediateResults = new HashMap<>(); + } + intermediateResults.put(dim, topChildrenForPath); + dimValue = topChildrenForPath.pathValue; + } + if (dimValue != 0) { + if (pq.size() < topNDims) { + pq.add(new DimValue(dim, dimOrd, dimValue)); + } else { + if (dimValue > pq.top().value + || (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { + DimValue bottomDim = pq.top(); bottomDim.dim = dim; - bottomDim.value = dimCount; + bottomDim.value = dimValue; pq.updateTop(); } } @@ -366,63 +329,76 @@ abstract class IntTaxonomyFacets extends TaxonomyFacets { ord = siblings[ord]; } - // use fixed-size array to reduce space usage FacetResult[] results = new FacetResult[pq.size()]; while (pq.size() > 0) { - DimValueResult dimValueResult = pq.pop(); - String dim = dimValueResult.dim; - ChildOrdsResult childOrdsResult; - // if the childOrdsResult was stored in the map, avoid calling getChildOrdsResult again - if (dimToChildOrdsResult.containsKey(dim)) { - childOrdsResult = dimToChildOrdsResult.get(dim); - } else { - FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); - childOrdsResult = getChildOrdsResult(dimConfig, dimValueResult.dimOrd, topNChildren); + DimValue dimValue = pq.pop(); + assert dimValue != null; + String dim = dimValue.dim; + TopChildrenForPath topChildrenForPath = null; + if (intermediateResults != null) { + topChildrenForPath = intermediateResults.get(dim); } - // FacetResult requires String[] path, and path is always empty for getTopDims. - // pathLength is always equal to 1 when FacetLabel is constructed with - // FacetLabel(dim, emptyPath), and therefore, 1 is passed in when calling getLabelValues - FacetResult facetResult = - new FacetResult( - dimValueResult.dim, - emptyPath, - dimValueResult.value, - getLabelValues(childOrdsResult.q, 1), - childOrdsResult.childCount); + if (topChildrenForPath == null) { + FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); + topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren); + } + FacetResult facetResult = createFacetResult(topChildrenForPath, dim); + assert facetResult != null; results[pq.size()] = facetResult; } return Arrays.asList(results); } /** - * Create DimValueResult to store the label, dim ordinal and dim count of a dim in priority queue + * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work + * of resolving ordinals -> labels, etc. Will return null if there are no children. */ - private static class DimValueResult { + FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... path) + throws IOException { + // If the intermediate result is null or there are no children, we return null: + if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { + return null; + } + + TopOrdAndIntQueue q = topChildrenForPath.childQueue; + assert q != null; + + LabelAndValue[] labelValues = new LabelAndValue[q.size()]; + int[] ordinals = new int[labelValues.length]; + int[] values = new int[labelValues.length]; + + for (int i = labelValues.length - 1; i >= 0; i--) { + TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; + ordinals[i] = ordAndValue.ord; + values[i] = ordAndValue.value; + } + + FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); + // The path component we're interested in is the one immediately after the provided path. We + // add 1 here to also account for the dim: + int childComponentIdx = path.length + 1; + for (int i = 0; i < labelValues.length; i++) { + labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); + } + + return new FacetResult( + dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); + } + + private static class DimValue { String dim; int dimOrd; int value; - DimValueResult(String dim, int dimOrd, int value) { + DimValue(String dim, int dimOrd, int value) { this.dim = dim; this.dimOrd = dimOrd; this.value = value; } } - /** - * Create ChildOrdsResult to store dimCount, childCount, and the queue for the dimension's top - * children - */ - private static class ChildOrdsResult { - final int aggregatedValue; - final int childCount; - final TopOrdAndIntQueue q; - - ChildOrdsResult(int aggregatedValue, int childCount, TopOrdAndIntQueue q) { - this.aggregatedValue = aggregatedValue; - this.childCount = childCount; - this.q = q; - } - } + /** Intermediate result to store top children for a given path before resolving labels, etc. */ + private record TopChildrenForPath(int pathValue, int childCount, TopOrdAndIntQueue childQueue) {} } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java index e95acb037b8..ee5abb0abd3 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacets.java @@ -19,7 +19,6 @@ package org.apache.lucene.facet.taxonomy; import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Locale; @@ -156,7 +155,7 @@ abstract class TaxonomyFacets extends Facets { } // Sort by highest value, tie break by dim: - Collections.sort(results, BY_VALUE_THEN_DIM); + results.sort(BY_VALUE_THEN_DIM); return results; } }