diff --git a/lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java b/lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java index 633033b50ef..68f79fa27ee 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/FacetsConfig.java @@ -471,19 +471,35 @@ public class FacetsConfig { private void processSSDVFacetFields( Map> byField, Document doc) { + for (Map.Entry> ent : byField.entrySet()) { String indexFieldName = ent.getKey(); for (SortedSetDocValuesFacetField facetField : ent.getValue()) { - FacetLabel facetLabel = new FacetLabel(facetField.dim, facetField.label); - String fullPath = pathToString(facetLabel.components, facetLabel.length); - - // For facet counts: - doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath))); - + FacetLabel facetLabel = new FacetLabel(facetField.dim, facetField.path); + DimConfig dimConfig = getDimConfig(facetField.dim); + if (dimConfig.hierarchical) { + for (int i = 0; i < facetLabel.length; i++) { + String fullPath = pathToString(facetLabel.components, i + 1); + // For facet counts: + doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath))); + } + } else { + if (facetLabel.length != 2) { + throw new IllegalArgumentException( + "dimension \"" + + facetField.dim + + "\" is not hierarchical yet has " + + facetField.path.length + + " components"); + } + String fullPath = pathToString(facetLabel.components, facetLabel.length); + // For facet counts: + doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath))); + } // For drill-down: - indexDrillDownTerms(doc, indexFieldName, getDimConfig(facetField.dim), facetLabel); + indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel); } } } @@ -538,7 +554,7 @@ public class FacetsConfig { private static final char ESCAPE_CHAR = '\u001E'; /** Turns a dim + path into an encoded string. 
*/ - public static String pathToString(String dim, String[] path) { + public static String pathToString(String dim, String... path) { String[] fullPath = new String[1 + path.length]; fullPath[0] = dim; System.arraycopy(path, 0, fullPath, 1, path.length); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java index 1ee786308e4..5035235b1ef 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java @@ -22,7 +22,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; -import java.util.Map; +import java.util.Objects; +import java.util.PrimitiveIterator; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -64,10 +65,13 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { final ExecutorService exec; final SortedSetDocValuesReaderState state; + final FacetsConfig stateConfig; final SortedSetDocValues dv; final String field; final AtomicIntegerArray counts; + private static final String[] emptyPath = new String[0]; + /** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. 
*/ public ConcurrentSortedSetDocValuesFacetCounts( SortedSetDocValuesReaderState state, ExecutorService exec) @@ -81,6 +85,7 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { throws IOException, InterruptedException { this.state = state; this.field = state.getField(); + this.stateConfig = Objects.requireNonNullElse(state.getFacetsConfig(), new FacetsConfig()); this.exec = exec; dv = state.getDocValues(); counts = new AtomicIntegerArray(state.getSize()); @@ -97,17 +102,32 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { if (topN <= 0) { throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")"); } - if (path.length > 0) { - throw new IllegalArgumentException("path should be 0 length"); + + if (stateConfig.getDimConfig(dim).hierarchical) { + int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path))); + if (pathOrd < 0) { + // path was never indexed + return null; + } + SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim); + return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN); + } else { + if (path.length > 0) { + throw new IllegalArgumentException( + "Field is not configured as hierarchical, path should be 0 length"); + } + OrdRange ordRange = state.getOrdRange(dim); + if (ordRange == null) { + // means dimension was never indexed + return null; + } + return getDim(dim, null, -1, ordRange.iterator(), topN); } - OrdRange ordRange = state.getOrdRange(dim); - if (ordRange == null) { - return null; // means dimension was never indexed - } - return getDim(dim, ordRange, topN); } - private FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException { + private FacetResult getDim( + String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN) + throws IOException { TopOrdAndIntQueue q = null; @@ -118,7 +138,8 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { TopOrdAndIntQueue.OrdAndValue 
reuse = null; - for (int ord = ordRange.start; ord <= ordRange.end; ord++) { + while (childOrds.hasNext()) { + int ord = childOrds.next(); if (counts.get(ord) > 0) { dimCount += counts.get(ord); childCount++; @@ -148,12 +169,19 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { LabelAndValue[] labelValues = new LabelAndValue[q.size()]; for (int i = labelValues.length - 1; i >= 0; i--) { TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; final BytesRef term = dv.lookupOrd(ordAndValue.ord); String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); - labelValues[i] = new LabelAndValue(parts[1], ordAndValue.value); + labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); } - return new FacetResult(dim, new String[0], dimCount, labelValues, childCount); + if (pathOrd == -1) { + // not hierarchical facet + return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount); + } else { + // hierarchical facet + return new FacetResult(dim, path, counts.get(pathOrd), labelValues, childCount); + } } private class CountOneSegment implements Callable { @@ -365,10 +393,19 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets { public List getAllDims(int topN) throws IOException { List results = new ArrayList<>(); - for (Map.Entry ent : state.getPrefixToOrdRange().entrySet()) { - FacetResult fr = getDim(ent.getKey(), ent.getValue(), topN); - if (fr != null) { - results.add(fr); + for (String dim : state.getDims()) { + if (stateConfig.getDimConfig(dim).hierarchical) { + SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim); + FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN); + if (fr != null) { + results.add(fr); + } + } else { + OrdRange ordRange = state.getOrdRange(dim); + FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN); + if (fr != null) { + results.add(fr); + } } } diff --git 
a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java index 08ad9e9e2b4..264228017b7 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java @@ -17,12 +17,15 @@ package org.apache.lucene.facet.sortedset; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; +import java.util.Iterator; +import java.util.List; import java.util.Map; +import java.util.Stack; import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; @@ -51,20 +54,42 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead private final Map cachedOrdMaps = new HashMap<>(); + private final FacetsConfig config; + + /** Used for hierarchical dims. */ + private final Map prefixToDimTree = new HashMap<>(); + + /** Used for flat dims. */ private final Map prefixToOrdRange = new HashMap<>(); /** - * Creates this, pulling doc values from the default {@link + * Creates this with a config, pulling doc values from the default {@link + * FacetsConfig#DEFAULT_INDEX_FIELD_NAME}. + */ + public DefaultSortedSetDocValuesReaderState(IndexReader reader, FacetsConfig config) + throws IOException { + this(reader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME, config); + } + + /** + * Creates this without a config, pulling doc values from the default {@link * FacetsConfig#DEFAULT_INDEX_FIELD_NAME}. 
*/ public DefaultSortedSetDocValuesReaderState(IndexReader reader) throws IOException { - this(reader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME); + this(reader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME, null); + } + + /** Creates this without a config, pulling doc values from the specified field. */ + public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException { + this(reader, field, null); } /** Creates this, pulling doc values from the specified field. */ - public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException { + public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field, FacetsConfig config) + throws IOException { this.field = field; this.reader = reader; + this.config = config; // We need this to create thread-safe MultiSortedSetDV // per collector: @@ -79,38 +104,139 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead } valueCount = (int) dv.getValueCount(); - // TODO: we can make this more efficient if eg we can be - // "involved" when OrdinalMap is being created? Ie see - // each term/ord it's assigning as it goes... - String lastDim = null; - int startOrd = -1; - - // TODO: this approach can work for full hierarchy?; - // TaxoReader can't do this since ords are not in - // "sorted order" ... 
but we should generalize this to - // support arbitrary hierarchy: - for (int ord = 0; ord < valueCount; ord++) { - final BytesRef term = dv.lookupOrd(ord); + int ord = 0; + while (ord != valueCount) { + BytesRef term = dv.lookupOrd(ord); String[] components = FacetsConfig.stringToPath(term.utf8ToString()); - if (components.length != 2) { - throw new IllegalArgumentException( - "this class can only handle 2 level hierarchy (dim/value); got: " - + Arrays.toString(components) - + " " - + term.utf8ToString()); + String dim = components[0]; + if (config != null && config.getDimConfig(dim).hierarchical) { + ord = createOneHierarchicalFacetDimState(dv, ord) + 1; + } else { + ord = createOneFlatFacetDimState(dv, ord) + 1; } - if (!components[0].equals(lastDim)) { - if (lastDim != null) { - prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord - 1)); + } + } + + // returns last ord of dimension + private int createOneHierarchicalFacetDimState(SortedSetDocValues dv, int dimStartOrd) + throws IOException { + List hasChildren = new ArrayList<>(); + List siblings = new ArrayList<>(); + + // stack of paths with unfulfilled siblings + Stack siblingStack = new Stack<>(); + + int dimEndOrd = dimStartOrd; + + BytesRef nextTerm = dv.lookupOrd(dimEndOrd); + String[] nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString()); + String dim = nextComponents[0]; + + while (true) { + String[] components = nextComponents; + + int ord = dimEndOrd - dimStartOrd; + + while (siblingStack.empty() == false + && siblingStack.peek().component.length >= components.length) { + OrdAndComponent possibleSibling = siblingStack.pop(); + if (possibleSibling.component.length == components.length) { + // lengths are equal, all non-siblings of equal length will have already been popped off + // so this must be sibling + siblings.set(possibleSibling.ord, ord); } - startOrd = ord; - lastDim = components[0]; } + + if (dimEndOrd + 1 == valueCount) { + // current ord needs to be added, can't have 
children or siblings + siblings.add(-1); + hasChildren.add(false); + break; + } + + nextTerm = dv.lookupOrd(dimEndOrd + 1); + nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString()); + + if (nextComponents[0].equals(components[0]) == false) { + // current ord needs to be added, can't have children or siblings + siblings.add(-1); + hasChildren.add(false); + break; + } + + if (components.length < nextComponents.length) { + // next ord must be a direct child of current ord, this is because we are indexing all + // ancestral paths + hasChildren.add(ord, true); + // we don't know if this ord has a sibling or where its sibling could be yet + siblingStack.push(new OrdAndComponent(ord, components)); + // we still add INVALID_ORDINAL, which will be replaced if a valid sibling is found + siblings.add(ord, INVALID_ORDINAL); + } else if (components.length == nextComponents.length) { + // next ord must be a sibling of current and there are no direct children of current, this + // is because we + // are indexing all ancestral paths + siblings.add(ord, ord + 1); + hasChildren.add(ord, false); + } else { + // components.length > nextComponents.length + // next ord is neither sibling nor child + siblings.add(ord, INVALID_ORDINAL); + hasChildren.add(ord, false); + } + + dimEndOrd++; } - if (lastDim != null) { - prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount - 1)); + prefixToDimTree.put(dim, new DimTree(dimStartOrd, siblings, hasChildren)); + + return dimEndOrd; + } + + // returns last ord of dimension + private int createOneFlatFacetDimState(SortedSetDocValues dv, int dimStartOrd) + throws IOException { + + int dimEndOrd = dimStartOrd; + + BytesRef nextTerm = dv.lookupOrd(dimEndOrd); + String[] nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString()); + if (nextComponents.length != 2) { + throw new IllegalArgumentException( + "dimension not configured to handle hierarchical field; got: " + + Arrays.toString(nextComponents) + + " " + + 
nextTerm.utf8ToString()); } + String dim = nextComponents[0]; + + while (true) { + String[] components = nextComponents; + + if (dimEndOrd + 1 == valueCount) { + break; + } + + nextTerm = dv.lookupOrd(dimEndOrd + 1); + nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString()); + + if (nextComponents[0].equals(components[0]) == false) { + break; + } + + if (nextComponents.length != 2) { + throw new IllegalArgumentException( + "dimension not configured to handle hierarchical field; got: " + + Arrays.toString(nextComponents) + + " " + + nextTerm.utf8ToString()); + } + + dimEndOrd++; + } + prefixToOrdRange.put(dim, new OrdRange(dimStartOrd, dimEndOrd)); + + return dimEndOrd; } /** Return the memory usage of this object in bytes. Negative values are illegal. */ @@ -194,18 +320,6 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead return new MultiSortedSetDocValues(values, starts, map, cost); } - /** Returns mapping from prefix to {@link OrdRange}. */ - @Override - public Map getPrefixToOrdRange() { - return prefixToOrdRange; - } - - /** Returns the {@link OrdRange} for this dimension. */ - @Override - public OrdRange getOrdRange(String dim) { - return prefixToOrdRange.get(dim); - } - /** Indexed field we are reading. 
*/ @Override public String getField() { @@ -222,4 +336,72 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead public int getSize() { return valueCount; } + + @Override + public FacetsConfig getFacetsConfig() { + return config; + } + + @Override + public Iterable getDims() { + return () -> + new Iterator<>() { + + final Iterator dimTreeIterator = prefixToDimTree.keySet().iterator(); + final Iterator ordRangeIterator = prefixToOrdRange.keySet().iterator(); + + @Override + public boolean hasNext() { + return ordRangeIterator.hasNext() || dimTreeIterator.hasNext(); + } + + @Override + public String next() { + if (dimTreeIterator.hasNext()) { + return dimTreeIterator.next(); + } else if (ordRangeIterator.hasNext()) { + return ordRangeIterator.next(); + } else { + return null; + } + } + }; + } + + /* Flat facet operations */ + + @Override + public Map getPrefixToOrdRange() { + return prefixToOrdRange; + } + + @Override + public OrdRange getOrdRange(String dim) { + if (config != null && config.getDimConfig(dim).hierarchical) { + throw new UnsupportedOperationException( + "This operation is only supported for flat dimensions"); + } + return prefixToOrdRange.get(dim); + } + + /* Hierarchical facet operations */ + + @Override + public DimTree getDimTree(String dim) { + if (config == null || config.getDimConfig(dim).hierarchical == false) { + throw new UnsupportedOperationException( + "This opperation is only supported for hierarchical facets"); + } + return prefixToDimTree.get(dim); + } + + private static final class OrdAndComponent { + int ord; + String[] component; + + public OrdAndComponent(int ord, String[] component) { + this.ord = ord; + this.component = component; + } + } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java index 19ff96ffa52..10351fc7aab 100644 --- 
a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java @@ -22,7 +22,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; -import java.util.Map; +import java.util.Objects; +import java.util.PrimitiveIterator; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetUtils; import org.apache.lucene.facet.Facets; @@ -31,6 +32,7 @@ import org.apache.lucene.facet.FacetsCollector.MatchingDocs; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.TopOrdAndIntQueue; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.DimTree; import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexReader; @@ -66,10 +68,13 @@ import org.apache.lucene.util.LongValues; public class SortedSetDocValuesFacetCounts extends Facets { final SortedSetDocValuesReaderState state; + final FacetsConfig stateConfig; final SortedSetDocValues dv; final String field; final int[] counts; + private static final String[] emptyPath = new String[0]; + /** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. 
*/ public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state) throws IOException { this(state, null); @@ -80,8 +85,9 @@ public class SortedSetDocValuesFacetCounts extends Facets { throws IOException { this.state = state; this.field = state.getField(); - dv = state.getDocValues(); - counts = new int[state.getSize()]; + this.stateConfig = Objects.requireNonNullElse(state.getFacetsConfig(), new FacetsConfig()); + this.dv = state.getDocValues(); + this.counts = new int[state.getSize()]; if (hits == null) { // browse only countAll(); @@ -95,17 +101,32 @@ public class SortedSetDocValuesFacetCounts extends Facets { if (topN <= 0) { throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")"); } - if (path.length > 0) { - throw new IllegalArgumentException("path should be 0 length"); + + if (stateConfig.getDimConfig(dim).hierarchical) { + int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path))); + if (pathOrd < 0) { + // path was never indexed + return null; + } + DimTree dimTree = state.getDimTree(dim); + return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN); + } else { + if (path.length > 0) { + throw new IllegalArgumentException( + "Field is not configured as hierarchical, path should be 0 length"); + } + OrdRange ordRange = state.getOrdRange(dim); + if (ordRange == null) { + // means dimension was never indexed + return null; + } + return getDim(dim, null, -1, ordRange.iterator(), topN); } - OrdRange ordRange = state.getOrdRange(dim); - if (ordRange == null) { - return null; // means dimension was never indexed - } - return getDim(dim, ordRange, topN); } - private FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException { + private FacetResult getDim( + String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN) + throws IOException { TopOrdAndIntQueue q = null; @@ -115,7 +136,8 @@ public class SortedSetDocValuesFacetCounts extends Facets { int 
childCount = 0; TopOrdAndIntQueue.OrdAndValue reuse = null; - for (int ord = ordRange.start; ord <= ordRange.end; ord++) { + while (childOrds.hasNext()) { + int ord = childOrds.next(); if (counts[ord] > 0) { dimCount += counts[ord]; childCount++; @@ -145,12 +167,19 @@ public class SortedSetDocValuesFacetCounts extends Facets { LabelAndValue[] labelValues = new LabelAndValue[q.size()]; for (int i = labelValues.length - 1; i >= 0; i--) { TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); + assert ordAndValue != null; final BytesRef term = dv.lookupOrd(ordAndValue.ord); String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); - labelValues[i] = new LabelAndValue(parts[1], ordAndValue.value); + labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value); } - return new FacetResult(dim, new String[0], dimCount, labelValues, childCount); + if (pathOrd == -1) { + // not hierarchical facet + return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount); + } else { + // hierarchical facet + return new FacetResult(dim, path, counts[pathOrd], labelValues, childCount); + } } private void countOneSegment( @@ -317,10 +346,19 @@ public class SortedSetDocValuesFacetCounts extends Facets { public List getAllDims(int topN) throws IOException { List results = new ArrayList<>(); - for (Map.Entry ent : state.getPrefixToOrdRange().entrySet()) { - FacetResult fr = getDim(ent.getKey(), ent.getValue(), topN); - if (fr != null) { - results.add(fr); + for (String dim : state.getDims()) { + if (stateConfig.getDimConfig(dim).hierarchical) { + DimTree dimTree = state.getDimTree(dim); + FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN); + if (fr != null) { + results.add(fr); + } + } else { + OrdRange ordRange = state.getOrdRange(dim); + FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN); + if (fr != null) { + results.add(fr); + } } } diff --git 
a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetField.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetField.java index 68fd5406f53..6138c6b150d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetField.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetField.java @@ -19,6 +19,7 @@ package org.apache.lucene.facet.sortedset; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.facet.FacetField; +import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.index.IndexOptions; /** @@ -40,20 +41,39 @@ public class SortedSetDocValuesFacetField extends Field { /** Dimension. */ public final String dim; - /** Label. */ - public final String label; + /** Path. */ + public final String[] path; + + /** + * String form of path. + * + * @deprecated This field will be removed in a future version. {@link + * FacetsConfig#pathToString(String[])} can be applied to {@code path} as a replacement if + * string path is desired. + */ + @Deprecated public final String label; /** Sole constructor. */ - public SortedSetDocValuesFacetField(String dim, String label) { + public SortedSetDocValuesFacetField(String dim, String... 
path) { super("dummy", TYPE); - FacetField.verifyLabel(label); + for (String label : path) { + FacetField.verifyLabel(label); + } FacetField.verifyLabel(dim); + if (path.length == 0) { + throw new IllegalArgumentException("path must have at least one element"); + } this.dim = dim; - this.label = label; + this.path = path; + this.label = FacetsConfig.pathToString(path); } @Override public String toString() { - return "SortedSetDocValuesFacetField(dim=" + dim + " label=" + label + ")"; + return "SortedSetDocValuesFacetField(dim=" + + dim + + " path=" + + FacetsConfig.pathToString(path) + + ")"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java index 1c2bc412333..0d712d599f2 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java @@ -17,10 +17,14 @@ package org.apache.lucene.facet.sortedset; import java.io.IOException; +import java.util.List; import java.util.Map; +import java.util.PrimitiveIterator; +import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.FixedBitSet; /** * Wraps a {@link IndexReader} and resolves ords using existing {@link SortedSetDocValues} APIs @@ -36,10 +40,7 @@ import org.apache.lucene.util.Accountable; */ public abstract class SortedSetDocValuesReaderState implements Accountable { - /** - * Holds start/end range of ords, which maps to one dimension (someday we may generalize it to map - * to hierarchies within one dimension). - */ + /** Holds start/end range of ords, which maps to one dimension. Only used for flat hierarchies. 
*/ public static final class OrdRange { /** Start of range, inclusive: */ public final int start; @@ -51,8 +52,111 @@ public abstract class SortedSetDocValuesReaderState implements Accountable { this.start = start; this.end = end; } + + /** Iterates from start to end ord (inclusive) */ + public PrimitiveIterator.OfInt iterator() { + return new PrimitiveIterator.OfInt() { + int current = start; + + @Override + public int nextInt() { + if (current > end) { + return INVALID_ORDINAL; + } + return current++; + } + + @Override + public boolean hasNext() { + return current <= end; + } + }; + } } + /** + * Holds children and sibling information for a single dimension. Only used with hierarchical + * dimensions. + */ + public static final class DimTree { + private final FixedBitSet hasChildren; + // TODO: This array can take up a lot of space. Change type based on input size maybe? + private final int[] siblings; + + /** The first ord of the dimension */ + public final int dimStartOrd; + + /** Sibling and children must be of same length */ + public DimTree(int dimStartOrd, List sibling, List hasChildren) { + if (sibling.size() != hasChildren.size()) { + throw new IllegalArgumentException( + "Sibling list and children list must have the same size. 
Got sibling list size of " + + sibling.size() + + " and child list size of " + + hasChildren.size()); + } + this.hasChildren = new FixedBitSet(hasChildren.size()); + this.siblings = new int[sibling.size()]; + for (int i = 0; i < sibling.size(); i++) { + if (hasChildren.get(i)) { + assert i < sibling.size() - 1; + this.hasChildren.set(i); + } + assert this.siblings[i] < sibling.size(); + this.siblings[i] = sibling.get(i); + } + this.dimStartOrd = dimStartOrd; + } + + /** Iterates through all first level children of dimension */ + public PrimitiveIterator.OfInt iterator() { + return iterator(dimStartOrd); + } + + /** Iterates through all children of given pathOrd */ + public PrimitiveIterator.OfInt iterator(int pathOrd) { + return new PrimitiveIterator.OfInt() { + + boolean atStart = true; + int currentOrd = pathOrd - dimStartOrd; + + @Override + public int nextInt() { + if (atStart) { + if (currentOrd < 0 || currentOrd >= hasChildren.length()) { + return INVALID_ORDINAL; + } + atStart = false; + if (hasChildren.get(currentOrd)) { + currentOrd++; + return currentOrd + dimStartOrd; + } else { + return INVALID_ORDINAL; + } + } else { + currentOrd = siblings[currentOrd]; + return currentOrd + dimStartOrd; + } + } + + @Override + public boolean hasNext() { + if (atStart) { + if (currentOrd < 0 || currentOrd >= hasChildren.length()) { + return false; + } + return hasChildren.get(currentOrd); + } else { + return siblings[currentOrd] != INVALID_ORDINAL; + } + } + }; + } + } + + /** Invalid ordinal const */ + public static final int INVALID_ORDINAL = -1; + /** Sole constructor. */ protected SortedSetDocValuesReaderState() {} @@ -62,15 +166,28 @@ public abstract class SortedSetDocValuesReaderState implements Accountable { /** Indexed field we are reading. */ public abstract String getField(); + /** Returns top-level index reader. */ + public abstract IndexReader getReader(); + + /** Number of unique labels. 
*/ + public abstract int getSize(); + + /** Returns the associated facet config. */ + public abstract FacetsConfig getFacetsConfig(); + + /* Only used for flat facets (dim/value) */ + /** Returns the {@link OrdRange} for this dimension. */ public abstract OrdRange getOrdRange(String dim); /** Returns mapping from prefix to {@link OrdRange}. */ public abstract Map getPrefixToOrdRange(); - /** Returns top-level index reader. */ - public abstract IndexReader getReader(); + /* Only used for hierarchical facets */ - /** Number of unique labels. */ - public abstract int getSize(); + /** Returns the {@link DimTree} for this dimension. */ + public abstract DimTree getDimTree(String dim); + + /** Returns an iterable over all dimensions. */ + public abstract Iterable getDims(); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java index e83b3868388..4ad5664e3cb 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java @@ -223,7 +223,7 @@ public abstract class FacetTestCase extends LuceneTestCase { } else if (b.value.doubleValue() > a.value.doubleValue()) { return 1; } else { - return 0; + return a.dim.compareTo(b.dim); } } }); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java index d6e9b0b3bf2..3f72a769a8e 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java @@ -18,14 +18,19 @@ package org.apache.lucene.facet.sortedset; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import 
java.util.Map; +import java.util.Set; +import java.util.Stack; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; @@ -47,6 +52,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NamedThreadFactory; public class TestSortedSetDocValuesFacets extends FacetTestCase { @@ -85,6 +91,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { try { Facets facets = getAllFacets(searcher, state, exec); + // value should ideally be 2 but SSDV facets are bugged here assertEquals( "dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n", facets.getTopChildren(10, "a").toString()); @@ -105,6 +112,84 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } } + public void testBasicHierarchical() throws Exception { + FacetsConfig config = new FacetsConfig(); + config.setMultiValued("a", true); + config.setMultiValued("c", true); + config.setHierarchical("c", true); + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo")); + doc.add(new SortedSetDocValuesFacetField("a", "bar")); + doc.add(new SortedSetDocValuesFacetField("a", "zoo")); + doc.add(new SortedSetDocValuesFacetField("b", "baz")); + doc.add(new SortedSetDocValuesFacetField("c", "buzz")); + doc.add(new SortedSetDocValuesFacetField("c", "buzz", "bee")); + doc.add(new SortedSetDocValuesFacetField("c", "buzz", "bif")); + doc.add(new 
SortedSetDocValuesFacetField("c", "buzz", "bif", "baf")); + doc.add(new SortedSetDocValuesFacetField("c", "buzz", "biz")); + doc.add(new SortedSetDocValuesFacetField("c", "buzz", "biz", "bar")); + writer.addDocument(config.build(doc)); + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo")); + doc.add(new SortedSetDocValuesFacetField("c", "buzz", "bif", "baf")); + writer.addDocument(config.build(doc)); + + // NRT open + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = + new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + + ExecutorService exec = randomExecutorServiceOrNull(); + try { + Facets facets = getAllFacets(searcher, state, exec); + + // since a is not set to be hierarchical, its value count will be bugged as ancestral + // paths are not indexed + assertEquals( + "dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n", + facets.getTopChildren(10, "a").toString()); + assertEquals( + "dim=b path=[] value=1 childCount=1\n baz (1)\n", + facets.getTopChildren(10, "b").toString()); + assertEquals( + "dim=c path=[buzz] value=2 childCount=3\n bif (2)\n bee (1)\n biz (1)\n", + facets.getTopChildren(10, "c", "buzz").toString()); + assertEquals( + "dim=c path=[buzz, bif] value=2 childCount=1\n baf (2)\n", + facets.getTopChildren(10, "c", "buzz", "bif").toString()); + + // DrillDown: + DrillDownQuery q = new DrillDownQuery(config); + q.add("a", "foo"); + q.add("b", "baz"); + TopDocs hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("c", "buzz", "bif"); + hits = searcher.search(q, 2); + assertEquals(2, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("c", "buzz", "biz", "bar"); + hits = searcher.search(q, 2); + assertEquals(1, hits.totalHits.value); + } finally { + 
if (exec != null) exec.shutdownNow(); + } + } + } + } + // See: LUCENE-10070 public void testCountAll() throws Exception { try (Directory dir = newDirectory(); @@ -158,6 +243,67 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } } + public void testHierarchicalCountAll() throws Exception { + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("b", true); + + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.NO)); + doc.add(new SortedSetDocValuesFacetField("a", "foo")); + doc.add(new SortedSetDocValuesFacetField("b", "buzz", "bee")); + writer.addDocument(config.build(doc)); + + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new SortedSetDocValuesFacetField("a", "bar")); + doc.add(new SortedSetDocValuesFacetField("b", "buzz", "baz")); + writer.addDocument(config.build(doc)); + + writer.deleteDocuments(new Term("id", "0")); + + // NRT open + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = + new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + + Facets facets = new SortedSetDocValuesFacetCounts(state); + + assertEquals( + "dim=a path=[] value=1 childCount=1\n bar (1)\n", + facets.getTopChildren(10, "a").toString()); + assertEquals( + "dim=b path=[buzz] value=1 childCount=1\n baz (1)\n", + facets.getTopChildren(10, "b", "buzz").toString()); + + ExecutorService exec = + new ThreadPoolExecutor( + 1, + TestUtil.nextInt(random(), 2, 6), + Long.MAX_VALUE, + TimeUnit.MILLISECONDS, + new LinkedBlockingQueue(), + new NamedThreadFactory("TestIndexSearcher")); + try { + facets = new ConcurrentSortedSetDocValuesFacetCounts(state, exec); + + assertEquals( + "dim=a path=[] value=1 childCount=1\n bar (1)\n", + facets.getTopChildren(10, 
"a").toString()); + assertEquals( + "dim=b path=[buzz] value=1 childCount=1\n baz (1)\n", + facets.getTopChildren(10, "b", "buzz").toString()); + } finally { + exec.shutdownNow(); + } + } + } + } + public void testBasicSingleValued() throws Exception { FacetsConfig config = new FacetsConfig(); config.setMultiValued("a", false); @@ -210,6 +356,57 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } } + public void testHierarchicalBasicSingleValues() throws Exception { + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("c", true); + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("c", "buzz", "bar")); + writer.addDocument(config.build(doc)); + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("c", "buzz", "baz")); + writer.addDocument(config.build(doc)); + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("c", "baz")); + writer.addDocument(config.build(doc)); + + // NRT open + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = + new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + + ExecutorService exec = randomExecutorServiceOrNull(); + try { + Facets facets = getAllFacets(searcher, state, exec); + + assertEquals( + "dim=c path=[buzz] value=2 childCount=2\n bar (1)\n baz (1)\n", + facets.getTopChildren(10, "c", "buzz").toString()); + + DrillDownQuery q = new DrillDownQuery(config); + q.add("c", "buzz"); + TopDocs hits = searcher.search(q, 1); + assertEquals(2, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("c", "buzz", "bar"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + } finally { + if (exec != null) exec.shutdownNow(); + } + } + } + } 
+ public void testDrillDownOptions() throws Exception { FacetsConfig config = new FacetsConfig(); config.setDrillDownTermsIndexing("c", FacetsConfig.DrillDownTermsIndexing.NONE); @@ -293,6 +490,144 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } } + public void testHierarchicalDrillDownOptions() throws Exception { + FacetsConfig config = new FacetsConfig(); + config.setDrillDownTermsIndexing("c", FacetsConfig.DrillDownTermsIndexing.NONE); + config.setDrillDownTermsIndexing( + "d", FacetsConfig.DrillDownTermsIndexing.DIMENSION_AND_FULL_PATH); + config.setDrillDownTermsIndexing("e", FacetsConfig.DrillDownTermsIndexing.ALL_PATHS_NO_DIM); + config.setDrillDownTermsIndexing("f", FacetsConfig.DrillDownTermsIndexing.FULL_PATH_ONLY); + config.setDrillDownTermsIndexing("g", FacetsConfig.DrillDownTermsIndexing.ALL); + config.setHierarchical("c", true); + config.setHierarchical("d", true); + config.setHierarchical("e", true); + config.setHierarchical("f", true); + config.setHierarchical("g", true); + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("c", "biz", "baz")); + doc.add(new SortedSetDocValuesFacetField("d", "biz", "baz")); + doc.add(new SortedSetDocValuesFacetField("e", "biz", "baz")); + doc.add(new SortedSetDocValuesFacetField("f", "biz", "baz")); + doc.add(new SortedSetDocValuesFacetField("g", "biz", "baz")); + writer.addDocument(config.build(doc)); + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo")); + writer.addDocument(config.build(doc)); + + // NRT open + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + // Drill down with different indexing configuration options + DrillDownQuery q = new DrillDownQuery(config); + q.add("c"); + TopDocs hits = searcher.search(q, 1); + assertEquals(0, 
hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("c", "biz"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("c", "biz", "baz"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("c", "foo"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("d"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("d", "foo"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("d", "biz"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("d", "biz", "baz"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("e"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("e", "foo"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("e", "biz"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("e", "biz", "baz"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("f"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("f", "foo"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("f", "biz"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("f", "biz", "baz"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + + q = new 
DrillDownQuery(config); + q.add("g"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("g", "foo"); + hits = searcher.search(q, 1); + assertEquals(0, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("g", "biz"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + + q = new DrillDownQuery(config); + q.add("g", "biz", "baz"); + hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits.value); + } + } + } + // LUCENE-5090 @SuppressWarnings("unused") public void testStaleState() throws Exception { @@ -405,6 +740,72 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } } + public void testHierarchicalSparseFacets() throws Exception { + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("d", true); + config.setHierarchical("e", true); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("d", "foo", "bar")); + writer.addDocument(config.build(doc)); + + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("d", "foo", "baz")); + writer.addDocument(config.build(doc)); + + if (random().nextBoolean()) { + writer.commit(); + } + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("e", "biz", "baz")); + writer.addDocument(config.build(doc)); + + // NRT open + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = + new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + + ExecutorService exec = randomExecutorServiceOrNull(); + try { + Facets facets = getAllFacets(searcher, state, exec); + + // Ask for top 10 labels for any dims that have counts: + List results = facets.getAllDims(10); + + 
assertEquals(2, results.size()); + assertEquals( + "dim=d path=[] value=2 childCount=1\n foo (2)\n", results.get(0).toString()); + assertEquals( + "dim=e path=[] value=1 childCount=1\n biz (1)\n", results.get(1).toString()); + + Collection resources = state.getChildResources(); + assertTrue(state.toString().contains(FacetsConfig.DEFAULT_INDEX_FIELD_NAME)); + if (searcher.getIndexReader().leaves().size() > 1) { + assertTrue(state.ramBytesUsed() > 0); + assertFalse(resources.isEmpty()); + assertTrue(resources.toString().contains(FacetsConfig.DEFAULT_INDEX_FIELD_NAME)); + } else { + assertEquals(0, state.ramBytesUsed()); + assertTrue(resources.isEmpty()); + } + } finally { + if (exec != null) exec.shutdownNow(); + } + } + } + } + public void testSomeSegmentsMissing() throws Exception { try (Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { @@ -448,6 +849,58 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } } + public void testHierarchicalSomeSegmentsMissing() throws Exception { + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("b", true); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo1")); + doc.add(new SortedSetDocValuesFacetField("b", "foo", "bar")); + writer.addDocument(config.build(doc)); + writer.commit(); + + doc = new Document(); + writer.addDocument(config.build(doc)); + writer.commit(); + + doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("a", "foo2")); + doc.add(new SortedSetDocValuesFacetField("b", "foo", "buzz")); + writer.addDocument(config.build(doc)); + writer.commit(); + + // NRT open + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = + new 
DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + + ExecutorService exec = randomExecutorServiceOrNull(); + try { + Facets facets = getAllFacets(searcher, state, exec); + + // Ask for top 10 labels for any dims that have counts: + assertEquals( + "dim=a path=[] value=2 childCount=2\n foo1 (1)\n foo2 (1)\n", + facets.getTopChildren(10, "a").toString()); + assertEquals( + "dim=b path=[] value=2 childCount=1\n foo (2)\n", + facets.getTopChildren(10, "b").toString()); + assertEquals( + "dim=b path=[foo] value=2 childCount=2\n bar (1)\n buzz (1)\n", + facets.getTopChildren(10, "b", "foo").toString()); + } finally { + if (exec != null) exec.shutdownNow(); + } + } + } + } + public void testRandom() throws Exception { int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3; for (int fullIter = 0; fullIter < fullIterations; fullIter++) { @@ -562,6 +1015,274 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } } + public void testRandomHierarchicalFlatMix() throws Exception { + int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 
20 : 3; + for (int fullIter = 0; fullIter < fullIterations; fullIter++) { + String[] tokens = getRandomTokens(10); + + try (Directory indexDir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), indexDir)) { + FacetsConfig config = new FacetsConfig(); + int numDocs = atLeast(1000); + // Most of the time allow up to 7 dims per doc, but occasionally limit all docs to a single + // dim: + int numDims; + if (random().nextInt(10) < 8) { + numDims = TestUtil.nextInt(random(), 1, 7); + } else { + numDims = 1; + } + boolean[] hierarchicalDims = new boolean[numDims]; + for (int i = 0; i < numDims; i++) { + boolean isHierarchicalDim = random().nextBoolean(); + config.setHierarchical("dim" + i, isHierarchicalDim); + hierarchicalDims[i] = isHierarchicalDim; + } + List testDocs = getRandomDocs(tokens, numDocs, numDims); + List> testDocFacets = new ArrayList<>(); + for (TestDoc testDoc : testDocs) { + Document doc = new Document(); + Set docFacets = new HashSet<>(); + doc.add(newStringField("content", testDoc.content, Field.Store.NO)); + for (int i = 0; i < numDims; i++) { + if (hierarchicalDims[i]) { + int pathLength; + if (numDims == 1) { + pathLength = 1; + } else { + pathLength = random().nextInt(numDims - 1) + 1; + } + List path = new ArrayList<>(); + for (int j = 0; j < pathLength; j++) { + if (testDoc.dims[j] != null) { + path.add(testDoc.dims[j]); + } + } + doc.add(new SortedSetDocValuesFacetField("dim" + i, path.toArray(String[]::new))); + for (int j = 0; j < path.size(); j++) { + docFacets.add( + new SortedSetDocValuesFacetField( + "dim" + i, path.subList(0, j + 1).toArray(String[]::new))); + } + } else if (testDoc.dims[i] != null) { + doc.add(new SortedSetDocValuesFacetField("dim" + i, testDoc.dims[i])); + docFacets.add(new SortedSetDocValuesFacetField("dim" + i, testDoc.dims[i])); + } + } + testDocFacets.add(docFacets); + w.addDocument(config.build(doc)); + } + + // NRT open + try (IndexReader r = w.getReader()) { + IndexSearcher searcher = 
newSearcher(r); + + // Per-top-reader state: + SortedSetDocValuesReaderState state = + new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + ExecutorService exec = randomExecutorServiceOrNull(); + try { + int iters = atLeast(100); + for (int iter = 0; iter < iters; iter++) { + String searchToken = tokens[random().nextInt(tokens.length)]; + if (VERBOSE) { + System.out.println("\nTEST: iter content=" + searchToken); + } + FacetsCollector fc = new FacetsCollector(); + FacetsCollector.search( + searcher, new TermQuery(new Term("content", searchToken)), 10, fc); + Facets facets; + if (exec != null) { + facets = new ConcurrentSortedSetDocValuesFacetCounts(state, fc, exec); + } else { + facets = new SortedSetDocValuesFacetCounts(state, fc); + } + // Slow, yet hopefully bug-free, faceting: + Map expectedResults = new HashMap<>(); + + for (int i = 0; i < testDocs.size(); i++) { + TestDoc doc = testDocs.get(i); + if (doc.content.equals(searchToken)) { + // goes through all facets paths in the doc + for (SortedSetDocValuesFacetField facetField : testDocFacets.get(i)) { + String[] path = facetField.path; + String parentDimPathString; + if (path.length == 1) { + parentDimPathString = facetField.dim; + } else { + parentDimPathString = + facetField.dim + + FacetsConfig.DELIM_CHAR + + FacetsConfig.pathToString(path, path.length - 1); + } + FacetResult result = expectedResults.get(parentDimPathString); + if (result == null) { + String[] resultPath = new String[path.length - 1]; + System.arraycopy(path, 0, resultPath, 0, resultPath.length); + result = + new FacetResult(facetField.dim, resultPath, 0, new LabelAndValue[0], 0); + } + String child = path[path.length - 1]; + LabelAndValue[] labelAndValues = result.labelValues; + boolean containsChild = false; + for (int k = 0; k < labelAndValues.length; k++) { + if (labelAndValues[k].label.equals(child)) { + containsChild = true; + labelAndValues[k] = + new LabelAndValue( + labelAndValues[k].label, 
labelAndValues[k].value.intValue() + 1); + break; + } + } + LabelAndValue[] newLabelAndValues; + int childCount = result.childCount; + if (containsChild == false) { + newLabelAndValues = new LabelAndValue[labelAndValues.length + 1]; + System.arraycopy( + labelAndValues, 0, newLabelAndValues, 0, labelAndValues.length); + newLabelAndValues[newLabelAndValues.length - 1] = new LabelAndValue(child, 1); + childCount++; + } else { + newLabelAndValues = labelAndValues; + } + newLabelAndValues = + Arrays.stream(newLabelAndValues) + .sorted( + (o1, o2) -> { + if (o1.value.equals(o2.value)) { + return new BytesRef(o1.label).compareTo(new BytesRef(o2.label)); + } else { + return o2.value.intValue() - o1.value.intValue(); + } + }) + .collect(Collectors.toList()) + .toArray(LabelAndValue[]::new); + FacetResult newResult = + new FacetResult(result.dim, result.path, 0, newLabelAndValues, childCount); + expectedResults.put(parentDimPathString, newResult); + } + } + } + + // second pass to update values + for (int i = 0; i < testDocs.size(); i++) { + TestDoc doc = testDocs.get(i); + if (doc.content.equals(searchToken)) { + Set dimsCounted = new HashSet<>(); + for (SortedSetDocValuesFacetField facetField : testDocFacets.get(i)) { + String dimPathString = + FacetsConfig.pathToString(facetField.dim, facetField.path); + FacetResult result = expectedResults.get(dimPathString); + FacetResult dimResult = expectedResults.get(facetField.dim); + if (result != null) { + expectedResults.put( + dimPathString, + new FacetResult( + result.dim, + result.path, + result.value.intValue() + 1, + result.labelValues, + result.childCount)); + } + if (dimResult != null && dimsCounted.add(facetField.dim)) { + expectedResults.put( + facetField.dim, + new FacetResult( + dimResult.dim, + dimResult.path, + dimResult.value.intValue() + 1, + dimResult.labelValues, + dimResult.childCount)); + } + } + } + } + + List expected = new ArrayList<>(expectedResults.values()); + + List expectedAllDims = new ArrayList<>(); 
+ for (FacetResult result : expected) { + if (result.path.length == 0) { + expectedAllDims.add(result); + if (expectedAllDims.size() >= 10) { + break; + } + } + } + sortFacetResults(expectedAllDims); + + List actualAllDims = facets.getAllDims(10); + + assertEquals(expectedAllDims, actualAllDims); + + // Dfs through top children + for (FacetResult dimResult : actualAllDims) { + if (config.getDimConfig(dimResult.dim).hierarchical) { + Stack stack = new Stack<>(); + for (LabelAndValue labelAndValue : dimResult.labelValues) { + String[] path = new String[1]; + path[0] = labelAndValue.label; + stack.add(path); + } + while (stack.empty() == false) { + String[] currPath = stack.pop(); + FacetResult expectedResult = + getFacetResultForPath(expected, dimResult.dim, currPath); + FacetResult actualResult = facets.getTopChildren(10, dimResult.dim, currPath); + try { + assertEquals(expectedResult, actualResult); + } catch (AssertionError e) { + System.out.println(iter); + System.out.println(config.getDimConfig(dimResult.dim).hierarchical); + throw e; + } + if (actualResult != null) { + for (LabelAndValue labelAndValue : actualResult.labelValues) { + String[] path = new String[currPath.length + 1]; + System.arraycopy(currPath, 0, path, 0, currPath.length); + path[path.length - 1] = labelAndValue.label; + stack.add(path); + } + } + } + } + } + } + } finally { + if (exec != null) exec.shutdownNow(); + } + } + } + } + } + + private static FacetResult getFacetResultForPath( + List allPaths, String dim, String[] path) { + for (FacetResult result : allPaths) { + if (path.length == 0) { + if (result.path.length == 0 && result.dim.equals(dim)) { + return result; + } + } else { + boolean isEqualPath = true; + if (path.length != result.path.length) { + isEqualPath = false; + } else { + for (int i = 0; i < path.length; i++) { + if (path[i].equals(result.path[i]) == false) { + isEqualPath = false; + break; + } + } + } + if (isEqualPath && result.dim.equals(dim)) { + return result; + } + } + 
} + return null; + } + public void testNonExistentDimension() throws Exception { try (Directory dir = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { @@ -592,6 +1313,75 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } } + public void testHierarchicalNonExistentDimension() throws Exception { + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("fizz", true); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("foo", "bar")); + doc.add(new SortedSetDocValuesFacetField("fizz", "buzz", "baz")); + writer.addDocument(config.build(doc)); + writer.commit(); + + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + + SortedSetDocValuesReaderState state = + new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + + ExecutorService exec = randomExecutorServiceOrNull(); + try { + Facets facets = getAllFacets(searcher, state, exec); + FacetResult result = facets.getTopChildren(5, "non-existent dimension"); + + // make sure the result is null (and no exception was thrown) + assertNull(result); + + expectThrows( + IllegalArgumentException.class, + () -> { + facets.getTopChildren(5, "non-existent dimension", "with a path"); + }); + } finally { + if (exec != null) exec.shutdownNow(); + } + } + } + } + + public void testHierarchicalNonExistentPath() throws Exception { + try (Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("fizz", true); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesFacetField("fizz", "buzz", "baz")); + writer.addDocument(config.build(doc)); + writer.commit(); + + try (IndexReader r = writer.getReader()) { + IndexSearcher searcher = newSearcher(r); + + 
SortedSetDocValuesReaderState state = + new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config); + + ExecutorService exec = randomExecutorServiceOrNull(); + try { + Facets facets = getAllFacets(searcher, state, exec); + FacetResult result = facets.getTopChildren(5, "fizz", "fake", "path"); + + // make sure the result is null (and no exception was thrown) + assertNull(result); + } finally { + if (exec != null) exec.shutdownNow(); + } + } + } + } + private static Facets getAllFacets( IndexSearcher searcher, SortedSetDocValuesReaderState state, ExecutorService exec) throws IOException, InterruptedException { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestFacetLabel.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestFacetLabel.java index 90403610a62..ee612dbfd8a 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestFacetLabel.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestFacetLabel.java @@ -237,11 +237,6 @@ public class TestFacetLabel extends FacetTestCase { () -> { new SortedSetDocValuesFacetField("", "abc"); }); - expectThrows( - IllegalArgumentException.class, - () -> { - new SortedSetDocValuesFacetField("dim", null); - }); expectThrows( IllegalArgumentException.class, () -> {