LUCENE-10250: Add support for arbitrary length hierarchical SSDV facets (#509)

This commit is contained in:
Marc D'mello 2022-01-10 08:52:14 -08:00 committed by GitHub
parent e750f6cd37
commit b4e27f2c63
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 1298 additions and 103 deletions

View File

@ -471,19 +471,35 @@ public class FacetsConfig {
private void processSSDVFacetFields( private void processSSDVFacetFields(
Map<String, List<SortedSetDocValuesFacetField>> byField, Document doc) { Map<String, List<SortedSetDocValuesFacetField>> byField, Document doc) {
for (Map.Entry<String, List<SortedSetDocValuesFacetField>> ent : byField.entrySet()) { for (Map.Entry<String, List<SortedSetDocValuesFacetField>> ent : byField.entrySet()) {
String indexFieldName = ent.getKey(); String indexFieldName = ent.getKey();
for (SortedSetDocValuesFacetField facetField : ent.getValue()) { for (SortedSetDocValuesFacetField facetField : ent.getValue()) {
FacetLabel facetLabel = new FacetLabel(facetField.dim, facetField.label); FacetLabel facetLabel = new FacetLabel(facetField.dim, facetField.path);
String fullPath = pathToString(facetLabel.components, facetLabel.length); DimConfig dimConfig = getDimConfig(facetField.dim);
if (dimConfig.hierarchical) {
for (int i = 0; i < facetLabel.length; i++) {
String fullPath = pathToString(facetLabel.components, i + 1);
// For facet counts: // For facet counts:
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath))); doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath)));
}
} else {
if (facetLabel.length != 2) {
throw new IllegalArgumentException(
"dimension \""
+ facetField.dim
+ "\" is not hierarchical yet has "
+ facetField.path.length
+ " components");
}
String fullPath = pathToString(facetLabel.components, facetLabel.length);
// For facet counts:
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath)));
}
// For drill-down: // For drill-down:
indexDrillDownTerms(doc, indexFieldName, getDimConfig(facetField.dim), facetLabel); indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel);
} }
} }
} }
@ -538,7 +554,7 @@ public class FacetsConfig {
private static final char ESCAPE_CHAR = '\u001E'; private static final char ESCAPE_CHAR = '\u001E';
/** Turns a dim + path into an encoded string. */ /** Turns a dim + path into an encoded string. */
public static String pathToString(String dim, String[] path) { public static String pathToString(String dim, String... path) {
String[] fullPath = new String[1 + path.length]; String[] fullPath = new String[1 + path.length];
fullPath[0] = dim; fullPath[0] = dim;
System.arraycopy(path, 0, fullPath, 1, path.length); System.arraycopy(path, 0, fullPath, 1, path.length);

View File

@ -22,7 +22,8 @@ import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Objects;
import java.util.PrimitiveIterator;
import java.util.concurrent.Callable; import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
@ -64,10 +65,13 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
final ExecutorService exec; final ExecutorService exec;
final SortedSetDocValuesReaderState state; final SortedSetDocValuesReaderState state;
final FacetsConfig stateConfig;
final SortedSetDocValues dv; final SortedSetDocValues dv;
final String field; final String field;
final AtomicIntegerArray counts; final AtomicIntegerArray counts;
private static final String[] emptyPath = new String[0];
/** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. */ /** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. */
public ConcurrentSortedSetDocValuesFacetCounts( public ConcurrentSortedSetDocValuesFacetCounts(
SortedSetDocValuesReaderState state, ExecutorService exec) SortedSetDocValuesReaderState state, ExecutorService exec)
@ -81,6 +85,7 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
throws IOException, InterruptedException { throws IOException, InterruptedException {
this.state = state; this.state = state;
this.field = state.getField(); this.field = state.getField();
this.stateConfig = Objects.requireNonNullElse(state.getFacetsConfig(), new FacetsConfig());
this.exec = exec; this.exec = exec;
dv = state.getDocValues(); dv = state.getDocValues();
counts = new AtomicIntegerArray(state.getSize()); counts = new AtomicIntegerArray(state.getSize());
@ -97,17 +102,32 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
if (topN <= 0) { if (topN <= 0) {
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")"); throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
} }
if (stateConfig.getDimConfig(dim).hierarchical) {
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
if (pathOrd < 0) {
// path was never indexed
return null;
}
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
} else {
if (path.length > 0) { if (path.length > 0) {
throw new IllegalArgumentException("path should be 0 length"); throw new IllegalArgumentException(
"Field is not configured as hierarchical, path should be 0 length");
} }
OrdRange ordRange = state.getOrdRange(dim); OrdRange ordRange = state.getOrdRange(dim);
if (ordRange == null) { if (ordRange == null) {
return null; // means dimension was never indexed // means dimension was never indexed
return null;
}
return getDim(dim, null, -1, ordRange.iterator(), topN);
} }
return getDim(dim, ordRange, topN);
} }
private FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException { private FacetResult getDim(
String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN)
throws IOException {
TopOrdAndIntQueue q = null; TopOrdAndIntQueue q = null;
@ -118,7 +138,8 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
TopOrdAndIntQueue.OrdAndValue reuse = null; TopOrdAndIntQueue.OrdAndValue reuse = null;
for (int ord = ordRange.start; ord <= ordRange.end; ord++) { while (childOrds.hasNext()) {
int ord = childOrds.next();
if (counts.get(ord) > 0) { if (counts.get(ord) > 0) {
dimCount += counts.get(ord); dimCount += counts.get(ord);
childCount++; childCount++;
@ -148,12 +169,19 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
LabelAndValue[] labelValues = new LabelAndValue[q.size()]; LabelAndValue[] labelValues = new LabelAndValue[q.size()];
for (int i = labelValues.length - 1; i >= 0; i--) { for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
assert ordAndValue != null;
final BytesRef term = dv.lookupOrd(ordAndValue.ord); final BytesRef term = dv.lookupOrd(ordAndValue.ord);
String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
labelValues[i] = new LabelAndValue(parts[1], ordAndValue.value); labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
} }
return new FacetResult(dim, new String[0], dimCount, labelValues, childCount); if (pathOrd == -1) {
// not hierarchical facet
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
} else {
// hierarchical facet
return new FacetResult(dim, path, counts.get(pathOrd), labelValues, childCount);
}
} }
private class CountOneSegment implements Callable<Void> { private class CountOneSegment implements Callable<Void> {
@ -365,11 +393,20 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
public List<FacetResult> getAllDims(int topN) throws IOException { public List<FacetResult> getAllDims(int topN) throws IOException {
List<FacetResult> results = new ArrayList<>(); List<FacetResult> results = new ArrayList<>();
for (Map.Entry<String, OrdRange> ent : state.getPrefixToOrdRange().entrySet()) { for (String dim : state.getDims()) {
FacetResult fr = getDim(ent.getKey(), ent.getValue(), topN); if (stateConfig.getDimConfig(dim).hierarchical) {
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN);
if (fr != null) { if (fr != null) {
results.add(fr); results.add(fr);
} }
} else {
OrdRange ordRange = state.getOrdRange(dim);
FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN);
if (fr != null) {
results.add(fr);
}
}
} }
// Sort by highest count: // Sort by highest count:

View File

@ -17,12 +17,15 @@
package org.apache.lucene.facet.sortedset; package org.apache.lucene.facet.sortedset;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Stack;
import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
@ -51,20 +54,42 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead
private final Map<String, OrdinalMap> cachedOrdMaps = new HashMap<>(); private final Map<String, OrdinalMap> cachedOrdMaps = new HashMap<>();
private final FacetsConfig config;
/** Used for hierarchical dims. */
private final Map<String, DimTree> prefixToDimTree = new HashMap<>();
/** Used for flat dims. */
private final Map<String, OrdRange> prefixToOrdRange = new HashMap<>(); private final Map<String, OrdRange> prefixToOrdRange = new HashMap<>();
/** /**
* Creates this, pulling doc values from the default {@link * Creates this with a config, pulling doc values from the default {@link
* FacetsConfig#DEFAULT_INDEX_FIELD_NAME}.
*/
public DefaultSortedSetDocValuesReaderState(IndexReader reader, FacetsConfig config)
throws IOException {
// Delegate to the (reader, field, config) constructor using the default index field name.
this(reader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME, config);
}
/**
* Creates this without a config, pulling doc values from the default {@link
* FacetsConfig#DEFAULT_INDEX_FIELD_NAME}. * FacetsConfig#DEFAULT_INDEX_FIELD_NAME}.
*/ */
public DefaultSortedSetDocValuesReaderState(IndexReader reader) throws IOException { public DefaultSortedSetDocValuesReaderState(IndexReader reader) throws IOException {
this(reader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME); this(reader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME, null);
}
/**
 * Creates this without a config, pulling doc values from the specified field.
 *
 * <p>Delegates to the (reader, field, config) constructor with a {@code null} config.
 *
 * @param reader top-level index reader to pull doc values from
 * @param field name of the indexed field holding the facet doc values
 * @throws IOException propagated from the delegated constructor
 */
public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException {
this(reader, field, null);
}
/** Creates this, pulling doc values from the specified field. */ /** Creates this, pulling doc values from the specified field. */
public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException { public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field, FacetsConfig config)
throws IOException {
this.field = field; this.field = field;
this.reader = reader; this.reader = reader;
this.config = config;
// We need this to create thread-safe MultiSortedSetDV // We need this to create thread-safe MultiSortedSetDV
// per collector: // per collector:
@ -79,40 +104,141 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead
} }
valueCount = (int) dv.getValueCount(); valueCount = (int) dv.getValueCount();
// TODO: we can make this more efficient if eg we can be int ord = 0;
// "involved" when OrdinalMap is being created? Ie see while (ord != valueCount) {
// each term/ord it's assigning as it goes... BytesRef term = dv.lookupOrd(ord);
String lastDim = null;
int startOrd = -1;
// TODO: this approach can work for full hierarchy?;
// TaxoReader can't do this since ords are not in
// "sorted order" ... but we should generalize this to
// support arbitrary hierarchy:
for (int ord = 0; ord < valueCount; ord++) {
final BytesRef term = dv.lookupOrd(ord);
String[] components = FacetsConfig.stringToPath(term.utf8ToString()); String[] components = FacetsConfig.stringToPath(term.utf8ToString());
if (components.length != 2) { String dim = components[0];
throw new IllegalArgumentException( if (config != null && config.getDimConfig(dim).hierarchical) {
"this class can only handle 2 level hierarchy (dim/value); got: " ord = createOneHierarchicalFacetDimState(dv, ord) + 1;
+ Arrays.toString(components) } else {
+ " " ord = createOneFlatFacetDimState(dv, ord) + 1;
+ term.utf8ToString());
} }
if (!components[0].equals(lastDim)) {
if (lastDim != null) {
prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord - 1));
}
startOrd = ord;
lastDim = components[0];
} }
} }
if (lastDim != null) { // returns last ord of dimension
prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount - 1)); private int createOneHierarchicalFacetDimState(SortedSetDocValues dv, int dimStartOrd)
throws IOException {
List<Boolean> hasChildren = new ArrayList<>();
List<Integer> siblings = new ArrayList<>();
// stack of paths with unfulfilled siblings
Stack<OrdAndComponent> siblingStack = new Stack<>();
int dimEndOrd = dimStartOrd;
BytesRef nextTerm = dv.lookupOrd(dimEndOrd);
String[] nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString());
String dim = nextComponents[0];
while (true) {
String[] components = nextComponents;
int ord = dimEndOrd - dimStartOrd;
while (siblingStack.empty() == false
&& siblingStack.peek().component.length >= components.length) {
OrdAndComponent possibleSibling = siblingStack.pop();
if (possibleSibling.component.length == components.length) {
// lengths are equal, all non-siblings of equal length will have already been popped off
// so this must be sibling
siblings.set(possibleSibling.ord, ord);
} }
} }
if (dimEndOrd + 1 == valueCount) {
// current ord needs to be added, can't have children or siblings
siblings.add(-1);
hasChildren.add(false);
break;
}
nextTerm = dv.lookupOrd(dimEndOrd + 1);
nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString());
if (nextComponents[0].equals(components[0]) == false) {
// current ord needs to be added, can't have children or siblings
siblings.add(-1);
hasChildren.add(false);
break;
}
if (components.length < nextComponents.length) {
// next ord must be a direct child of current ord, this is because we are indexing all
// ancestral paths
hasChildren.add(ord, true);
// we don't know if this ord has a sibling or where it's sibling could be yet
siblingStack.push(new OrdAndComponent(ord, components));
// we still add INVALID_ORDINAL, which will be replaced if a valid sibling is found
siblings.add(ord, INVALID_ORDINAL);
} else if (components.length == nextComponents.length) {
// next ord must be a sibling of current and there are no direct children of current, this
// is because we
// are indexing all ancestral paths
siblings.add(ord, ord + 1);
hasChildren.add(ord, false);
} else {
// components.length > nextComponents.length
// next ord is neither sibling nor child
siblings.add(ord, INVALID_ORDINAL);
hasChildren.add(ord, false);
}
dimEndOrd++;
}
prefixToDimTree.put(dim, new DimTree(dimStartOrd, siblings, hasChildren));
return dimEndOrd;
}
/**
 * Scans the flat (dim/value) dimension whose first term sits at {@code dimStartOrd} and stores
 * its ord range in {@code prefixToOrdRange}.
 *
 * @param dv doc values used to look up the term of each ord
 * @param dimStartOrd ord of the first term of the dimension
 * @return the last ord that belongs to the dimension
 * @throws IllegalArgumentException if a term of the dimension does not consist of exactly two
 *     components
 * @throws IOException if looking up a term fails
 */
private int createOneFlatFacetDimState(SortedSetDocValues dv, int dimStartOrd)
    throws IOException {
  BytesRef term = dv.lookupOrd(dimStartOrd);
  String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
  if (parts.length != 2) {
    throw new IllegalArgumentException(
        "dimension not configured to handle hierarchical field; got: "
            + Arrays.toString(parts)
            + " "
            + term.utf8ToString());
  }
  String dim = parts[0];

  // Walk forward until the terms run out or a term of a different dimension shows up.
  int lastOrd = dimStartOrd;
  for (int candidate = dimStartOrd + 1; candidate != valueCount; candidate++) {
    term = dv.lookupOrd(candidate);
    parts = FacetsConfig.stringToPath(term.utf8ToString());
    if (parts[0].equals(dim) == false) {
      break;
    }
    if (parts.length != 2) {
      throw new IllegalArgumentException(
          "dimension not configured to handle hierarchical field; got: "
              + Arrays.toString(parts)
              + " "
              + term.utf8ToString());
    }
    lastOrd = candidate;
  }

  prefixToOrdRange.put(dim, new OrdRange(dimStartOrd, lastOrd));

  return lastOrd;
}
/** Return the memory usage of this object in bytes. Negative values are illegal. */ /** Return the memory usage of this object in bytes. Negative values are illegal. */
@Override @Override
public long ramBytesUsed() { public long ramBytesUsed() {
@ -194,18 +320,6 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead
return new MultiSortedSetDocValues(values, starts, map, cost); return new MultiSortedSetDocValues(values, starts, map, cost);
} }
/** Returns mapping from prefix to {@link OrdRange}. */
@Override
public Map<String, OrdRange> getPrefixToOrdRange() {
return prefixToOrdRange;
}
/** Returns the {@link OrdRange} for this dimension. */
@Override
public OrdRange getOrdRange(String dim) {
return prefixToOrdRange.get(dim);
}
/** Indexed field we are reading. */ /** Indexed field we are reading. */
@Override @Override
public String getField() { public String getField() {
@ -222,4 +336,72 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead
public int getSize() { public int getSize() {
return valueCount; return valueCount;
} }
/**
 * Returns the {@link FacetsConfig} held in this state's {@code config} field; the value may be
 * {@code null}, since the config-less constructors pass {@code null}.
 */
@Override
public FacetsConfig getFacetsConfig() {
return config;
}
/**
 * Returns all dimensions known to this state: first the keys of {@code prefixToDimTree}
 * (hierarchical dimensions), then the keys of {@code prefixToOrdRange} (flat dimensions).
 *
 * <p>Each call to {@link Iterable#iterator()} creates fresh iterators over the two key sets.
 * The returned iterator follows the {@link Iterator} contract: {@code next()} throws {@link
 * java.util.NoSuchElementException} once both key sets are exhausted.
 */
@Override
public Iterable<String> getDims() {
  return () ->
      new Iterator<>() {
        final Iterator<String> dimTreeIterator = prefixToDimTree.keySet().iterator();
        final Iterator<String> ordRangeIterator = prefixToOrdRange.keySet().iterator();

        @Override
        public boolean hasNext() {
          return dimTreeIterator.hasNext() || ordRangeIterator.hasNext();
        }

        @Override
        public String next() {
          if (dimTreeIterator.hasNext()) {
            return dimTreeIterator.next();
          }
          // When both iterators are exhausted this throws NoSuchElementException, as the
          // Iterator contract requires, instead of handing out null.
          return ordRangeIterator.next();
        }
      };
}
/* Flat facet operations */
/**
 * Returns the mapping from dimension name to {@link OrdRange}. The internal map itself is
 * returned, not a copy.
 */
@Override
public Map<String, OrdRange> getPrefixToOrdRange() {
return prefixToOrdRange;
}
/**
 * Returns the {@link OrdRange} recorded for a flat dimension.
 *
 * @param dim dimension name
 * @return the ord range of {@code dim}, or {@code null} if no range is recorded for it
 * @throws UnsupportedOperationException if a config is present and marks {@code dim} as
 *     hierarchical
 */
@Override
public OrdRange getOrdRange(String dim) {
  boolean flat = config == null || config.getDimConfig(dim).hierarchical == false;
  if (flat) {
    return prefixToOrdRange.get(dim);
  }
  throw new UnsupportedOperationException(
      "This operation is only supported for flat dimensions");
}
/* Hierarchical facet operations */
/**
 * Returns the {@link DimTree} recorded for a hierarchical dimension.
 *
 * @param dim dimension name
 * @return the tree of {@code dim}, or {@code null} if no tree is recorded for it
 * @throws UnsupportedOperationException if there is no config, or the config does not mark
 *     {@code dim} as hierarchical
 */
@Override
public DimTree getDimTree(String dim) {
  if (config == null || config.getDimConfig(dim).hierarchical == false) {
    throw new UnsupportedOperationException(
        "This operation is only supported for hierarchical facets");
  }
  return prefixToDimTree.get(dim);
}
/**
 * Immutable pair of an ord and the path components of the term at that ord. Used as the stack
 * entry for paths whose sibling has not been determined yet.
 */
private static final class OrdAndComponent {
  /** Ord of the path. */
  final int ord;

  /** Components of the path; the array is stored as given, not copied. */
  final String[] component;

  public OrdAndComponent(int ord, String[] component) {
    this.ord = ord;
    this.component = component;
  }
}
} }

View File

@ -22,7 +22,8 @@ import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Objects;
import java.util.PrimitiveIterator;
import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetUtils; import org.apache.lucene.facet.FacetUtils;
import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.Facets;
@ -31,6 +32,7 @@ import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue; import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.DimTree;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange; import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
@ -66,10 +68,13 @@ import org.apache.lucene.util.LongValues;
public class SortedSetDocValuesFacetCounts extends Facets { public class SortedSetDocValuesFacetCounts extends Facets {
final SortedSetDocValuesReaderState state; final SortedSetDocValuesReaderState state;
final FacetsConfig stateConfig;
final SortedSetDocValues dv; final SortedSetDocValues dv;
final String field; final String field;
final int[] counts; final int[] counts;
private static final String[] emptyPath = new String[0];
/** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. */ /** Returns all facet counts, same result as searching on {@link MatchAllDocsQuery} but faster. */
public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state) throws IOException { public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state) throws IOException {
this(state, null); this(state, null);
@ -80,8 +85,9 @@ public class SortedSetDocValuesFacetCounts extends Facets {
throws IOException { throws IOException {
this.state = state; this.state = state;
this.field = state.getField(); this.field = state.getField();
dv = state.getDocValues(); this.stateConfig = Objects.requireNonNullElse(state.getFacetsConfig(), new FacetsConfig());
counts = new int[state.getSize()]; this.dv = state.getDocValues();
this.counts = new int[state.getSize()];
if (hits == null) { if (hits == null) {
// browse only // browse only
countAll(); countAll();
@ -95,17 +101,32 @@ public class SortedSetDocValuesFacetCounts extends Facets {
if (topN <= 0) { if (topN <= 0) {
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")"); throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
} }
if (stateConfig.getDimConfig(dim).hierarchical) {
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
if (pathOrd < 0) {
// path was never indexed
return null;
}
DimTree dimTree = state.getDimTree(dim);
return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
} else {
if (path.length > 0) { if (path.length > 0) {
throw new IllegalArgumentException("path should be 0 length"); throw new IllegalArgumentException(
"Field is not configured as hierarchical, path should be 0 length");
} }
OrdRange ordRange = state.getOrdRange(dim); OrdRange ordRange = state.getOrdRange(dim);
if (ordRange == null) { if (ordRange == null) {
return null; // means dimension was never indexed // means dimension was never indexed
return null;
}
return getDim(dim, null, -1, ordRange.iterator(), topN);
} }
return getDim(dim, ordRange, topN);
} }
private FacetResult getDim(String dim, OrdRange ordRange, int topN) throws IOException { private FacetResult getDim(
String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN)
throws IOException {
TopOrdAndIntQueue q = null; TopOrdAndIntQueue q = null;
@ -115,7 +136,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {
int childCount = 0; int childCount = 0;
TopOrdAndIntQueue.OrdAndValue reuse = null; TopOrdAndIntQueue.OrdAndValue reuse = null;
for (int ord = ordRange.start; ord <= ordRange.end; ord++) { while (childOrds.hasNext()) {
int ord = childOrds.next();
if (counts[ord] > 0) { if (counts[ord] > 0) {
dimCount += counts[ord]; dimCount += counts[ord];
childCount++; childCount++;
@ -145,12 +167,19 @@ public class SortedSetDocValuesFacetCounts extends Facets {
LabelAndValue[] labelValues = new LabelAndValue[q.size()]; LabelAndValue[] labelValues = new LabelAndValue[q.size()];
for (int i = labelValues.length - 1; i >= 0; i--) { for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
assert ordAndValue != null;
final BytesRef term = dv.lookupOrd(ordAndValue.ord); final BytesRef term = dv.lookupOrd(ordAndValue.ord);
String[] parts = FacetsConfig.stringToPath(term.utf8ToString()); String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
labelValues[i] = new LabelAndValue(parts[1], ordAndValue.value); labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
} }
return new FacetResult(dim, new String[0], dimCount, labelValues, childCount); if (pathOrd == -1) {
// not hierarchical facet
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
} else {
// hierarchical facet
return new FacetResult(dim, path, counts[pathOrd], labelValues, childCount);
}
} }
private void countOneSegment( private void countOneSegment(
@ -317,11 +346,20 @@ public class SortedSetDocValuesFacetCounts extends Facets {
public List<FacetResult> getAllDims(int topN) throws IOException { public List<FacetResult> getAllDims(int topN) throws IOException {
List<FacetResult> results = new ArrayList<>(); List<FacetResult> results = new ArrayList<>();
for (Map.Entry<String, OrdRange> ent : state.getPrefixToOrdRange().entrySet()) { for (String dim : state.getDims()) {
FacetResult fr = getDim(ent.getKey(), ent.getValue(), topN); if (stateConfig.getDimConfig(dim).hierarchical) {
DimTree dimTree = state.getDimTree(dim);
FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN);
if (fr != null) { if (fr != null) {
results.add(fr); results.add(fr);
} }
} else {
OrdRange ordRange = state.getOrdRange(dim);
FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN);
if (fr != null) {
results.add(fr);
}
}
} }
// Sort by highest count: // Sort by highest count:

View File

@ -19,6 +19,7 @@ package org.apache.lucene.facet.sortedset;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType; import org.apache.lucene.document.FieldType;
import org.apache.lucene.facet.FacetField; import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexOptions;
/** /**
@ -40,20 +41,39 @@ public class SortedSetDocValuesFacetField extends Field {
/** Dimension. */ /** Dimension. */
public final String dim; public final String dim;
/** Label. */ /** Path. */
public final String label; public final String[] path;
/**
* String form of path.
*
* @deprecated This field will be removed in a future version. {@link
* FacetsConfig#pathToString(String[])} can be applied to {@code path} as a replacement if
* string path is desired.
*/
@Deprecated public final String label;
/** Sole constructor. */ /** Sole constructor. */
public SortedSetDocValuesFacetField(String dim, String label) { public SortedSetDocValuesFacetField(String dim, String... path) {
super("dummy", TYPE); super("dummy", TYPE);
for (String label : path) {
FacetField.verifyLabel(label); FacetField.verifyLabel(label);
}
FacetField.verifyLabel(dim); FacetField.verifyLabel(dim);
if (path.length == 0) {
throw new IllegalArgumentException("path must have at least one element");
}
this.dim = dim; this.dim = dim;
this.label = label; this.path = path;
this.label = FacetsConfig.pathToString(path);
} }
@Override @Override
public String toString() { public String toString() {
return "SortedSetDocValuesFacetField(dim=" + dim + " label=" + label + ")"; return "SortedSetDocValuesFacetField(dim="
+ dim
+ " path="
+ FacetsConfig.pathToString(path)
+ ")";
} }
} }

View File

@ -17,10 +17,14 @@
package org.apache.lucene.facet.sortedset; package org.apache.lucene.facet.sortedset;
import java.io.IOException; import java.io.IOException;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.PrimitiveIterator;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.FixedBitSet;
/** /**
* Wraps a {@link IndexReader} and resolves ords using existing {@link SortedSetDocValues} APIs * Wraps a {@link IndexReader} and resolves ords using existing {@link SortedSetDocValues} APIs
@ -36,10 +40,7 @@ import org.apache.lucene.util.Accountable;
*/ */
public abstract class SortedSetDocValuesReaderState implements Accountable { public abstract class SortedSetDocValuesReaderState implements Accountable {
/** /** Holds start/end range of ords, which maps to one dimension. Only used for flat hierarchies. */
* Holds start/end range of ords, which maps to one dimension (someday we may generalize it to map
* to hierarchies within one dimension).
*/
public static final class OrdRange { public static final class OrdRange {
/** Start of range, inclusive: */ /** Start of range, inclusive: */
public final int start; public final int start;
@ -51,7 +52,110 @@ public abstract class SortedSetDocValuesReaderState implements Accountable {
this.start = start; this.start = start;
this.end = end; this.end = end;
} }
/** Iterates from start to end ord (inclusive) */
public PrimitiveIterator.OfInt iterator() {
return new PrimitiveIterator.OfInt() {
int current = start;
@Override
public int nextInt() {
if (current > end) {
return INVALID_ORDINAL;
} }
return current++;
}
@Override
public boolean hasNext() {
return current <= end;
}
};
}
}
/**
 * Holds children and sibling information for a single dimension. Only used with hierarchical
 * dimensions.
 *
 * <p>All positions stored here are relative to {@link #dimStartOrd}; the iterators translate
 * them back to absolute ords by adding {@code dimStartOrd}.
 */
public static final class DimTree {
  /** Bit {@code i} is set when the path at relative ord {@code i} has at least one child. */
  private final FixedBitSet hasChildren;

  // TODO: This array can take up a lot of space. Change type based on input size maybe?
  /** Relative ord of the next sibling of each path, or {@code INVALID_ORDINAL} if none. */
  private final int[] siblings;

  /** The first ord of the dimension */
  public final int dimStartOrd;

  /**
   * Creates a tree from per-ord sibling and children information.
   *
   * @param dimStartOrd first ord of the dimension
   * @param sibling relative ord of the next sibling for every relative ord
   * @param hasChildren whether the path at every relative ord has children
   * @throws IllegalArgumentException if the two lists differ in size
   */
  public DimTree(int dimStartOrd, List<Integer> sibling, List<Boolean> hasChildren) {
    if (sibling.size() != hasChildren.size()) {
      throw new IllegalArgumentException(
          "Sibling list and children list must have the same size. Got sibling list size of "
              + sibling.size()
              + " and child list size of "
              + hasChildren.size());
    }
    this.hasChildren = new FixedBitSet(hasChildren.size());
    this.siblings = new int[sibling.size()];
    for (int i = 0; i < sibling.size(); i++) {
      if (hasChildren.get(i)) {
        // the last path of a dimension cannot have children
        assert i < sibling.size() - 1;
        this.hasChildren.set(i);
      }
      this.siblings[i] = sibling.get(i);
      // Check the value that was just stored; asserting before the assignment would only ever
      // look at the zero-initialised slot.
      assert this.siblings[i] < sibling.size();
    }
    this.dimStartOrd = dimStartOrd;
  }

  /** Iterates through all first level children of dimension */
  public PrimitiveIterator.OfInt iterator() {
    return iterator(dimStartOrd);
  }

  /**
   * Iterates through all children of given pathOrd.
   *
   * <p>The first child of a path is the ord directly after it; further children are reached
   * through the sibling links. When there is nothing left, {@code nextInt()} returns {@code
   * INVALID_ORDINAL} and leaves the iterator state untouched.
   */
  public PrimitiveIterator.OfInt iterator(int pathOrd) {
    return new PrimitiveIterator.OfInt() {

      boolean atStart = true;
      int currentOrd = pathOrd - dimStartOrd;

      @Override
      public int nextInt() {
        if (hasNext() == false) {
          // Do not advance: moving on here would corrupt the cursor for later calls.
          return INVALID_ORDINAL;
        }
        if (atStart) {
          atStart = false;
          currentOrd++;
        } else {
          currentOrd = siblings[currentOrd];
        }
        return currentOrd + dimStartOrd;
      }

      @Override
      public boolean hasNext() {
        if (atStart) {
          if (currentOrd < 0 || currentOrd >= hasChildren.length()) {
            return false;
          }
          return hasChildren.get(currentOrd);
        } else {
          return siblings[currentOrd] != INVALID_ORDINAL;
        }
      }
    };
  }
}
/**
 * Invalid ordinal const. The ordinal iterators defined in this class return this value when
 * there is no ordinal to hand out.
 */
public static final int INVALID_ORDINAL = -1;
/** Sole constructor. */ /** Sole constructor. */
protected SortedSetDocValuesReaderState() {} protected SortedSetDocValuesReaderState() {}
@ -62,15 +166,28 @@ public abstract class SortedSetDocValuesReaderState implements Accountable {
/** Indexed field we are reading. */ /** Indexed field we are reading. */
public abstract String getField(); public abstract String getField();
/** Returns top-level index reader. */
public abstract IndexReader getReader();
/** Number of unique labels. */
public abstract int getSize();
/** Returns the associated facet config. */
public abstract FacetsConfig getFacetsConfig();
/* Only used for flat facets (dim/value) */
/** Returns the {@link OrdRange} for this dimension. */ /** Returns the {@link OrdRange} for this dimension. */
public abstract OrdRange getOrdRange(String dim); public abstract OrdRange getOrdRange(String dim);
/** Returns mapping from prefix to {@link OrdRange}. */ /** Returns mapping from prefix to {@link OrdRange}. */
public abstract Map<String, OrdRange> getPrefixToOrdRange(); public abstract Map<String, OrdRange> getPrefixToOrdRange();
/** Returns top-level index reader. */ /* Only used for hierarchical facets */
public abstract IndexReader getReader();
/** Number of unique labels. */ /** Returns mapping from prefix to {@link DimTree} */
public abstract int getSize(); public abstract DimTree getDimTree(String dim);
/** Returns a list of all dimensions */
public abstract Iterable<String> getDims();
} }

View File

@ -223,7 +223,7 @@ public abstract class FacetTestCase extends LuceneTestCase {
} else if (b.value.doubleValue() > a.value.doubleValue()) { } else if (b.value.doubleValue() > a.value.doubleValue()) {
return 1; return 1;
} else { } else {
return 0; return a.dim.compareTo(b.dim);
} }
} }
}); });

View File

@ -18,14 +18,19 @@ package org.apache.lucene.facet.sortedset;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField; import org.apache.lucene.document.StringField;
@ -47,6 +52,7 @@ import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NamedThreadFactory; import org.apache.lucene.util.NamedThreadFactory;
public class TestSortedSetDocValuesFacets extends FacetTestCase { public class TestSortedSetDocValuesFacets extends FacetTestCase {
@ -85,6 +91,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
try { try {
Facets facets = getAllFacets(searcher, state, exec); Facets facets = getAllFacets(searcher, state, exec);
// value should ideally be 2 but SSDV facets are bugged here
assertEquals( assertEquals(
"dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n", "dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
facets.getTopChildren(10, "a").toString()); facets.getTopChildren(10, "a").toString());
@ -105,6 +112,84 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
} }
} }
/**
 * Indexes two documents that mix flat dimensions ("a", "b") with a multi-valued hierarchical
 * dimension ("c"), then checks facet counts at several depths and drill-down on both kinds of
 * dimension.
 */
public void testBasicHierarchical() throws Exception {
  FacetsConfig config = new FacetsConfig();
  config.setMultiValued("a", true);
  config.setMultiValued("c", true);
  config.setHierarchical("c", true);
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    Document firstDoc = new Document();
    firstDoc.add(new SortedSetDocValuesFacetField("a", "foo"));
    firstDoc.add(new SortedSetDocValuesFacetField("a", "bar"));
    firstDoc.add(new SortedSetDocValuesFacetField("a", "zoo"));
    firstDoc.add(new SortedSetDocValuesFacetField("b", "baz"));
    firstDoc.add(new SortedSetDocValuesFacetField("c", "buzz"));
    firstDoc.add(new SortedSetDocValuesFacetField("c", "buzz", "bee"));
    firstDoc.add(new SortedSetDocValuesFacetField("c", "buzz", "bif"));
    firstDoc.add(new SortedSetDocValuesFacetField("c", "buzz", "bif", "baf"));
    firstDoc.add(new SortedSetDocValuesFacetField("c", "buzz", "biz"));
    firstDoc.add(new SortedSetDocValuesFacetField("c", "buzz", "biz", "bar"));
    writer.addDocument(config.build(firstDoc));

    // Randomly split the two documents across segments.
    if (random().nextBoolean()) {
      writer.commit();
    }

    Document secondDoc = new Document();
    secondDoc.add(new SortedSetDocValuesFacetField("a", "foo"));
    secondDoc.add(new SortedSetDocValuesFacetField("c", "buzz", "bif", "baf"));
    writer.addDocument(config.build(secondDoc));

    // NRT open
    try (IndexReader reader = writer.getReader()) {
      IndexSearcher searcher = newSearcher(reader);

      // Per-top-reader state:
      SortedSetDocValuesReaderState state =
          new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config);

      ExecutorService exec = randomExecutorServiceOrNull();
      try {
        Facets facets = getAllFacets(searcher, state, exec);

        // "a" is not hierarchical, so its ancestral paths are not indexed and its dimension
        // value is over-counted.
        String topA = facets.getTopChildren(10, "a").toString();
        assertEquals("dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n", topA);

        String topB = facets.getTopChildren(10, "b").toString();
        assertEquals("dim=b path=[] value=1 childCount=1\n baz (1)\n", topB);

        String topBuzz = facets.getTopChildren(10, "c", "buzz").toString();
        assertEquals(
            "dim=c path=[buzz] value=2 childCount=3\n bif (2)\n bee (1)\n biz (1)\n", topBuzz);

        String topBif = facets.getTopChildren(10, "c", "buzz", "bif").toString();
        assertEquals("dim=c path=[buzz, bif] value=2 childCount=1\n baf (2)\n", topBif);

        // DrillDown:
        DrillDownQuery flatQuery = new DrillDownQuery(config);
        flatQuery.add("a", "foo");
        flatQuery.add("b", "baz");
        assertEquals(1, searcher.search(flatQuery, 1).totalHits.value);

        DrillDownQuery innerPathQuery = new DrillDownQuery(config);
        innerPathQuery.add("c", "buzz", "bif");
        assertEquals(2, searcher.search(innerPathQuery, 2).totalHits.value);

        DrillDownQuery leafPathQuery = new DrillDownQuery(config);
        leafPathQuery.add("c", "buzz", "biz", "bar");
        assertEquals(1, searcher.search(leafPathQuery, 2).totalHits.value);
      } finally {
        if (exec != null) {
          exec.shutdownNow();
        }
      }
    }
  }
}
// See: LUCENE-10070 // See: LUCENE-10070
public void testCountAll() throws Exception { public void testCountAll() throws Exception {
try (Directory dir = newDirectory(); try (Directory dir = newDirectory();
@ -158,6 +243,67 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
} }
} }
/**
 * Counts all documents (no collector) with one flat and one hierarchical dimension after a
 * delete, using both the sequential and the concurrent facet counts implementations.
 */
public void testHierarchicalCountAll() throws Exception {
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    FacetsConfig config = new FacetsConfig();
    config.setHierarchical("b", true);

    Document deletedDoc = new Document();
    deletedDoc.add(new StringField("id", "0", Field.Store.NO));
    deletedDoc.add(new SortedSetDocValuesFacetField("a", "foo"));
    deletedDoc.add(new SortedSetDocValuesFacetField("b", "buzz", "bee"));
    writer.addDocument(config.build(deletedDoc));

    Document liveDoc = new Document();
    liveDoc.add(new StringField("id", "1", Field.Store.NO));
    liveDoc.add(new SortedSetDocValuesFacetField("a", "bar"));
    liveDoc.add(new SortedSetDocValuesFacetField("b", "buzz", "baz"));
    writer.addDocument(config.build(liveDoc));

    writer.deleteDocuments(new Term("id", "0"));

    // Only the surviving document may contribute to the counts.
    final String expectedA = "dim=a path=[] value=1 childCount=1\n bar (1)\n";
    final String expectedBuzz = "dim=b path=[buzz] value=1 childCount=1\n baz (1)\n";

    // NRT open
    try (IndexReader reader = writer.getReader()) {
      IndexSearcher searcher = newSearcher(reader);

      // Per-top-reader state:
      SortedSetDocValuesReaderState state =
          new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config);

      Facets sequential = new SortedSetDocValuesFacetCounts(state);
      assertEquals(expectedA, sequential.getTopChildren(10, "a").toString());
      assertEquals(expectedBuzz, sequential.getTopChildren(10, "b", "buzz").toString());

      ExecutorService exec =
          new ThreadPoolExecutor(
              1,
              TestUtil.nextInt(random(), 2, 6),
              Long.MAX_VALUE,
              TimeUnit.MILLISECONDS,
              new LinkedBlockingQueue<Runnable>(),
              new NamedThreadFactory("TestIndexSearcher"));
      try {
        Facets concurrent = new ConcurrentSortedSetDocValuesFacetCounts(state, exec);
        assertEquals(expectedA, concurrent.getTopChildren(10, "a").toString());
        assertEquals(expectedBuzz, concurrent.getTopChildren(10, "b", "buzz").toString());
      } finally {
        exec.shutdownNow();
      }
    }
  }
}
public void testBasicSingleValued() throws Exception { public void testBasicSingleValued() throws Exception {
FacetsConfig config = new FacetsConfig(); FacetsConfig config = new FacetsConfig();
config.setMultiValued("a", false); config.setMultiValued("a", false);
@ -210,6 +356,57 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
} }
} }
/**
 * Hierarchical dimension "c" with one path per document: checks the children of "c/buzz" and
 * drill-down on an inner path and on a full path.
 */
public void testHierarchicalBasicSingleValues() throws Exception {
  FacetsConfig config = new FacetsConfig();
  config.setHierarchical("c", true);
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    Document buzzBar = new Document();
    buzzBar.add(new SortedSetDocValuesFacetField("c", "buzz", "bar"));
    writer.addDocument(config.build(buzzBar));

    Document buzzBaz = new Document();
    buzzBaz.add(new SortedSetDocValuesFacetField("c", "buzz", "baz"));
    writer.addDocument(config.build(buzzBaz));

    if (random().nextBoolean()) {
      writer.commit();
    }

    Document baz = new Document();
    baz.add(new SortedSetDocValuesFacetField("c", "baz"));
    writer.addDocument(config.build(baz));

    // NRT open
    try (IndexReader reader = writer.getReader()) {
      IndexSearcher searcher = newSearcher(reader);

      // Per-top-reader state:
      SortedSetDocValuesReaderState state =
          new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config);

      ExecutorService exec = randomExecutorServiceOrNull();
      try {
        Facets facets = getAllFacets(searcher, state, exec);

        String buzzChildren = facets.getTopChildren(10, "c", "buzz").toString();
        assertEquals(
            "dim=c path=[buzz] value=2 childCount=2\n bar (1)\n baz (1)\n", buzzChildren);

        DrillDownQuery innerPathQuery = new DrillDownQuery(config);
        innerPathQuery.add("c", "buzz");
        assertEquals(2, searcher.search(innerPathQuery, 1).totalHits.value);

        DrillDownQuery fullPathQuery = new DrillDownQuery(config);
        fullPathQuery.add("c", "buzz", "bar");
        assertEquals(1, searcher.search(fullPathQuery, 1).totalHits.value);
      } finally {
        if (exec != null) {
          exec.shutdownNow();
        }
      }
    }
  }
}
public void testDrillDownOptions() throws Exception { public void testDrillDownOptions() throws Exception {
FacetsConfig config = new FacetsConfig(); FacetsConfig config = new FacetsConfig();
config.setDrillDownTermsIndexing("c", FacetsConfig.DrillDownTermsIndexing.NONE); config.setDrillDownTermsIndexing("c", FacetsConfig.DrillDownTermsIndexing.NONE);
@ -293,6 +490,144 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
} }
} }
/**
 * Verifies which drill-down terms are searchable for a hierarchical path under each {@link
 * FacetsConfig.DrillDownTermsIndexing} option. Every dimension "c" to "g" indexes the same path
 * {@code biz/baz} in a single document but uses a different indexing option.
 */
public void testHierarchicalDrillDownOptions() throws Exception {
  FacetsConfig config = new FacetsConfig();
  config.setDrillDownTermsIndexing("c", FacetsConfig.DrillDownTermsIndexing.NONE);
  config.setDrillDownTermsIndexing(
      "d", FacetsConfig.DrillDownTermsIndexing.DIMENSION_AND_FULL_PATH);
  config.setDrillDownTermsIndexing("e", FacetsConfig.DrillDownTermsIndexing.ALL_PATHS_NO_DIM);
  config.setDrillDownTermsIndexing("f", FacetsConfig.DrillDownTermsIndexing.FULL_PATH_ONLY);
  config.setDrillDownTermsIndexing("g", FacetsConfig.DrillDownTermsIndexing.ALL);
  config.setHierarchical("c", true);
  config.setHierarchical("d", true);
  config.setHierarchical("e", true);
  config.setHierarchical("f", true);
  config.setHierarchical("g", true);
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {

    Document doc = new Document();
    doc.add(new SortedSetDocValuesFacetField("c", "biz", "baz"));
    doc.add(new SortedSetDocValuesFacetField("d", "biz", "baz"));
    doc.add(new SortedSetDocValuesFacetField("e", "biz", "baz"));
    doc.add(new SortedSetDocValuesFacetField("f", "biz", "baz"));
    doc.add(new SortedSetDocValuesFacetField("g", "biz", "baz"));
    writer.addDocument(config.build(doc));
    if (random().nextBoolean()) {
      writer.commit();
    }

    doc = new Document();
    doc.add(new SortedSetDocValuesFacetField("a", "foo"));
    writer.addDocument(config.build(doc));

    // NRT open
    try (IndexReader r = writer.getReader()) {
      IndexSearcher searcher = newSearcher(r);

      // Drill down with different indexing configuration options

      // "c": NONE
      assertHierarchicalDrillDownHits(searcher, config, 0, "c");
      assertHierarchicalDrillDownHits(searcher, config, 0, "c", "biz");
      assertHierarchicalDrillDownHits(searcher, config, 0, "c", "biz", "baz");
      assertHierarchicalDrillDownHits(searcher, config, 0, "c", "foo");

      // "d": DIMENSION_AND_FULL_PATH
      assertHierarchicalDrillDownHits(searcher, config, 1, "d");
      assertHierarchicalDrillDownHits(searcher, config, 0, "d", "foo");
      assertHierarchicalDrillDownHits(searcher, config, 0, "d", "biz");
      assertHierarchicalDrillDownHits(searcher, config, 1, "d", "biz", "baz");

      // "e": ALL_PATHS_NO_DIM
      assertHierarchicalDrillDownHits(searcher, config, 0, "e");
      assertHierarchicalDrillDownHits(searcher, config, 0, "e", "foo");
      assertHierarchicalDrillDownHits(searcher, config, 1, "e", "biz");
      assertHierarchicalDrillDownHits(searcher, config, 1, "e", "biz", "baz");

      // "f": FULL_PATH_ONLY
      assertHierarchicalDrillDownHits(searcher, config, 0, "f");
      assertHierarchicalDrillDownHits(searcher, config, 0, "f", "foo");
      assertHierarchicalDrillDownHits(searcher, config, 0, "f", "biz");
      assertHierarchicalDrillDownHits(searcher, config, 1, "f", "biz", "baz");

      // "g": ALL
      assertHierarchicalDrillDownHits(searcher, config, 1, "g");
      assertHierarchicalDrillDownHits(searcher, config, 0, "g", "foo");
      assertHierarchicalDrillDownHits(searcher, config, 1, "g", "biz");
      assertHierarchicalDrillDownHits(searcher, config, 1, "g", "biz", "baz");
    }
  }
}

/**
 * Runs a drill-down query on {@code dim}/{@code path} and asserts the total hit count, naming
 * the queried dimension and path in the failure message.
 */
private static void assertHierarchicalDrillDownHits(
    IndexSearcher searcher, FacetsConfig config, long expectedHits, String dim, String... path)
    throws IOException {
  DrillDownQuery q = new DrillDownQuery(config);
  q.add(dim, path);
  TopDocs hits = searcher.search(q, 1);
  assertEquals(
      "dim=" + dim + " path=" + Arrays.toString(path), expectedHits, hits.totalHits.value);
}
// LUCENE-5090 // LUCENE-5090
@SuppressWarnings("unused") @SuppressWarnings("unused")
public void testStaleState() throws Exception { public void testStaleState() throws Exception {
@ -405,6 +740,72 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
} }
} }
/**
 * Three documents spread over two hierarchical dimensions ("d", "e"): checks the result of
 * {@code getAllDims} and the RAM accounting reported by the reader state.
 */
public void testHierarchicalSparseFacets() throws Exception {
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    FacetsConfig config = new FacetsConfig();
    config.setHierarchical("d", true);
    config.setHierarchical("e", true);

    Document fooBar = new Document();
    fooBar.add(new SortedSetDocValuesFacetField("d", "foo", "bar"));
    writer.addDocument(config.build(fooBar));
    if (random().nextBoolean()) {
      writer.commit();
    }

    Document fooBaz = new Document();
    fooBaz.add(new SortedSetDocValuesFacetField("d", "foo", "baz"));
    writer.addDocument(config.build(fooBaz));
    if (random().nextBoolean()) {
      writer.commit();
    }

    Document bizBaz = new Document();
    bizBaz.add(new SortedSetDocValuesFacetField("e", "biz", "baz"));
    writer.addDocument(config.build(bizBaz));

    // NRT open
    try (IndexReader reader = writer.getReader()) {
      IndexSearcher searcher = newSearcher(reader);

      // Per-top-reader state:
      SortedSetDocValuesReaderState state =
          new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config);

      ExecutorService exec = randomExecutorServiceOrNull();
      try {
        Facets facets = getAllFacets(searcher, state, exec);

        // Ask for top 10 labels for any dims that have counts:
        List<FacetResult> allDims = facets.getAllDims(10);
        assertEquals(2, allDims.size());

        String dimD = allDims.get(0).toString();
        assertEquals("dim=d path=[] value=2 childCount=1\n foo (2)\n", dimD);

        String dimE = allDims.get(1).toString();
        assertEquals("dim=e path=[] value=1 childCount=1\n biz (1)\n", dimE);

        Collection<Accountable> resources = state.getChildResources();
        assertTrue(state.toString().contains(FacetsConfig.DEFAULT_INDEX_FIELD_NAME));

        // The test expects RAM usage to be reported only for a reader with several leaves.
        boolean singleLeaf = searcher.getIndexReader().leaves().size() <= 1;
        if (singleLeaf) {
          assertEquals(0, state.ramBytesUsed());
          assertTrue(resources.isEmpty());
        } else {
          assertTrue(state.ramBytesUsed() > 0);
          assertFalse(resources.isEmpty());
          assertTrue(resources.toString().contains(FacetsConfig.DEFAULT_INDEX_FIELD_NAME));
        }
      } finally {
        if (exec != null) {
          exec.shutdownNow();
        }
      }
    }
  }
}
public void testSomeSegmentsMissing() throws Exception { public void testSomeSegmentsMissing() throws Exception {
try (Directory dir = newDirectory(); try (Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
@ -448,6 +849,58 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
} }
} }
/**
 * Commits three times so that the second commit holds only a document without any facet
 * fields, then checks counts for a flat dimension ("a") and a hierarchical dimension ("b").
 */
public void testHierarchicalSomeSegmentsMissing() throws Exception {
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    FacetsConfig config = new FacetsConfig();
    config.setHierarchical("b", true);

    Document withFacets = new Document();
    withFacets.add(new SortedSetDocValuesFacetField("a", "foo1"));
    withFacets.add(new SortedSetDocValuesFacetField("b", "foo", "bar"));
    writer.addDocument(config.build(withFacets));
    writer.commit();

    // A document that carries no facet fields at all.
    Document withoutFacets = new Document();
    writer.addDocument(config.build(withoutFacets));
    writer.commit();

    Document moreFacets = new Document();
    moreFacets.add(new SortedSetDocValuesFacetField("a", "foo2"));
    moreFacets.add(new SortedSetDocValuesFacetField("b", "foo", "buzz"));
    writer.addDocument(config.build(moreFacets));
    writer.commit();

    // NRT open
    try (IndexReader reader = writer.getReader()) {
      IndexSearcher searcher = newSearcher(reader);

      // Per-top-reader state:
      SortedSetDocValuesReaderState state =
          new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config);

      ExecutorService exec = randomExecutorServiceOrNull();
      try {
        Facets facets = getAllFacets(searcher, state, exec);

        // Ask for top 10 labels for any dims that have counts:
        String topA = facets.getTopChildren(10, "a").toString();
        assertEquals("dim=a path=[] value=2 childCount=2\n foo1 (1)\n foo2 (1)\n", topA);

        String topB = facets.getTopChildren(10, "b").toString();
        assertEquals("dim=b path=[] value=2 childCount=1\n foo (2)\n", topB);

        String topBFoo = facets.getTopChildren(10, "b", "foo").toString();
        assertEquals("dim=b path=[foo] value=2 childCount=2\n bar (1)\n buzz (1)\n", topBFoo);
      } finally {
        if (exec != null) {
          exec.shutdownNow();
        }
      }
    }
  }
}
public void testRandom() throws Exception { public void testRandom() throws Exception {
int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3; int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3;
for (int fullIter = 0; fullIter < fullIterations; fullIter++) { for (int fullIter = 0; fullIter < fullIterations; fullIter++) {
@ -562,6 +1015,274 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
} }
} }
/**
 * Randomized test mixing flat and hierarchical dimensions. For each iteration it indexes random
 * documents, runs random term queries, builds the expected facet results with a slow in-memory
 * computation, and compares them against {@code getAllDims} and, for hierarchical dimensions, a
 * depth-first walk over {@code getTopChildren}.
 */
public void testRandomHierarchicalFlatMix() throws Exception {
  int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3;
  for (int fullIter = 0; fullIter < fullIterations; fullIter++) {
    String[] tokens = getRandomTokens(10);

    try (Directory indexDir = newDirectory();
        RandomIndexWriter w = new RandomIndexWriter(random(), indexDir)) {
      FacetsConfig config = new FacetsConfig();
      int numDocs = atLeast(1000);
      // Most of the time allow up to 7 dims per doc, but occasionally limit all docs to a single
      // dim:
      int numDims;
      if (random().nextInt(10) < 8) {
        numDims = TestUtil.nextInt(random(), 1, 7);
      } else {
        numDims = 1;
      }

      // Randomly decide, per dimension, whether it is hierarchical or flat.
      boolean[] hierarchicalDims = new boolean[numDims];
      for (int i = 0; i < numDims; i++) {
        boolean isHierarchicalDim = random().nextBoolean();
        config.setHierarchical("dim" + i, isHierarchicalDim);
        hierarchicalDims[i] = isHierarchicalDim;
      }
      List<TestDoc> testDocs = getRandomDocs(tokens, numDocs, numDims);
      // For every indexed doc, the set of facet paths (including every ancestor prefix of a
      // hierarchical path) that the doc is expected to contribute to.
      List<Set<SortedSetDocValuesFacetField>> testDocFacets = new ArrayList<>();
      for (TestDoc testDoc : testDocs) {
        Document doc = new Document();
        Set<SortedSetDocValuesFacetField> docFacets = new HashSet<>();
        doc.add(newStringField("content", testDoc.content, Field.Store.NO));
        for (int i = 0; i < numDims; i++) {
          if (hierarchicalDims[i]) {
            int pathLength;
            if (numDims == 1) {
              pathLength = 1;
            } else {
              pathLength = random().nextInt(numDims - 1) + 1;
            }
            List<String> path = new ArrayList<>();
            for (int j = 0; j < pathLength; j++) {
              if (testDoc.dims[j] != null) {
                path.add(testDoc.dims[j]);
              }
            }
            doc.add(new SortedSetDocValuesFacetField("dim" + i, path.toArray(String[]::new)));
            for (int j = 0; j < path.size(); j++) {
              docFacets.add(
                  new SortedSetDocValuesFacetField(
                      "dim" + i, path.subList(0, j + 1).toArray(String[]::new)));
            }
          } else if (testDoc.dims[i] != null) {
            doc.add(new SortedSetDocValuesFacetField("dim" + i, testDoc.dims[i]));
            docFacets.add(new SortedSetDocValuesFacetField("dim" + i, testDoc.dims[i]));
          }
        }
        testDocFacets.add(docFacets);
        w.addDocument(config.build(doc));
      }

      // NRT open
      try (IndexReader r = w.getReader()) {
        IndexSearcher searcher = newSearcher(r);

        // Per-top-reader state:
        SortedSetDocValuesReaderState state =
            new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config);
        ExecutorService exec = randomExecutorServiceOrNull();
        try {
          int iters = atLeast(100);
          for (int iter = 0; iter < iters; iter++) {
            String searchToken = tokens[random().nextInt(tokens.length)];
            if (VERBOSE) {
              System.out.println("\nTEST: iter content=" + searchToken);
            }
            FacetsCollector fc = new FacetsCollector();
            FacetsCollector.search(
                searcher, new TermQuery(new Term("content", searchToken)), 10, fc);
            Facets facets;
            if (exec != null) {
              facets = new ConcurrentSortedSetDocValuesFacetCounts(state, fc, exec);
            } else {
              facets = new SortedSetDocValuesFacetCounts(state, fc);
            }

            // Slow, yet hopefully bug-free, faceting:
            // First pass: keyed by the parent path, collect every child label with its count.
            Map<String, FacetResult> expectedResults = new HashMap<>();

            for (int i = 0; i < testDocs.size(); i++) {
              TestDoc doc = testDocs.get(i);
              if (doc.content.equals(searchToken)) {
                // goes through all facets paths in the doc
                for (SortedSetDocValuesFacetField facetField : testDocFacets.get(i)) {
                  String[] path = facetField.path;
                  String parentDimPathString;
                  if (path.length == 1) {
                    parentDimPathString = facetField.dim;
                  } else {
                    parentDimPathString =
                        facetField.dim
                            + FacetsConfig.DELIM_CHAR
                            + FacetsConfig.pathToString(path, path.length - 1);
                  }
                  FacetResult result = expectedResults.get(parentDimPathString);
                  if (result == null) {
                    String[] resultPath = new String[path.length - 1];
                    System.arraycopy(path, 0, resultPath, 0, resultPath.length);
                    result =
                        new FacetResult(facetField.dim, resultPath, 0, new LabelAndValue[0], 0);
                  }
                  String child = path[path.length - 1];
                  LabelAndValue[] labelAndValues = result.labelValues;
                  boolean containsChild = false;
                  for (int k = 0; k < labelAndValues.length; k++) {
                    if (labelAndValues[k].label.equals(child)) {
                      containsChild = true;
                      labelAndValues[k] =
                          new LabelAndValue(
                              labelAndValues[k].label, labelAndValues[k].value.intValue() + 1);
                      break;
                    }
                  }
                  LabelAndValue[] newLabelAndValues;
                  int childCount = result.childCount;
                  if (containsChild == false) {
                    newLabelAndValues = new LabelAndValue[labelAndValues.length + 1];
                    System.arraycopy(
                        labelAndValues, 0, newLabelAndValues, 0, labelAndValues.length);
                    newLabelAndValues[newLabelAndValues.length - 1] = new LabelAndValue(child, 1);
                    childCount++;
                  } else {
                    newLabelAndValues = labelAndValues;
                  }
                  // Order children by descending count, ties broken by the label's BytesRef
                  // order.
                  newLabelAndValues =
                      Arrays.stream(newLabelAndValues)
                          .sorted(
                              (o1, o2) -> {
                                if (o1.value.equals(o2.value)) {
                                  return new BytesRef(o1.label).compareTo(new BytesRef(o2.label));
                                } else {
                                  // Integer.compare rather than subtraction: cannot overflow.
                                  return Integer.compare(
                                      o2.value.intValue(), o1.value.intValue());
                                }
                              })
                          .collect(Collectors.toList())
                          .toArray(LabelAndValue[]::new);

                  FacetResult newResult =
                      new FacetResult(result.dim, result.path, 0, newLabelAndValues, childCount);
                  expectedResults.put(parentDimPathString, newResult);
                }
              }
            }

            // second pass to update values
            for (int i = 0; i < testDocs.size(); i++) {
              TestDoc doc = testDocs.get(i);
              if (doc.content.equals(searchToken)) {
                // Each doc may bump a dimension's top-level value at most once.
                Set<String> dimsCounted = new HashSet<>();
                for (SortedSetDocValuesFacetField facetField : testDocFacets.get(i)) {
                  String dimPathString =
                      FacetsConfig.pathToString(facetField.dim, facetField.path);
                  FacetResult result = expectedResults.get(dimPathString);
                  FacetResult dimResult = expectedResults.get(facetField.dim);
                  if (result != null) {
                    expectedResults.put(
                        dimPathString,
                        new FacetResult(
                            result.dim,
                            result.path,
                            result.value.intValue() + 1,
                            result.labelValues,
                            result.childCount));
                  }
                  if (dimResult != null && dimsCounted.add(facetField.dim)) {
                    expectedResults.put(
                        facetField.dim,
                        new FacetResult(
                            dimResult.dim,
                            dimResult.path,
                            dimResult.value.intValue() + 1,
                            dimResult.labelValues,
                            dimResult.childCount));
                  }
                }
              }
            }

            List<FacetResult> expected = new ArrayList<>(expectedResults.values());

            List<FacetResult> expectedAllDims = new ArrayList<>();
            for (FacetResult result : expected) {
              if (result.path.length == 0) {
                expectedAllDims.add(result);
                if (expectedAllDims.size() >= 10) {
                  break;
                }
              }
            }
            sortFacetResults(expectedAllDims);

            List<FacetResult> actualAllDims = facets.getAllDims(10);

            assertEquals(expectedAllDims, actualAllDims);

            // Dfs through top children
            for (FacetResult dimResult : actualAllDims) {
              if (config.getDimConfig(dimResult.dim).hierarchical) {
                Stack<String[]> stack = new Stack<>();
                for (LabelAndValue labelAndValue : dimResult.labelValues) {
                  String[] path = new String[1];
                  path[0] = labelAndValue.label;
                  stack.add(path);
                }
                while (stack.empty() == false) {
                  String[] currPath = stack.pop();
                  FacetResult expectedResult =
                      getFacetResultForPath(expected, dimResult.dim, currPath);
                  FacetResult actualResult = facets.getTopChildren(10, dimResult.dim, currPath);
                  // Carry the failing iteration and path in the assertion message instead of
                  // printing to stdout.
                  assertEquals(
                      "iter=" + iter + " dim=" + dimResult.dim + " path=" + Arrays.toString(currPath),
                      expectedResult,
                      actualResult);
                  if (actualResult != null) {
                    for (LabelAndValue labelAndValue : actualResult.labelValues) {
                      String[] path = new String[currPath.length + 1];
                      System.arraycopy(currPath, 0, path, 0, currPath.length);
                      path[path.length - 1] = labelAndValue.label;
                      stack.add(path);
                    }
                  }
                }
              }
            }
          }
        } finally {
          if (exec != null) exec.shutdownNow();
        }
      }
    }
  }
}
/**
 * Returns the first result in {@code allPaths} whose path equals {@code path} element by
 * element and whose dimension equals {@code dim}, or {@code null} if there is none. An empty
 * {@code path} matches the dimension-level result.
 */
private static FacetResult getFacetResultForPath(
    List<FacetResult> allPaths, String dim, String[] path) {
  for (FacetResult candidate : allPaths) {
    // Arrays.equals compares lengths first and then each component, which also covers the
    // empty-path (dimension-level) lookup.
    if (Arrays.equals(path, candidate.path) && candidate.dim.equals(dim)) {
      return candidate;
    }
  }
  return null;
}
public void testNonExistentDimension() throws Exception { public void testNonExistentDimension() throws Exception {
try (Directory dir = newDirectory(); try (Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
@ -592,6 +1313,75 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
} }
} }
/**
 * Asking for the top children of an unknown dimension returns {@code null}, while asking for
 * an unknown dimension together with a path throws {@link IllegalArgumentException}.
 */
public void testHierarchicalNonExistentDimension() throws Exception {
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    FacetsConfig config = new FacetsConfig();
    config.setHierarchical("fizz", true);

    Document indexedDoc = new Document();
    indexedDoc.add(new SortedSetDocValuesFacetField("foo", "bar"));
    indexedDoc.add(new SortedSetDocValuesFacetField("fizz", "buzz", "baz"));
    writer.addDocument(config.build(indexedDoc));
    writer.commit();

    try (IndexReader reader = writer.getReader()) {
      IndexSearcher searcher = newSearcher(reader);
      SortedSetDocValuesReaderState state =
          new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config);

      ExecutorService exec = randomExecutorServiceOrNull();
      try {
        Facets facets = getAllFacets(searcher, state, exec);

        // make sure the result is null (and no exception was thrown)
        assertNull(facets.getTopChildren(5, "non-existent dimension"));

        expectThrows(
            IllegalArgumentException.class,
            () -> facets.getTopChildren(5, "non-existent dimension", "with a path"));
      } finally {
        if (exec != null) {
          exec.shutdownNow();
        }
      }
    }
  }
}
/**
 * Asking for the top children of a path that was never indexed under an existing hierarchical
 * dimension returns {@code null}.
 */
public void testHierarchicalNonExistentPath() throws Exception {
  try (Directory dir = newDirectory();
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
    FacetsConfig config = new FacetsConfig();
    config.setHierarchical("fizz", true);

    Document indexedDoc = new Document();
    indexedDoc.add(new SortedSetDocValuesFacetField("fizz", "buzz", "baz"));
    writer.addDocument(config.build(indexedDoc));
    writer.commit();

    try (IndexReader reader = writer.getReader()) {
      IndexSearcher searcher = newSearcher(reader);
      SortedSetDocValuesReaderState state =
          new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader(), config);

      ExecutorService exec = randomExecutorServiceOrNull();
      try {
        Facets facets = getAllFacets(searcher, state, exec);

        // make sure the result is null (and no exception was thrown)
        assertNull(facets.getTopChildren(5, "fizz", "fake", "path"));
      } finally {
        if (exec != null) {
          exec.shutdownNow();
        }
      }
    }
  }
}
private static Facets getAllFacets( private static Facets getAllFacets(
IndexSearcher searcher, SortedSetDocValuesReaderState state, ExecutorService exec) IndexSearcher searcher, SortedSetDocValuesReaderState state, ExecutorService exec)
throws IOException, InterruptedException { throws IOException, InterruptedException {

View File

@ -237,11 +237,6 @@ public class TestFacetLabel extends FacetTestCase {
() -> { () -> {
new SortedSetDocValuesFacetField("", "abc"); new SortedSetDocValuesFacetField("", "abc");
}); });
expectThrows(
IllegalArgumentException.class,
() -> {
new SortedSetDocValuesFacetField("dim", null);
});
expectThrows( expectThrows(
IllegalArgumentException.class, IllegalArgumentException.class,
() -> { () -> {