LUCENE-9952: Fix dim count inaccuracies in SSDV faceting when a dim is multi-valued (#611)

This commit is contained in:
Greg Miller 2022-01-24 06:48:20 -08:00 committed by GitHub
parent 10ca531ddc
commit 9e560c1af1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 121 additions and 34 deletions

View File

@ -64,7 +64,7 @@ API Changes
* LUCENE-10349: WordListLoader methods now return unmodifiable CharArraySets. (Uwe Schindler)
* LUCENE-10377: SortField.getComparator() has changed signature. The second parameter is now
a boolean indicating whether or not skipping should be enabled on the comparator.
a boolean indicating whether or not skipping should be enabled on the comparator.
(Alan Woodward)
* LUCENE-10381: Require users to provide FacetsConfig for SSDV faceting. (Greg Miller)
@ -193,12 +193,15 @@ Bug Fixes
* LUCENE-10352: Fixed ctor argument checks: JapaneseKatakanaStemFilter,
DoubleMetaphoneFilter (Uwe Schindler, Robert Muir)
* LLUCENE-10353: Add random null injection to TestRandomChains. (Robert Muir,
* LUCENE-10353: Add random null injection to TestRandomChains. (Robert Muir,
Uwe Schindler)
* LUCENE-10377: CheckIndex could incorrectly throw an error when checking index sorts
defined on older indexes. (Alan Woodward)
* LUCENE-9952: Address inaccurate dim counts for SSDV faceting in cases where a dim is configured
as multi-valued. (Greg Miller)
Other
---------------------

View File

@ -479,10 +479,14 @@ public class FacetsConfig {
for (SortedSetDocValuesFacetField facetField : ent.getValue()) {
FacetLabel facetLabel = new FacetLabel(facetField.dim, facetField.path);
DimConfig dimConfig = getDimConfig(facetField.dim);
// For facet counts:
if (dimConfig.hierarchical) {
// Index each member of the path explicitly. This is required for hierarchical counting
// to work properly since we need to ensure every unique path has a corresponding ordinal
// in the SSDV field:
for (int i = 0; i < facetLabel.length; i++) {
String fullPath = pathToString(facetLabel.components, i + 1);
// For facet counts:
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath)));
}
} else {
@ -494,10 +498,15 @@ public class FacetsConfig {
+ facetField.path.length
+ " components");
}
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If non-hierarchical but multi-valued and requiring dim count, make sure to
// explicitly index the dimension so we can get accurate counts for it:
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(facetField.dim)));
}
String fullPath = pathToString(facetLabel.components, facetLabel.length);
// For facet counts:
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath)));
}
// For drill-down:
indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel);
}

View File

@ -102,14 +102,16 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
}
if (stateConfig.getDimConfig(dim).hierarchical) {
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
if (pathOrd < 0) {
// path was never indexed
return null;
}
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
} else {
if (path.length > 0) {
throw new IllegalArgumentException(
@ -120,12 +122,25 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
// means dimension was never indexed
return null;
}
return getDim(dim, null, -1, ordRange.iterator(), topN);
int dimOrd = ordRange.start;
PrimitiveIterator.OfInt childIt = ordRange.iterator();
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
// the dimension and we need to skip past it so the iterator is positioned on the first
// child:
childIt.next();
}
return getPathResult(dimConfig, dim, null, -dimOrd, childIt, topN);
}
}
private FacetResult getDim(
String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN)
private FacetResult getPathResult(
FacetsConfig.DimConfig dimConfig,
String dim,
String[] path,
int pathOrd,
PrimitiveIterator.OfInt childOrds,
int topN)
throws IOException {
TopOrdAndIntQueue q = null;
@ -174,11 +189,17 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
}
if (pathOrd == -1) {
// not hierarchical facet
if (dimConfig.hierarchical == false) {
// see if dimCount is actually reliable or needs to be reset
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
dimCount = counts.get(pathOrd);
} else {
dimCount = -1; // dimCount is in accurate at this point, so set it to -1
}
}
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
} else {
// hierarchical facet
return new FacetResult(dim, path, counts.get(pathOrd), labelValues, childCount);
}
}
@ -393,15 +414,25 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
List<FacetResult> results = new ArrayList<>();
for (String dim : state.getDims()) {
if (stateConfig.getDimConfig(dim).hierarchical) {
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN);
int dimOrd = dimTree.dimStartOrd;
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, dimTree.iterator(), topN);
if (fr != null) {
results.add(fr);
}
} else {
OrdRange ordRange = state.getOrdRange(dim);
FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN);
int dimOrd = ordRange.start;
PrimitiveIterator.OfInt childIt = ordRange.iterator();
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
// the dimension and we need to skip past it so the iterator is positioned on the first
// child:
childIt.next();
}
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, childIt, topN);
if (fr != null) {
results.add(fr);
}

View File

@ -189,7 +189,9 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead
BytesRef nextTerm = dv.lookupOrd(dimEndOrd);
String[] nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString());
if (nextComponents.length != 2) {
// The first entry should always be length 1 or 2 (either just the dim itself if we explicitly
// indexed it, or the first child):
if (nextComponents.length > 2) {
throw new IllegalArgumentException(
"dimension not configured to handle hierarchical field; got: "
+ Arrays.toString(nextComponents)
@ -212,6 +214,7 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead
break;
}
// Each entry should have a length of exactly 2 since the dim is non-hierarchical:
if (nextComponents.length != 2) {
throw new IllegalArgumentException(
"dimension not configured to handle hierarchical field; got: "

View File

@ -60,7 +60,11 @@ import org.apache.lucene.util.LongValues;
* <p><b>NOTE</b>: this class should be instantiated and then used from a single thread, because it
* holds a thread-private instance of {@link SortedSetDocValues}.
*
* <p><b>NOTE:</b>: tie-break is by unicode sort order
* <p><b>NOTE</b>: tie-break is by unicode sort order
*
* <p><b>NOTE</b>: if you have multi-valued dims that require dim counts (see {@link FacetsConfig},
* make sure to provide your {@code FacetsConfig} instance when instantiating {@link
* SortedSetDocValuesReaderState}, or else dim counts can be inaccurate
*
* @lucene.experimental
*/
@ -101,14 +105,16 @@ public class SortedSetDocValuesFacetCounts extends Facets {
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
}
if (stateConfig.getDimConfig(dim).hierarchical) {
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
if (pathOrd < 0) {
// path was never indexed
return null;
}
DimTree dimTree = state.getDimTree(dim);
return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
} else {
if (path.length > 0) {
throw new IllegalArgumentException(
@ -119,12 +125,25 @@ public class SortedSetDocValuesFacetCounts extends Facets {
// means dimension was never indexed
return null;
}
return getDim(dim, null, -1, ordRange.iterator(), topN);
int dimOrd = ordRange.start;
PrimitiveIterator.OfInt childIt = ordRange.iterator();
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
// the dimension and we need to skip past it so the iterator is positioned on the first
// child:
childIt.next();
}
return getPathResult(dimConfig, dim, null, dimOrd, childIt, topN);
}
}
private FacetResult getDim(
String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN)
private FacetResult getPathResult(
FacetsConfig.DimConfig dimConfig,
String dim,
String[] path,
int pathOrd,
PrimitiveIterator.OfInt childOrds,
int topN)
throws IOException {
TopOrdAndIntQueue q = null;
@ -172,11 +191,17 @@ public class SortedSetDocValuesFacetCounts extends Facets {
labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
}
if (pathOrd == -1) {
// not hierarchical facet
if (dimConfig.hierarchical == false) {
// see if dimCount is actually reliable or needs to be reset
if (dimConfig.multiValued) {
if (dimConfig.requireDimCount) {
dimCount = counts[pathOrd];
} else {
dimCount = -1; // dimCount is in accurate at this point, so set it to -1
}
}
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
} else {
// hierarchical facet
return new FacetResult(dim, path, counts[pathOrd], labelValues, childCount);
}
}
@ -346,15 +371,25 @@ public class SortedSetDocValuesFacetCounts extends Facets {
List<FacetResult> results = new ArrayList<>();
for (String dim : state.getDims()) {
if (stateConfig.getDimConfig(dim).hierarchical) {
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
if (dimConfig.hierarchical) {
DimTree dimTree = state.getDimTree(dim);
FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN);
int dimOrd = dimTree.dimStartOrd;
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, dimTree.iterator(), topN);
if (fr != null) {
results.add(fr);
}
} else {
OrdRange ordRange = state.getOrdRange(dim);
FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN);
int dimOrd = ordRange.start;
PrimitiveIterator.OfInt childIt = ordRange.iterator();
if (dimConfig.multiValued && dimConfig.requireDimCount) {
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
// the dimension and we need to skip past it so the iterator is positioned on the first
// child:
childIt.next();
}
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, childIt, topN);
if (fr != null) {
results.add(fr);
}

View File

@ -63,6 +63,8 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
public void testBasic() throws Exception {
FacetsConfig config = new FacetsConfig();
config.setMultiValued("a", true);
config.setMultiValued("b", true);
config.setRequireDimCount("b", true);
try (Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
Document doc = new Document();
@ -70,6 +72,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
doc.add(new SortedSetDocValuesFacetField("a", "bar"));
doc.add(new SortedSetDocValuesFacetField("a", "zoo"));
doc.add(new SortedSetDocValuesFacetField("b", "baz"));
doc.add(new SortedSetDocValuesFacetField("b", "buzz"));
writer.addDocument(config.build(doc));
if (random().nextBoolean()) {
writer.commit();
@ -77,6 +80,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
doc = new Document();
doc.add(new SortedSetDocValuesFacetField("a", "foo"));
doc.add(new SortedSetDocValuesFacetField("b", "buzz"));
writer.addDocument(config.build(doc));
// NRT open
@ -91,12 +95,13 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
try {
Facets facets = getAllFacets(searcher, state, exec);
// value should ideally be 2 but SSDV facets are bugged here
// value for dim a should be -1 since it's multivalued but doesn't require dim counts:
assertEquals(
"dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
"dim=a path=[] value=-1 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
facets.getTopChildren(10, "a").toString());
// value for dim b should be 2 since it's multivalued but _does_ require dim counts:
assertEquals(
"dim=b path=[] value=1 childCount=1\n baz (1)\n",
"dim=b path=[] value=2 childCount=2\n buzz (2)\n baz (1)\n",
facets.getTopChildren(10, "b").toString());
// DrillDown:
@ -115,6 +120,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
public void testBasicHierarchical() throws Exception {
FacetsConfig config = new FacetsConfig();
config.setMultiValued("a", true);
config.setRequireDimCount("a", true);
config.setMultiValued("c", true);
config.setHierarchical("c", true);
try (Directory dir = newDirectory();
@ -152,10 +158,10 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
try {
Facets facets = getAllFacets(searcher, state, exec);
// since a is not set to be hierarchical, it's value count will be bugged as ancestral
// paths are not indexed
// since a is not set to be hierarchical but _is_ multi-valued, we expect a value of 2
// (since two unique docs contain at least one value for this dim):
assertEquals(
"dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
"dim=a path=[] value=2 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
facets.getTopChildren(10, "a").toString());
assertEquals(
"dim=b path=[] value=1 childCount=1\n baz (1)\n",