mirror of https://github.com/apache/lucene.git
LUCENE-9952: Fix dim count inaccuracies in SSDV faceting when a dim is multi-valued (#611)
This commit is contained in:
parent
10ca531ddc
commit
9e560c1af1
|
@ -64,7 +64,7 @@ API Changes
|
|||
* LUCENE-10349: WordListLoader methods now return unmodifiable CharArraySets. (Uwe Schindler)
|
||||
|
||||
* LUCENE-10377: SortField.getComparator() has changed signature. The second parameter is now
|
||||
a boolean indicating whether or not skipping should be enabled on the comparator.
|
||||
a boolean indicating whether or not skipping should be enabled on the comparator.
|
||||
(Alan Woodward)
|
||||
|
||||
* LUCENE-10381: Require users to provide FacetsConfig for SSDV faceting. (Greg Miller)
|
||||
|
@ -193,12 +193,15 @@ Bug Fixes
|
|||
* LUCENE-10352: Fixed ctor argument checks: JapaneseKatakanaStemFilter,
|
||||
DoubleMetaphoneFilter (Uwe Schindler, Robert Muir)
|
||||
|
||||
* LLUCENE-10353: Add random null injection to TestRandomChains. (Robert Muir,
|
||||
* LUCENE-10353: Add random null injection to TestRandomChains. (Robert Muir,
|
||||
Uwe Schindler)
|
||||
|
||||
* LUCENE-10377: CheckIndex could incorrectly throw an error when checking index sorts
|
||||
defined on older indexes. (Alan Woodward)
|
||||
|
||||
* LUCENE-9952: Address inaccurate dim counts for SSDV faceting in cases where a dim is configured
|
||||
as multi-valued. (Greg Miller)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -479,10 +479,14 @@ public class FacetsConfig {
|
|||
for (SortedSetDocValuesFacetField facetField : ent.getValue()) {
|
||||
FacetLabel facetLabel = new FacetLabel(facetField.dim, facetField.path);
|
||||
DimConfig dimConfig = getDimConfig(facetField.dim);
|
||||
|
||||
// For facet counts:
|
||||
if (dimConfig.hierarchical) {
|
||||
// Index each member of the path explicitly. This is required for hierarchical counting
|
||||
// to work properly since we need to ensure every unique path has a corresponding ordinal
|
||||
// in the SSDV field:
|
||||
for (int i = 0; i < facetLabel.length; i++) {
|
||||
String fullPath = pathToString(facetLabel.components, i + 1);
|
||||
// For facet counts:
|
||||
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath)));
|
||||
}
|
||||
} else {
|
||||
|
@ -494,10 +498,15 @@ public class FacetsConfig {
|
|||
+ facetField.path.length
|
||||
+ " components");
|
||||
}
|
||||
if (dimConfig.multiValued && dimConfig.requireDimCount) {
|
||||
// If non-hierarchical but multi-valued and requiring dim count, make sure to
|
||||
// explicitly index the dimension so we can get accurate counts for it:
|
||||
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(facetField.dim)));
|
||||
}
|
||||
String fullPath = pathToString(facetLabel.components, facetLabel.length);
|
||||
// For facet counts:
|
||||
doc.add(new SortedSetDocValuesField(indexFieldName, new BytesRef(fullPath)));
|
||||
}
|
||||
|
||||
// For drill-down:
|
||||
indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel);
|
||||
}
|
||||
|
|
|
@ -102,14 +102,16 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
|
|||
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
|
||||
}
|
||||
|
||||
if (stateConfig.getDimConfig(dim).hierarchical) {
|
||||
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
|
||||
|
||||
if (dimConfig.hierarchical) {
|
||||
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
|
||||
if (pathOrd < 0) {
|
||||
// path was never indexed
|
||||
return null;
|
||||
}
|
||||
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
|
||||
return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
|
||||
return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
|
||||
} else {
|
||||
if (path.length > 0) {
|
||||
throw new IllegalArgumentException(
|
||||
|
@ -120,12 +122,25 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
|
|||
// means dimension was never indexed
|
||||
return null;
|
||||
}
|
||||
return getDim(dim, null, -1, ordRange.iterator(), topN);
|
||||
int dimOrd = ordRange.start;
|
||||
PrimitiveIterator.OfInt childIt = ordRange.iterator();
|
||||
if (dimConfig.multiValued && dimConfig.requireDimCount) {
|
||||
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
|
||||
// the dimension and we need to skip past it so the iterator is positioned on the first
|
||||
// child:
|
||||
childIt.next();
|
||||
}
|
||||
return getPathResult(dimConfig, dim, null, -dimOrd, childIt, topN);
|
||||
}
|
||||
}
|
||||
|
||||
private FacetResult getDim(
|
||||
String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN)
|
||||
private FacetResult getPathResult(
|
||||
FacetsConfig.DimConfig dimConfig,
|
||||
String dim,
|
||||
String[] path,
|
||||
int pathOrd,
|
||||
PrimitiveIterator.OfInt childOrds,
|
||||
int topN)
|
||||
throws IOException {
|
||||
|
||||
TopOrdAndIntQueue q = null;
|
||||
|
@ -174,11 +189,17 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
|
|||
labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
|
||||
}
|
||||
|
||||
if (pathOrd == -1) {
|
||||
// not hierarchical facet
|
||||
if (dimConfig.hierarchical == false) {
|
||||
// see if dimCount is actually reliable or needs to be reset
|
||||
if (dimConfig.multiValued) {
|
||||
if (dimConfig.requireDimCount) {
|
||||
dimCount = counts.get(pathOrd);
|
||||
} else {
|
||||
dimCount = -1; // dimCount is in accurate at this point, so set it to -1
|
||||
}
|
||||
}
|
||||
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
|
||||
} else {
|
||||
// hierarchical facet
|
||||
return new FacetResult(dim, path, counts.get(pathOrd), labelValues, childCount);
|
||||
}
|
||||
}
|
||||
|
@ -393,15 +414,25 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
|
|||
|
||||
List<FacetResult> results = new ArrayList<>();
|
||||
for (String dim : state.getDims()) {
|
||||
if (stateConfig.getDimConfig(dim).hierarchical) {
|
||||
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
|
||||
if (dimConfig.hierarchical) {
|
||||
SortedSetDocValuesReaderState.DimTree dimTree = state.getDimTree(dim);
|
||||
FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN);
|
||||
int dimOrd = dimTree.dimStartOrd;
|
||||
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, dimTree.iterator(), topN);
|
||||
if (fr != null) {
|
||||
results.add(fr);
|
||||
}
|
||||
} else {
|
||||
OrdRange ordRange = state.getOrdRange(dim);
|
||||
FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN);
|
||||
int dimOrd = ordRange.start;
|
||||
PrimitiveIterator.OfInt childIt = ordRange.iterator();
|
||||
if (dimConfig.multiValued && dimConfig.requireDimCount) {
|
||||
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
|
||||
// the dimension and we need to skip past it so the iterator is positioned on the first
|
||||
// child:
|
||||
childIt.next();
|
||||
}
|
||||
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, childIt, topN);
|
||||
if (fr != null) {
|
||||
results.add(fr);
|
||||
}
|
||||
|
|
|
@ -189,7 +189,9 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead
|
|||
|
||||
BytesRef nextTerm = dv.lookupOrd(dimEndOrd);
|
||||
String[] nextComponents = FacetsConfig.stringToPath(nextTerm.utf8ToString());
|
||||
if (nextComponents.length != 2) {
|
||||
// The first entry should always be length 1 or 2 (either just the dim itself if we explicitly
|
||||
// indexed it, or the first child):
|
||||
if (nextComponents.length > 2) {
|
||||
throw new IllegalArgumentException(
|
||||
"dimension not configured to handle hierarchical field; got: "
|
||||
+ Arrays.toString(nextComponents)
|
||||
|
@ -212,6 +214,7 @@ public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesRead
|
|||
break;
|
||||
}
|
||||
|
||||
// Each entry should have a length of exactly 2 since the dim is non-hierarchical:
|
||||
if (nextComponents.length != 2) {
|
||||
throw new IllegalArgumentException(
|
||||
"dimension not configured to handle hierarchical field; got: "
|
||||
|
|
|
@ -60,7 +60,11 @@ import org.apache.lucene.util.LongValues;
|
|||
* <p><b>NOTE</b>: this class should be instantiated and then used from a single thread, because it
|
||||
* holds a thread-private instance of {@link SortedSetDocValues}.
|
||||
*
|
||||
* <p><b>NOTE:</b>: tie-break is by unicode sort order
|
||||
* <p><b>NOTE</b>: tie-break is by unicode sort order
|
||||
*
|
||||
* <p><b>NOTE</b>: if you have multi-valued dims that require dim counts (see {@link FacetsConfig},
|
||||
* make sure to provide your {@code FacetsConfig} instance when instantiating {@link
|
||||
* SortedSetDocValuesReaderState}, or else dim counts can be inaccurate
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -101,14 +105,16 @@ public class SortedSetDocValuesFacetCounts extends Facets {
|
|||
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
|
||||
}
|
||||
|
||||
if (stateConfig.getDimConfig(dim).hierarchical) {
|
||||
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
|
||||
|
||||
if (dimConfig.hierarchical) {
|
||||
int pathOrd = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
|
||||
if (pathOrd < 0) {
|
||||
// path was never indexed
|
||||
return null;
|
||||
}
|
||||
DimTree dimTree = state.getDimTree(dim);
|
||||
return getDim(dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
|
||||
return getPathResult(dimConfig, dim, path, pathOrd, dimTree.iterator(pathOrd), topN);
|
||||
} else {
|
||||
if (path.length > 0) {
|
||||
throw new IllegalArgumentException(
|
||||
|
@ -119,12 +125,25 @@ public class SortedSetDocValuesFacetCounts extends Facets {
|
|||
// means dimension was never indexed
|
||||
return null;
|
||||
}
|
||||
return getDim(dim, null, -1, ordRange.iterator(), topN);
|
||||
int dimOrd = ordRange.start;
|
||||
PrimitiveIterator.OfInt childIt = ordRange.iterator();
|
||||
if (dimConfig.multiValued && dimConfig.requireDimCount) {
|
||||
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
|
||||
// the dimension and we need to skip past it so the iterator is positioned on the first
|
||||
// child:
|
||||
childIt.next();
|
||||
}
|
||||
return getPathResult(dimConfig, dim, null, dimOrd, childIt, topN);
|
||||
}
|
||||
}
|
||||
|
||||
private FacetResult getDim(
|
||||
String dim, String[] path, int pathOrd, PrimitiveIterator.OfInt childOrds, int topN)
|
||||
private FacetResult getPathResult(
|
||||
FacetsConfig.DimConfig dimConfig,
|
||||
String dim,
|
||||
String[] path,
|
||||
int pathOrd,
|
||||
PrimitiveIterator.OfInt childOrds,
|
||||
int topN)
|
||||
throws IOException {
|
||||
|
||||
TopOrdAndIntQueue q = null;
|
||||
|
@ -172,11 +191,17 @@ public class SortedSetDocValuesFacetCounts extends Facets {
|
|||
labelValues[i] = new LabelAndValue(parts[parts.length - 1], ordAndValue.value);
|
||||
}
|
||||
|
||||
if (pathOrd == -1) {
|
||||
// not hierarchical facet
|
||||
if (dimConfig.hierarchical == false) {
|
||||
// see if dimCount is actually reliable or needs to be reset
|
||||
if (dimConfig.multiValued) {
|
||||
if (dimConfig.requireDimCount) {
|
||||
dimCount = counts[pathOrd];
|
||||
} else {
|
||||
dimCount = -1; // dimCount is in accurate at this point, so set it to -1
|
||||
}
|
||||
}
|
||||
return new FacetResult(dim, emptyPath, dimCount, labelValues, childCount);
|
||||
} else {
|
||||
// hierarchical facet
|
||||
return new FacetResult(dim, path, counts[pathOrd], labelValues, childCount);
|
||||
}
|
||||
}
|
||||
|
@ -346,15 +371,25 @@ public class SortedSetDocValuesFacetCounts extends Facets {
|
|||
|
||||
List<FacetResult> results = new ArrayList<>();
|
||||
for (String dim : state.getDims()) {
|
||||
if (stateConfig.getDimConfig(dim).hierarchical) {
|
||||
FacetsConfig.DimConfig dimConfig = stateConfig.getDimConfig(dim);
|
||||
if (dimConfig.hierarchical) {
|
||||
DimTree dimTree = state.getDimTree(dim);
|
||||
FacetResult fr = getDim(dim, emptyPath, dimTree.dimStartOrd, dimTree.iterator(), topN);
|
||||
int dimOrd = dimTree.dimStartOrd;
|
||||
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, dimTree.iterator(), topN);
|
||||
if (fr != null) {
|
||||
results.add(fr);
|
||||
}
|
||||
} else {
|
||||
OrdRange ordRange = state.getOrdRange(dim);
|
||||
FacetResult fr = getDim(dim, emptyPath, -1, ordRange.iterator(), topN);
|
||||
int dimOrd = ordRange.start;
|
||||
PrimitiveIterator.OfInt childIt = ordRange.iterator();
|
||||
if (dimConfig.multiValued && dimConfig.requireDimCount) {
|
||||
// If the dim is multi-valued and requires dim counts, we know we've explicitly indexed
|
||||
// the dimension and we need to skip past it so the iterator is positioned on the first
|
||||
// child:
|
||||
childIt.next();
|
||||
}
|
||||
FacetResult fr = getPathResult(dimConfig, dim, emptyPath, dimOrd, childIt, topN);
|
||||
if (fr != null) {
|
||||
results.add(fr);
|
||||
}
|
||||
|
|
|
@ -63,6 +63,8 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
public void testBasic() throws Exception {
|
||||
FacetsConfig config = new FacetsConfig();
|
||||
config.setMultiValued("a", true);
|
||||
config.setMultiValued("b", true);
|
||||
config.setRequireDimCount("b", true);
|
||||
try (Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
|
||||
Document doc = new Document();
|
||||
|
@ -70,6 +72,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
doc.add(new SortedSetDocValuesFacetField("a", "bar"));
|
||||
doc.add(new SortedSetDocValuesFacetField("a", "zoo"));
|
||||
doc.add(new SortedSetDocValuesFacetField("b", "baz"));
|
||||
doc.add(new SortedSetDocValuesFacetField("b", "buzz"));
|
||||
writer.addDocument(config.build(doc));
|
||||
if (random().nextBoolean()) {
|
||||
writer.commit();
|
||||
|
@ -77,6 +80,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
|
||||
doc = new Document();
|
||||
doc.add(new SortedSetDocValuesFacetField("a", "foo"));
|
||||
doc.add(new SortedSetDocValuesFacetField("b", "buzz"));
|
||||
writer.addDocument(config.build(doc));
|
||||
|
||||
// NRT open
|
||||
|
@ -91,12 +95,13 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
try {
|
||||
Facets facets = getAllFacets(searcher, state, exec);
|
||||
|
||||
// value should ideally be 2 but SSDV facets are bugged here
|
||||
// value for dim a should be -1 since it's multivalued but doesn't require dim counts:
|
||||
assertEquals(
|
||||
"dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
|
||||
"dim=a path=[] value=-1 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
|
||||
facets.getTopChildren(10, "a").toString());
|
||||
// value for dim b should be 2 since it's multivalued but _does_ require dim counts:
|
||||
assertEquals(
|
||||
"dim=b path=[] value=1 childCount=1\n baz (1)\n",
|
||||
"dim=b path=[] value=2 childCount=2\n buzz (2)\n baz (1)\n",
|
||||
facets.getTopChildren(10, "b").toString());
|
||||
|
||||
// DrillDown:
|
||||
|
@ -115,6 +120,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
public void testBasicHierarchical() throws Exception {
|
||||
FacetsConfig config = new FacetsConfig();
|
||||
config.setMultiValued("a", true);
|
||||
config.setRequireDimCount("a", true);
|
||||
config.setMultiValued("c", true);
|
||||
config.setHierarchical("c", true);
|
||||
try (Directory dir = newDirectory();
|
||||
|
@ -152,10 +158,10 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
try {
|
||||
Facets facets = getAllFacets(searcher, state, exec);
|
||||
|
||||
// since a is not set to be hierarchical, it's value count will be bugged as ancestral
|
||||
// paths are not indexed
|
||||
// since a is not set to be hierarchical but _is_ multi-valued, we expect a value of 2
|
||||
// (since two unique docs contain at least one value for this dim):
|
||||
assertEquals(
|
||||
"dim=a path=[] value=4 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
|
||||
"dim=a path=[] value=2 childCount=3\n foo (2)\n bar (1)\n zoo (1)\n",
|
||||
facets.getTopChildren(10, "a").toString());
|
||||
assertEquals(
|
||||
"dim=b path=[] value=1 childCount=1\n baz (1)\n",
|
||||
|
|
Loading…
Reference in New Issue