add null value index wiring for nested column to speed up is null/is not null (#15687)

Nested columns maintain a null value bitmap recording which rows are null; however, I forgot to wire up a ColumnIndexSupplier to nested columns when filtering the 'raw' data itself, so this bitmap could not be used. This PR fixes that by adding a supplier that can return a NullValueIndex to be used by the NullFilter, which should speed up 'is null' and 'is not null' filters on JSON columns.

I haven't spent the time to measure the difference yet, but I imagine it should be a significant speed increase.

Note that I only wired this up if druid.generic.useDefaultValueForNull=false (SQL compatible mode). In default value mode the SQL planner still uses the selector filter, which is unable to properly handle arrays or complex types (including JSON, even when only checking for nulls). Restricting the index to SQL compatible mode keeps the behavior consistent between using the index and using the value matcher; otherwise we would get into a situation where using the index has correct behavior but using the value matcher does not, which I was trying to avoid.
This commit is contained in:
Clint Wylie 2024-01-28 23:04:50 -08:00 committed by GitHub
parent 989a8f7874
commit 01fa5c7ea6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 91 additions and 1 deletions

View File

@ -26,6 +26,7 @@ import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
import org.apache.druid.segment.column.ColumnBuilder;
import org.apache.druid.segment.column.ColumnConfig;
import org.apache.druid.segment.column.ColumnIndexSupplier;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.StringEncodingStrategies;
import org.apache.druid.segment.data.BitmapSerdeFactory;
@ -35,6 +36,8 @@ import org.apache.druid.segment.data.FrontCodedIntArrayIndexed;
import org.apache.druid.segment.data.GenericIndexed;
import org.apache.druid.segment.data.Indexed;
import org.apache.druid.segment.data.VByte;
import org.apache.druid.segment.index.SimpleImmutableBitmapIndex;
import org.apache.druid.segment.index.semantic.NullValueIndex;
import org.apache.druid.segment.serde.NestedCommonFormatColumnPartSerde;
import javax.annotation.Nullable;
@ -42,7 +45,7 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
public class NestedDataColumnSupplier implements Supplier<NestedCommonFormatColumn>
public class NestedDataColumnSupplier implements Supplier<NestedCommonFormatColumn>, ColumnIndexSupplier
{
public static NestedDataColumnSupplier read(
ColumnType logicalType,
@ -242,4 +245,14 @@ public class NestedDataColumnSupplier implements Supplier<NestedCommonFormatColu
{
return simpleType == null ? ColumnType.NESTED_DATA : simpleType;
}
/**
 * Exposes the indexes this supplier can provide for the requested index type.
 *
 * Only {@link NullValueIndex} is supported: it is answered with an index that wraps {@code nullValues} in a
 * {@link SimpleImmutableBitmapIndex}. Any other requested type yields {@code null}.
 *
 * @param clazz the index type being requested
 * @return a {@link NullValueIndex} when that is the requested type, otherwise {@code null}
 */
@Nullable
@Override
public <T> T as(Class<T> clazz)
{
  // guard clause: anything other than the null value index is not available from this supplier
  if (!clazz.equals(NullValueIndex.class)) {
    return null;
  }
  // the wrapping index is created each time the returned NullValueIndex is asked for it
  final NullValueIndex nullValueIndex = () -> new SimpleImmutableBitmapIndex(nullValues);
  return (T) nullValueIndex;
}
}

View File

@ -22,6 +22,7 @@ package org.apache.druid.segment.serde;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
import org.apache.druid.segment.column.ColumnBuilder;
import org.apache.druid.segment.column.ColumnCapabilitiesImpl;
@ -308,6 +309,14 @@ public class NestedCommonFormatColumnPartSerde implements ColumnPartSerde
ColumnType logicalType = simpleType == null ? ColumnType.NESTED_DATA : simpleType;
builder.setType(logicalType);
builder.setNestedCommonFormatColumnSupplier(supplier);
// in default value mode, SQL planning by default uses selector filters for things like 'is null', which does
// not work correctly for complex types (or arrays). so, only hook up this index in sql compatible mode so that
// query results are consistent when using an index or the value matcher
// additionally, nested columns only have a null value index, so we only bother with the index supplier if there
// are actually any null rows, otherwise we use the default 'no indexes' supplier
if (NullHandling.sqlCompatible() && hasNulls) {
builder.setIndexSupplier(supplier, false, false);
}
builder.setColumnFormat(new NestedCommonFormatColumn.Format(logicalType, hasNulls, enforceLogicalType));
}
}

View File

@ -917,6 +917,74 @@ public class NestedDataScanQueryTest extends InitializedNullHandlingTest
Assert.assertEquals(results.get(0).getEvents().toString(), resultsRealtime.get(0).getEvents().toString());
}
/**
 * Runs a scan query filtered by {@code NOT (complexObj IS NULL)} against both an incremental-index segment and
 * segments built by {@code NestedDataTestUtils.createSegments} from the same test data file, then verifies that
 * both produce a single result with the same expected events.
 *
 * The expected events are the same in default-value mode and SQL compatible mode, so one assertion covers both.
 */
@Test
public void testIngestAndScanSegmentsNestedColumnNotNullFilter() throws Exception
{
  Druids.ScanQueryBuilder builder = Druids.newScanQueryBuilder()
                                          .dataSource("test_datasource")
                                          .intervals(
                                              new MultipleIntervalSegmentSpec(
                                                  Collections.singletonList(Intervals.ETERNITY)
                                              )
                                          )
                                          .filters(NotDimFilter.of(NullFilter.forColumn("complexObj")))
                                          .columns("complexObj")
                                          .resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_COMPACTED_LIST)
                                          .limit(100)
                                          .context(ImmutableMap.of());
  Query<ScanResultValue> scanQuery = builder.build();
  final AggregatorFactory[] aggs = new AggregatorFactory[]{new CountAggregatorFactory("count")};
  List<Segment> realtimeSegs = ImmutableList.of(
      NestedDataTestUtils.createIncrementalIndex(
          tempFolder,
          NestedDataTestUtils.ALL_TYPES_TEST_DATA_FILE,
          NestedDataTestUtils.DEFAULT_JSON_INPUT_FORMAT,
          NestedDataTestUtils.TIMESTAMP_SPEC,
          NestedDataTestUtils.AUTO_DISCOVERY,
          TransformSpec.NONE,
          aggs,
          Granularities.NONE,
          true
      )
  );
  List<Segment> segs = NestedDataTestUtils.createSegments(
      tempFolder,
      closer,
      NestedDataTestUtils.ALL_TYPES_TEST_DATA_FILE,
      NestedDataTestUtils.DEFAULT_JSON_INPUT_FORMAT,
      NestedDataTestUtils.TIMESTAMP_SPEC,
      NestedDataTestUtils.AUTO_DISCOVERY,
      TransformSpec.NONE,
      aggs,
      Granularities.NONE,
      true,
      IndexSpec.DEFAULT
  );

  final Sequence<ScanResultValue> seq = helper.runQueryOnSegmentsObjs(realtimeSegs, scanQuery);
  final Sequence<ScanResultValue> seq2 = helper.runQueryOnSegmentsObjs(segs, scanQuery);

  List<ScanResultValue> resultsRealtime = seq.toList();
  List<ScanResultValue> resultsSegments = seq2.toList();
  logResults(resultsSegments);
  logResults(resultsRealtime);
  Assert.assertEquals(1, resultsRealtime.size());
  Assert.assertEquals(resultsRealtime.size(), resultsSegments.size());

  // the previous if/else on NullHandling.replaceWithDefault() asserted the same string in both branches,
  // so the expectation is stated once here
  final String expectedEvents =
      "[[{x=400, y=[{l=[null], m=100, n=5}, {l=[a, b, c], m=a, n=1}], z={}}], [{x=10, y=[{l=[b, b, c], m=b, n=2}, [1, 2, 3]], z={a=[5.5], b=false}}], [{x=1234, y=[{l=[a, b, c], m=a, n=1}, {l=[a, b, c], m=a, n=1}], z={a=[1.1, 2.2, 3.3], b=true}}], [{x=1234, z={a=[1.1, 2.2, 3.3], b=true}}], [{x=11, y=[], z={a=[null], b=false}}], [{x=4.4, y=[{l=[], m=100, n=3}, {l=[a]}, {l=[b], n=[]}], z={a=[], b=true}}]]";
  Assert.assertEquals(expectedEvents, resultsSegments.get(0).getEvents().toString());
  Assert.assertEquals(resultsSegments.get(0).getEvents().toString(), resultsRealtime.get(0).getEvents().toString());
}
private static void logResults(List<ScanResultValue> results)
{
StringBuilder bob = new StringBuilder();