mirror of https://github.com/apache/druid.git
add null value index wiring for nested column to speed up is null/is not null (#15687)
Nested columns maintain a null value bitmap recording which rows are null; however, I forgot to wire up a ColumnIndexSupplier to nested columns when filtering the 'raw' data itself, so these bitmaps could not be used. This PR fixes that by adding a supplier that can return a NullValueIndex to be used by the NullFilter, which should speed up is null and is not null filters on json columns. I haven't yet measured the difference, but I expect a significant speed increase. Note that I only wired this up when druid.generic.useDefaultValueForNull=false (SQL-compatible mode). The reason is that the SQL planner still uses the selector filter, which cannot properly handle arrays or complex types (including json, even when only checking for nulls). Restricting the index to SQL-compatible mode keeps behavior consistent between the index path and the value-matcher path; otherwise we would end up in a situation where the index gives correct results but the value matcher does not, which I wanted to avoid.
This commit is contained in:
parent
989a8f7874
commit
01fa5c7ea6
|
@ -26,6 +26,7 @@ import org.apache.druid.java.util.common.StringUtils;
|
|||
import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
|
||||
import org.apache.druid.segment.column.ColumnBuilder;
|
||||
import org.apache.druid.segment.column.ColumnConfig;
|
||||
import org.apache.druid.segment.column.ColumnIndexSupplier;
|
||||
import org.apache.druid.segment.column.ColumnType;
|
||||
import org.apache.druid.segment.column.StringEncodingStrategies;
|
||||
import org.apache.druid.segment.data.BitmapSerdeFactory;
|
||||
|
@ -35,6 +36,8 @@ import org.apache.druid.segment.data.FrontCodedIntArrayIndexed;
|
|||
import org.apache.druid.segment.data.GenericIndexed;
|
||||
import org.apache.druid.segment.data.Indexed;
|
||||
import org.apache.druid.segment.data.VByte;
|
||||
import org.apache.druid.segment.index.SimpleImmutableBitmapIndex;
|
||||
import org.apache.druid.segment.index.semantic.NullValueIndex;
|
||||
import org.apache.druid.segment.serde.NestedCommonFormatColumnPartSerde;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
@ -42,7 +45,7 @@ import java.io.IOException;
|
|||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
public class NestedDataColumnSupplier implements Supplier<NestedCommonFormatColumn>
|
||||
public class NestedDataColumnSupplier implements Supplier<NestedCommonFormatColumn>, ColumnIndexSupplier
|
||||
{
|
||||
public static NestedDataColumnSupplier read(
|
||||
ColumnType logicalType,
|
||||
|
@ -242,4 +245,14 @@ public class NestedDataColumnSupplier implements Supplier<NestedCommonFormatColu
|
|||
{
|
||||
return simpleType == null ? ColumnType.NESTED_DATA : simpleType;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
@Override
|
||||
public <T> T as(Class<T> clazz)
|
||||
{
|
||||
if (clazz.equals(NullValueIndex.class)) {
|
||||
return (T) (NullValueIndex) () -> new SimpleImmutableBitmapIndex(nullValues);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.apache.druid.segment.serde;
|
|||
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import org.apache.druid.common.config.NullHandling;
|
||||
import org.apache.druid.java.util.common.io.smoosh.SmooshedFileMapper;
|
||||
import org.apache.druid.segment.column.ColumnBuilder;
|
||||
import org.apache.druid.segment.column.ColumnCapabilitiesImpl;
|
||||
|
@ -308,6 +309,14 @@ public class NestedCommonFormatColumnPartSerde implements ColumnPartSerde
|
|||
ColumnType logicalType = simpleType == null ? ColumnType.NESTED_DATA : simpleType;
|
||||
builder.setType(logicalType);
|
||||
builder.setNestedCommonFormatColumnSupplier(supplier);
|
||||
// in default value mode, SQL planning by default uses selector filters for things like 'is null', which does
|
||||
// not work correctly for complex types (or arrays). so, only hook up this index in sql compatible mode so that
|
||||
// query results are consistent when using an index or the value matcher
|
||||
// additionally, nested columns only have a null value index, so we only bother with the index supplier if there
|
||||
// are actually any null rows, otherwise we use the default 'no indexes' supplier
|
||||
if (NullHandling.sqlCompatible() && hasNulls) {
|
||||
builder.setIndexSupplier(supplier, false, false);
|
||||
}
|
||||
builder.setColumnFormat(new NestedCommonFormatColumn.Format(logicalType, hasNulls, enforceLogicalType));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -917,6 +917,74 @@ public class NestedDataScanQueryTest extends InitializedNullHandlingTest
|
|||
Assert.assertEquals(results.get(0).getEvents().toString(), resultsRealtime.get(0).getEvents().toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIngestAndScanSegmentsNestedColumnNotNullFilter() throws Exception
|
||||
{
|
||||
Druids.ScanQueryBuilder builder = Druids.newScanQueryBuilder()
|
||||
.dataSource("test_datasource")
|
||||
.intervals(
|
||||
new MultipleIntervalSegmentSpec(
|
||||
Collections.singletonList(Intervals.ETERNITY)
|
||||
)
|
||||
)
|
||||
.filters(NotDimFilter.of(NullFilter.forColumn("complexObj")))
|
||||
.columns("complexObj")
|
||||
.resultFormat(ScanQuery.ResultFormat.RESULT_FORMAT_COMPACTED_LIST)
|
||||
.limit(100)
|
||||
.context(ImmutableMap.of());
|
||||
Query<ScanResultValue> scanQuery = builder.build();
|
||||
final AggregatorFactory[] aggs = new AggregatorFactory[]{new CountAggregatorFactory("count")};
|
||||
List<Segment> realtimeSegs = ImmutableList.of(
|
||||
NestedDataTestUtils.createIncrementalIndex(
|
||||
tempFolder,
|
||||
NestedDataTestUtils.ALL_TYPES_TEST_DATA_FILE,
|
||||
NestedDataTestUtils.DEFAULT_JSON_INPUT_FORMAT,
|
||||
NestedDataTestUtils.TIMESTAMP_SPEC,
|
||||
NestedDataTestUtils.AUTO_DISCOVERY,
|
||||
TransformSpec.NONE,
|
||||
aggs,
|
||||
Granularities.NONE,
|
||||
true
|
||||
)
|
||||
);
|
||||
List<Segment> segs = NestedDataTestUtils.createSegments(
|
||||
tempFolder,
|
||||
closer,
|
||||
NestedDataTestUtils.ALL_TYPES_TEST_DATA_FILE,
|
||||
NestedDataTestUtils.DEFAULT_JSON_INPUT_FORMAT,
|
||||
NestedDataTestUtils.TIMESTAMP_SPEC,
|
||||
NestedDataTestUtils.AUTO_DISCOVERY,
|
||||
TransformSpec.NONE,
|
||||
aggs,
|
||||
Granularities.NONE,
|
||||
true,
|
||||
IndexSpec.DEFAULT
|
||||
);
|
||||
|
||||
|
||||
final Sequence<ScanResultValue> seq = helper.runQueryOnSegmentsObjs(realtimeSegs, scanQuery);
|
||||
final Sequence<ScanResultValue> seq2 = helper.runQueryOnSegmentsObjs(segs, scanQuery);
|
||||
|
||||
List<ScanResultValue> resultsRealtime = seq.toList();
|
||||
List<ScanResultValue> resultsSegments = seq2.toList();
|
||||
logResults(resultsSegments);
|
||||
logResults(resultsRealtime);
|
||||
Assert.assertEquals(1, resultsRealtime.size());
|
||||
Assert.assertEquals(resultsRealtime.size(), resultsSegments.size());
|
||||
if (NullHandling.replaceWithDefault()) {
|
||||
Assert.assertEquals(
|
||||
"[[{x=400, y=[{l=[null], m=100, n=5}, {l=[a, b, c], m=a, n=1}], z={}}], [{x=10, y=[{l=[b, b, c], m=b, n=2}, [1, 2, 3]], z={a=[5.5], b=false}}], [{x=1234, y=[{l=[a, b, c], m=a, n=1}, {l=[a, b, c], m=a, n=1}], z={a=[1.1, 2.2, 3.3], b=true}}], [{x=1234, z={a=[1.1, 2.2, 3.3], b=true}}], [{x=11, y=[], z={a=[null], b=false}}], [{x=4.4, y=[{l=[], m=100, n=3}, {l=[a]}, {l=[b], n=[]}], z={a=[], b=true}}]]",
|
||||
resultsSegments.get(0).getEvents().toString()
|
||||
);
|
||||
} else {
|
||||
Assert.assertEquals(
|
||||
"[[{x=400, y=[{l=[null], m=100, n=5}, {l=[a, b, c], m=a, n=1}], z={}}], [{x=10, y=[{l=[b, b, c], m=b, n=2}, [1, 2, 3]], z={a=[5.5], b=false}}], [{x=1234, y=[{l=[a, b, c], m=a, n=1}, {l=[a, b, c], m=a, n=1}], z={a=[1.1, 2.2, 3.3], b=true}}], [{x=1234, z={a=[1.1, 2.2, 3.3], b=true}}], [{x=11, y=[], z={a=[null], b=false}}], [{x=4.4, y=[{l=[], m=100, n=3}, {l=[a]}, {l=[b], n=[]}], z={a=[], b=true}}]]",
|
||||
resultsSegments.get(0).getEvents().toString()
|
||||
);
|
||||
}
|
||||
Assert.assertEquals(resultsSegments.get(0).getEvents().toString(), resultsRealtime.get(0).getEvents().toString());
|
||||
}
|
||||
|
||||
private static void logResults(List<ScanResultValue> results)
|
||||
{
|
||||
StringBuilder bob = new StringBuilder();
|
||||
|
|
Loading…
Reference in New Issue