From 69aac6c8dd5588ed19f0cf90f11025317133df57 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Fri, 20 May 2022 01:51:28 -0700 Subject: [PATCH] Direct UTF-8 access for "in" filters. (#12517) * Direct UTF-8 access for "in" filters. Directly related: 1) InDimFilter: Store stored Strings (in ValuesSet) plus sorted UTF-8 ByteBuffers (in valuesUtf8). Use valuesUtf8 whenever possible. If necessary, the input set is copied into a ValuesSet. Much logic is simplified, because we always know what type the values set will be. I think that there won't even be an efficiency loss in most cases. InDimFilter is most frequently created by deserialization, and this patch updates the JsonCreator constructor to deserialize directly into a ValuesSet. 2) Add Utf8ValueSetIndex, which InDimFilter uses to avoid UTF-8 decodes during index lookups. 3) Add unsigned comparator to ByteBufferUtils and use it in GenericIndexed.BYTE_BUFFER_STRATEGY. This is important because UTF-8 bytes can be compared as bytes if, and only if, the comparison is unsigned. 4) Add specialization to GenericIndexed.singleThreaded().indexOf that avoids needless ByteBuffer allocations. 5) Clarify that objects returned by ColumnIndexSupplier.as are not thread-safe. DictionaryEncodedStringIndexSupplier now calls singleThreaded() on all relevant GenericIndexed objects, saving a ByteBuffer allocation per access. Also: 1) Fix performance regression in LikeFilter: since #12315, it applied the suffix matcher to all values in range even for type MATCH_ALL. 2) Add ObjectStrategy.canCompare() method. This fixes LikeFilterBenchmark, which was broken due to calls to strategy.compare in GenericIndexed.fromIterable. * Add like-filter implementation tests. * Add in-filter implementation tests. * Add tests, fix issues. * Fix style. * Adjustments from review. --- .../druid/benchmark/BoundFilterBenchmark.java | 11 +- .../DimensionPredicateFilterBenchmark.java | 21 +-- .../druid/benchmark/InFilterBenchmark.java | 133 ++++++++++++++ .../druid/benchmark/LikeFilterBenchmark.java | 13 +- .../java/util/common/ByteBufferUtils.java | 79 ++++++++ .../druid/java/util/common/StringUtils.java | 15 ++ .../druid/common/utils/StringUtilsTest.java | 11 ++ .../java/util/common/ByteBufferUtilsTest.java | 52 ++++++ .../java/org/apache/druid/query/Druids.java | 2 +- .../druid/query/filter/InDimFilter.java | 169 ++++++++++-------- .../druid/query/filter/LikeDimFilter.java | 5 +- .../druid/query/filter/SelectorDimFilter.java | 5 +- .../druid/query/topn/TopNQueryBuilder.java | 6 +- .../org/apache/druid/segment/IndexIO.java | 1 + .../segment/column/ColumnIndexSupplier.java | 2 + .../column/LexicographicalRangeIndex.java | 16 +- .../segment/column/StringValueSetIndex.java | 7 +- .../segment/column/Utf8ValueSetIndex.java | 35 ++++ .../data/ConciseBitmapSerdeFactory.java | 7 + ...DecompressingByteBufferObjectStrategy.java | 6 + .../druid/segment/data/GenericIndexed.java | 97 ++++++++-- .../segment/data/GenericIndexedWriter.java | 6 + .../druid/segment/data/ObjectStrategy.java | 8 + .../data/RoaringBitmapSerdeFactory.java | 6 + .../druid/segment/filter/LikeFilter.java | 7 +- .../apache/druid/segment/join/Joinable.java | 21 ++- .../join/table/IndexedTableJoinable.java | 16 +- .../DictionaryEncodedColumnPartSerde.java | 1 + .../DictionaryEncodedStringIndexSupplier.java | 135 ++++++++++---- .../virtual/ListFilteredVirtualColumn.java | 15 +- .../druid/query/filter/InDimFilterTest.java | 73 +++++++- .../druid/query/filter/LikeDimFilterTest.java | 57 ++++++ .../filter/ExtractionDimFilterTest.java | 6 + .../druid/segment/filter/InFilterTest.java | 2 +- .../segment/join/JoinFilterAnalyzerTest.java | 54 +++--- .../ArrayOverlapOperatorConversion.java | 4 +- .../filtration/ConvertSelectorsToIns.java | 4 +- 37 files changed, 888 insertions(+), 220 deletions(-) create mode 100644 benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java create mode 100644 processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java index eecb15ed04c..cdb3cf2f7ca 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java @@ -27,6 +27,7 @@ import org.apache.druid.collections.bitmap.MutableBitmap; import org.apache.druid.collections.bitmap.RoaringBitmapFactory; import org.apache.druid.common.config.NullHandling; import org.apache.druid.extendedset.intset.ConciseSetUtils; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.filter.BoundDimFilter; import org.apache.druid.query.filter.ColumnIndexSelector; import org.apache.druid.query.ordering.StringComparators; @@ -48,6 +49,7 @@ import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; @@ -160,7 +162,7 @@ public class BoundFilterBenchmark final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null); final List ints = generateInts(); final GenericIndexed dictionary = GenericIndexed.fromIterable( - FluentIterable.from(ints).transform(i -> i.toString()), + FluentIterable.from(ints).transform(Object::toString), GenericIndexed.STRING_STRATEGY ); final GenericIndexed bitmaps = GenericIndexed.fromIterable( @@ -174,9 +176,14 @@ public class BoundFilterBenchmark ), serdeFactory.getObjectStrategy() ); + final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( + FluentIterable.from(ints) + .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), + GenericIndexed.BYTE_BUFFER_STRATEGY + ); selector = new MockColumnIndexSelector( bitmapFactory, - new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, bitmaps, null) + new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null) ); } diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java index 09e13bd79d4..f01b09a79fe 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java @@ -19,7 +19,6 @@ package org.apache.druid.benchmark; -import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.collect.FluentIterable; @@ -28,6 +27,7 @@ import org.apache.druid.collections.bitmap.ImmutableBitmap; import org.apache.druid.collections.bitmap.MutableBitmap; import org.apache.druid.collections.bitmap.RoaringBitmapFactory; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.filter.ColumnIndexSelector; import org.apache.druid.query.filter.DruidDoublePredicate; import org.apache.druid.query.filter.DruidFloatPredicate; @@ -51,6 +51,7 @@ import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; @@ -123,18 +124,14 @@ public class DimensionPredicateFilterBenchmark final List ints = generateInts(); final GenericIndexed dictionary = GenericIndexed.fromIterable( FluentIterable.from(ints) - .transform( - new Function() - { - @Override - public String apply(Integer i) - { - return i.toString(); - } - } - ), + .transform(Object::toString), GenericIndexed.STRING_STRATEGY ); + final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( + FluentIterable.from(ints) + .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), + GenericIndexed.BYTE_BUFFER_STRATEGY + ); final GenericIndexed bitmaps = GenericIndexed.fromIterable( FluentIterable.from(ints) .transform( @@ -148,7 +145,7 @@ public class DimensionPredicateFilterBenchmark ); selector = new MockColumnIndexSelector( bitmapFactory, - new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, bitmaps, null) + new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null) ); } diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java new file mode 100644 index 00000000000..9556be4cb90 --- /dev/null +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.benchmark; + +import com.google.common.collect.FluentIterable; +import org.apache.druid.collections.bitmap.BitmapFactory; +import org.apache.druid.collections.bitmap.ImmutableBitmap; +import org.apache.druid.collections.bitmap.MutableBitmap; +import org.apache.druid.collections.bitmap.RoaringBitmapFactory; +import org.apache.druid.common.config.NullHandling; +import org.apache.druid.java.util.common.StringUtils; +import org.apache.druid.query.filter.ColumnIndexSelector; +import org.apache.druid.query.filter.InDimFilter; +import org.apache.druid.segment.data.BitmapSerdeFactory; +import org.apache.druid.segment.data.GenericIndexed; +import org.apache.druid.segment.data.RoaringBitmapSerdeFactory; +import org.apache.druid.segment.filter.Filters; +import org.apache.druid.segment.serde.DictionaryEncodedStringIndexSupplier; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.infra.Blackhole; + +import java.nio.ByteBuffer; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +@State(Scope.Benchmark) +@Fork(value = 1) +@Warmup(iterations = 10) +@Measurement(iterations = 10) +public class InFilterBenchmark +{ + static { + NullHandling.initializeForTests(); + } + + private static final int START_INT = 10_000_000; + + private InDimFilter inFilter; + + // cardinality of the dictionary. it will contain this many ints (as strings, of course), starting at START_INT, + // even numbers only. + @Param({"1000000"}) + int dictionarySize; + + // cardinality of the "in" filter. half of its values will be in the dictionary, half will not. + @Param({"10000"}) + int filterSize; + + // selector will contain a "dictionarySize" number of bitmaps; each one contains a single int. + // this benchmark is not about bitmap union speed, so no need for that part to be realistic. + ColumnIndexSelector selector; + + @Setup + public void setup() + { + final BitmapFactory bitmapFactory = new RoaringBitmapFactory(); + final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null); + final Iterable ints = intGenerator(); + final GenericIndexed dictionary = GenericIndexed.fromIterable( + FluentIterable.from(ints) + .transform(Object::toString), + GenericIndexed.STRING_STRATEGY + ); + final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( + FluentIterable.from(ints) + .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), + GenericIndexed.BYTE_BUFFER_STRATEGY + ); + final GenericIndexed bitmaps = GenericIndexed.fromIterable( + () -> IntStream.range(0, dictionarySize) + .mapToObj( + i -> { + final MutableBitmap mutableBitmap = bitmapFactory.makeEmptyMutableBitmap(); + mutableBitmap.add(i); + return bitmapFactory.makeImmutableBitmap(mutableBitmap); + } + ) + .iterator(), + serdeFactory.getObjectStrategy() + ); + selector = new MockColumnIndexSelector( + bitmapFactory, + new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null) + ); + inFilter = new InDimFilter( + "dummy", + IntStream.range(START_INT, START_INT + filterSize).mapToObj(String::valueOf).collect(Collectors.toSet()) + ); + } + + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.MICROSECONDS) + public void doFilter(Blackhole blackhole) + { + final ImmutableBitmap bitmapIndex = Filters.computeDefaultBitmapResults(inFilter, selector); + blackhole.consume(bitmapIndex); + } + + private Iterable intGenerator() + { + // i * 2 => half of these values will be present in the inFilter, half won't. + return () -> IntStream.range(0, dictionarySize).map(i -> START_INT + i * 2).boxed().iterator(); + } +} diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java index d644eb5f2fd..1369d69787d 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java @@ -25,6 +25,7 @@ import org.apache.druid.collections.bitmap.ImmutableBitmap; import org.apache.druid.collections.bitmap.MutableBitmap; import org.apache.druid.collections.bitmap.RoaringBitmapFactory; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.filter.BoundDimFilter; import org.apache.druid.query.filter.ColumnIndexSelector; import org.apache.druid.query.filter.Filter; @@ -50,6 +51,7 @@ import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; @@ -122,11 +124,14 @@ public class LikeFilterBenchmark final List ints = generateInts(); final GenericIndexed dictionary = GenericIndexed.fromIterable( FluentIterable.from(ints) - .transform( - i -> i.toString() - ), + .transform(Object::toString), GenericIndexed.STRING_STRATEGY ); + final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( + FluentIterable.from(ints) + .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), + GenericIndexed.BYTE_BUFFER_STRATEGY + ); final GenericIndexed bitmaps = GenericIndexed.fromIterable( FluentIterable.from(ints) .transform( @@ -140,7 +145,7 @@ public class LikeFilterBenchmark ); selector = new MockColumnIndexSelector( bitmapFactory, - new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, bitmaps, null) + new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null) ); } diff --git a/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java b/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java index 29c46dc69a6..388112378e4 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java +++ b/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java @@ -23,12 +23,14 @@ import org.apache.druid.collections.ResourceHolder; import org.apache.druid.java.util.common.logger.Logger; import org.apache.druid.utils.JvmUtils; +import javax.annotation.Nullable; import java.lang.invoke.MethodHandle; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; import java.lang.reflect.Method; import java.nio.ByteBuffer; import java.nio.MappedByteBuffer; +import java.util.Comparator; import java.util.Objects; import java.util.concurrent.atomic.AtomicBoolean; @@ -48,6 +50,8 @@ public class ByteBufferUtils // null if unmap is supported private static final RuntimeException UNMAP_NOT_SUPPORTED_EXCEPTION; + private static final Comparator COMPARATOR_UNSIGNED = new UnsignedByteBufferComparator(); + static { Object unmap = null; RuntimeException exception = null; @@ -211,4 +215,79 @@ public class ByteBufferUtils { free(buffer); } + + /** + * Compares two ByteBuffer ranges using unsigned byte ordering. + * + * Different from {@link ByteBuffer#compareTo}, which uses signed ordering. + */ + public static int compareByteBuffers( + final ByteBuffer buf1, + final int position1, + final int length1, + final ByteBuffer buf2, + final int position2, + final int length2 + ) + { + final int commonLength = Math.min(length1, length2); + + for (int i = 0; i < commonLength; i++) { + final byte byte1 = buf1.get(position1 + i); + final byte byte2 = buf2.get(position2 + i); + final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison + if (cmp != 0) { + return cmp; + } + } + + return Integer.compare(length1, length2); + } + + /** + * Compares two ByteBuffers from their positions to their limits using unsigned byte ordering. Accepts null + * buffers, which are ordered earlier than any nonnull buffer. + * + * Different from {@link ByteBuffer#compareTo}, which uses signed ordering. + */ + public static int compareByteBuffers( + @Nullable final ByteBuffer buf1, + @Nullable final ByteBuffer buf2 + ) + { + if (buf1 == null) { + return buf2 == null ? 0 : -1; + } + + if (buf2 == null) { + return 1; + } + + return ByteBufferUtils.compareByteBuffers( + buf1, + buf1.position(), + buf1.remaining(), + buf2, + buf2.position(), + buf2.remaining() + ); + } + + /** + * Comparator that compares two {@link ByteBuffer} using unsigned ordering. Null buffers are accepted, and + * are ordered earlier than any nonnull buffer. + */ + public static Comparator unsignedComparator() + { + return COMPARATOR_UNSIGNED; + } + + private static class UnsignedByteBufferComparator implements Comparator + { + @Override + public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2) + { + return ByteBufferUtils.compareByteBuffers(o1, o2); + } + } } diff --git a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java index b076f98bcfe..fd74e48d9a6 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java +++ b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java @@ -108,6 +108,11 @@ public class StringUtils return StringUtils.fromUtf8(buffer, buffer.remaining()); } + /** + * Converts a string to a UTF-8 byte array. + * + * @throws NullPointerException if "string" is null + */ public static byte[] toUtf8(final String string) { try { @@ -119,6 +124,16 @@ public class StringUtils } } + /** + * Converts a string to UTF-8 bytes, returning them as a newly-allocated on-heap {@link ByteBuffer}. + * If "string" is null, returns null. + */ + @Nullable + public static ByteBuffer toUtf8ByteBuffer(@Nullable final String string) + { + return string == null ? null : ByteBuffer.wrap(toUtf8(string)); + } + /** * Encodes "string" into the buffer "byteBuffer", using no more than the number of bytes remaining in the buffer. * Will only encode whole characters. The byteBuffer's position and limit may be changed during operation, but will diff --git a/core/src/test/java/org/apache/druid/common/utils/StringUtilsTest.java b/core/src/test/java/org/apache/druid/common/utils/StringUtilsTest.java index 1ee6643e2f6..787d5daa79d 100644 --- a/core/src/test/java/org/apache/druid/common/utils/StringUtilsTest.java +++ b/core/src/test/java/org/apache/druid/common/utils/StringUtilsTest.java @@ -23,7 +23,10 @@ import org.apache.druid.java.util.common.StringUtils; import org.junit.Assert; import org.junit.Test; +import java.nio.ByteBuffer; + /** + * */ public class StringUtilsTest { @@ -76,4 +79,12 @@ public class StringUtilsTest Assert.assertEquals(4, StringUtils.estimatedBinaryLengthAsUTF8(invalid)); } + @Test + public void testToUtf8ByteBuffer() + { + Assert.assertNull(StringUtils.toUtf8ByteBuffer(null)); + Assert.assertEquals(ByteBuffer.allocate(0), StringUtils.toUtf8ByteBuffer("")); + Assert.assertEquals(ByteBuffer.wrap(StringUtils.toUtf8("foo")), StringUtils.toUtf8ByteBuffer("foo")); + Assert.assertEquals(ByteBuffer.wrap(StringUtils.toUtf8("🙂")), StringUtils.toUtf8ByteBuffer("🙂")); + } } diff --git a/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java b/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java index 91a37dba427..f5acb6e0308 100644 --- a/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java +++ b/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java @@ -21,6 +21,8 @@ package org.apache.druid.java.util.common; import com.google.common.io.Files; import org.apache.druid.collections.ResourceHolder; +import org.hamcrest.MatcherAssert; +import org.hamcrest.Matchers; import org.junit.Assert; import org.junit.Rule; import org.junit.Test; @@ -33,6 +35,7 @@ import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.MappedByteBuffer; import java.util.Arrays; +import java.util.Comparator; public class ByteBufferUtilsTest { @@ -76,4 +79,53 @@ public class ByteBufferUtilsTest final ByteBuffer heapBuffer = ByteBuffer.allocate(4096); ByteBufferUtils.free(heapBuffer); } + + @Test + @SuppressWarnings("EqualsWithItself") + public void testUnsignedComparator() + { + final Comparator comparator = ByteBufferUtils.unsignedComparator(); + + // Tests involving null + MatcherAssert.assertThat(comparator.compare(null, null), Matchers.equalTo(0)); + MatcherAssert.assertThat(comparator.compare(null, ByteBuffer.allocate(0)), Matchers.lessThan(0)); + MatcherAssert.assertThat(comparator.compare(ByteBuffer.allocate(0), null), Matchers.greaterThan(0)); + MatcherAssert.assertThat(comparator.compare(null, ByteBuffer.allocate(1)), Matchers.lessThan(0)); + MatcherAssert.assertThat(comparator.compare(ByteBuffer.allocate(1), null), Matchers.greaterThan(0)); + MatcherAssert.assertThat(comparator.compare(null, ByteBuffer.wrap(new byte[]{-1})), Matchers.lessThan(0)); + MatcherAssert.assertThat(comparator.compare(ByteBuffer.wrap(new byte[]{-1}), null), Matchers.greaterThan(0)); + + // Tests involving buffers of different lengths + MatcherAssert.assertThat( + comparator.compare( + ByteBuffer.wrap(new byte[]{1, 2, 3}), + ByteBuffer.wrap(new byte[]{1, 2, 3, 4}) + ), + Matchers.lessThan(0) + ); + + MatcherAssert.assertThat( + comparator.compare( + ByteBuffer.wrap(new byte[]{1, 2, 3, 4}), + ByteBuffer.wrap(new byte[]{1, 2, 3}) + ), + Matchers.greaterThan(0) + ); + + // Tests involving the full range of bytes + for (byte i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) { + for (byte j = Byte.MIN_VALUE; j < Byte.MAX_VALUE; j++) { + final int cmp = Integer.compare(Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)); + + MatcherAssert.assertThat( + StringUtils.format("comparison of %s to %s", Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)), + comparator.compare( + ByteBuffer.wrap(new byte[]{i}), + ByteBuffer.wrap(new byte[]{j}) + ), + cmp < 0 ? Matchers.lessThan(0) : cmp > 0 ? Matchers.greaterThan(0) : Matchers.equalTo(0) + ); + } + } + } } diff --git a/processing/src/main/java/org/apache/druid/query/Druids.java b/processing/src/main/java/org/apache/druid/query/Druids.java index 6b7739ab66c..a760d1745d8 100644 --- a/processing/src/main/java/org/apache/druid/query/Druids.java +++ b/processing/src/main/java/org/apache/druid/query/Druids.java @@ -210,7 +210,7 @@ public class Druids { final Set filterValues = Sets.newHashSet(values); filterValues.add(value); - dimFilter = new InDimFilter(dimensionName, filterValues, null, null); + dimFilter = new InDimFilter(dimensionName, filterValues); return this; } diff --git a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java index 2c363dc4ade..74270b65c5c 100644 --- a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java +++ b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java @@ -29,11 +29,11 @@ import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.base.Supplier; import com.google.common.base.Suppliers; +import com.google.common.collect.ForwardingSortedSet; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Range; import com.google.common.collect.RangeSet; -import com.google.common.collect.Sets; import com.google.common.collect.TreeRangeSet; import com.google.common.hash.Hasher; import com.google.common.hash.Hashing; @@ -42,6 +42,7 @@ import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.java.util.common.ByteBufferUtils; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.guava.Comparators; @@ -59,25 +60,26 @@ import org.apache.druid.segment.DimensionHandlerUtils; import org.apache.druid.segment.column.BitmapColumnIndex; import org.apache.druid.segment.column.ColumnIndexSupplier; import org.apache.druid.segment.column.StringValueSetIndex; +import org.apache.druid.segment.column.Utf8ValueSetIndex; import org.apache.druid.segment.filter.Filters; import org.apache.druid.segment.vector.VectorColumnSelectorFactory; import javax.annotation.Nullable; +import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; import java.util.Collection; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.SortedSet; +import java.util.TreeSet; public class InDimFilter extends AbstractOptimizableDimFilter implements Filter { - // Values can contain `null` object - private final Set values; + // Values can contain `null` object. Values are sorted (nulls-first). + private final ValuesSet values; + // Computed eagerly, not lazily, because lazy computations would block all processing threads for a given query. + private final SortedSet valuesUtf8; private final String dimension; @Nullable private final ExtractionFn extractionFn; @@ -103,7 +105,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter @JsonProperty("dimension") String dimension, // This 'values' collection instance can be reused if possible to avoid copying a big collection. // Callers should _not_ modify the collection after it is passed to this constructor. - @JsonProperty("values") Set values, + @JsonProperty("values") ValuesSet values, @JsonProperty("extractionFn") @Nullable ExtractionFn extractionFn, @JsonProperty("filterTuning") @Nullable FilterTuning filterTuning ) @@ -121,32 +123,35 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter * Creates a new filter without an extraction function or any special filter tuning. * * @param dimension column to search - * @param values set of values to match. This collection may be reused to avoid copying a big collection. - * Therefore, callers should not modify the collection after it is passed to this - * constructor. + * @param values set of values to match. If this collection is a {@link SortedSet}, it may be reused to avoid + * copying a big collection. Therefore, callers should not modify the collection after it + * is passed to this constructor. */ - public InDimFilter( - String dimension, - Set values - ) + public InDimFilter(String dimension, Set values) { this( dimension, - values, + values instanceof ValuesSet ? (ValuesSet) values : new ValuesSet(values), + null, null, null ); } /** - * This constructor should be called only in unit tests since accepting a Collection makes copying more likely. + * Creates a new filter without an extraction function or any special filter tuning. + * + * @param dimension column to search + * @param values set of values to match. If this collection is a {@link SortedSet}, it may be reused to avoid + * copying a big collection. Therefore, callers should not modify the collection after it + * is passed to this constructor. + * @param extractionFn extraction function to apply to the column before checking against "values" */ - @VisibleForTesting public InDimFilter(String dimension, Collection values, @Nullable ExtractionFn extractionFn) { this( dimension, - values instanceof Set ? (Set) values : new HashSet<>(values), + values instanceof ValuesSet ? (ValuesSet) values : new ValuesSet(values), extractionFn, null, null @@ -158,7 +163,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter */ private InDimFilter( final String dimension, - final Set values, + final ValuesSet values, @Nullable final ExtractionFn extractionFn, @Nullable final FilterTuning filterTuning, @Nullable final DruidPredicateFactory predicateFactory @@ -166,19 +171,15 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter { Preconditions.checkNotNull(values, "values cannot be null"); - // The values set can be huge. Try to avoid copying the set if possible. - // Note that we may still need to copy values to a list for caching. See getCacheKey(). + this.values = values; + if (!NullHandling.sqlCompatible() && values.contains("")) { - // In Non sql compatible mode, empty strings should be converted to nulls for the filter. - // In sql compatible mode, empty strings and nulls should be treated differently - this.values = Sets.newHashSetWithExpectedSize(values.size()); - for (String v : values) { - this.values.add(NullHandling.emptyToNullIfNeeded(v)); - } - } else { - this.values = values; + // In non-SQL-compatible mode, empty strings must be converted to nulls for the filter. + this.values.remove(""); + this.values.add(null); } + this.valuesUtf8 = this.values.toUtf8(); this.dimension = Preconditions.checkNotNull(dimension, "dimension cannot be null"); this.extractionFn = extractionFn; this.filterTuning = filterTuning; @@ -199,7 +200,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter } @JsonProperty - public Set getValues() + public SortedSet getValues() { return values; } @@ -296,9 +297,15 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter selector ); } - final StringValueSetIndex valueSetIndex = indexSupplier.as(StringValueSetIndex.class); - if (valueSetIndex != null) { - return valueSetIndex.forValues(values); + + final Utf8ValueSetIndex utf8ValueSetIndex = indexSupplier.as(Utf8ValueSetIndex.class); + if (utf8ValueSetIndex != null) { + return utf8ValueSetIndex.forSortedValuesUtf8(valuesUtf8); + } + + final StringValueSetIndex stringValueSetIndex = indexSupplier.as(StringValueSetIndex.class); + if (stringValueSetIndex != null) { + return stringValueSetIndex.forSortedValues(values); } } return Filters.makePredicateIndex( @@ -399,20 +406,9 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter private byte[] computeCacheKey() { - final Collection sortedValues; - - if (values instanceof SortedSet && isNaturalOrder(((SortedSet) values).comparator())) { - // Avoid copying "values" when it is already in the order we need for cache key computation. - sortedValues = values; - } else { - final List sortedValuesList = new ArrayList<>(values); - sortedValuesList.sort(Comparators.naturalNullsFirst()); - sortedValues = sortedValuesList; - } - // Hash all values, in sorted order, as their length followed by their content. final Hasher hasher = Hashing.sha256().newHasher(); - for (String v : sortedValues) { + for (String v : values) { if (v == null) { // Encode null as length -1, no content. hasher.putInt(-1); @@ -438,7 +434,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter LookupExtractionFn exFn = (LookupExtractionFn) extractionFn; LookupExtractor lookup = exFn.getLookup(); - final Set keys = new HashSet<>(); + final ValuesSet keys = new ValuesSet(); for (String value : values) { // We cannot do an unapply()-based optimization if the selector value @@ -468,40 +464,11 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter return this; } - /** - * Returns true if the comparator is null or the singleton {@link Comparators#naturalNullsFirst()}. Useful for - * detecting if a sorted set is in natural order or not. - * - * May return false negatives (i.e. there are naturally-ordered comparators that will return false here). - */ - private static boolean isNaturalOrder(@Nullable final Comparator comparator) - { - return comparator == null || Comparators.naturalNullsFirst().equals(comparator); - } - @SuppressWarnings("ReturnValueIgnored") private static Predicate createStringPredicate(final Set values) { Preconditions.checkNotNull(values, "values"); - - try { - // Check to see if values.contains(null) will throw a NullPointerException. Jackson JSON deserialization won't - // lead to this (it will create a HashSet, which can accept nulls). But when InDimFilters are created - // programmatically as a result of optimizations like rewriting inner joins as filters, the passed-in Set may - // not be able to accept nulls. We don't want to copy the Sets (since they may be large) so instead we'll wrap - // it in a null-checking lambda if needed. - values.contains(null); - - // Safe to do values.contains(null). - return values::contains; - } - catch (NullPointerException ignored) { - // Fall through - } - - // Not safe to do values.contains(null); must return a wrapper. - // Return false for null, since an exception means the set cannot accept null (and therefore does not include it). - return value -> value != null && values.contains(value); + return values::contains; } private static DruidLongPredicate createLongPredicate(final Set values) @@ -558,7 +525,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter InFilterDruidPredicateFactory( final ExtractionFn extractionFn, - final Set values + final ValuesSet values ) { this.extractionFn = extractionFn; @@ -638,4 +605,50 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter return Objects.hash(extractionFn, values); } } + + public static class ValuesSet extends ForwardingSortedSet + { + private final SortedSet values; + + public ValuesSet() + { + this.values = new TreeSet<>(Comparators.naturalNullsFirst()); + } + + /** + * Create a ValuesSet from another Collection. The Collection will be reused if it is a {@link SortedSet} with + * an appropriate comparator. + */ + public ValuesSet(final Collection values) + { + if (values instanceof SortedSet && Comparators.naturalNullsFirst() + .equals(((SortedSet) values).comparator())) { + this.values = (SortedSet) values; + } else { + this.values = new TreeSet<>(Comparators.naturalNullsFirst()); + this.values.addAll(values); + } + } + + public SortedSet toUtf8() + { + final TreeSet valuesUtf8 = new TreeSet<>(ByteBufferUtils.unsignedComparator()); + + for (final String value : values) { + if (value == null) { + valuesUtf8.add(null); + } else { + valuesUtf8.add(ByteBuffer.wrap(StringUtils.toUtf8(value))); + } + } + + return valuesUtf8; + } + + @Override + protected SortedSet delegate() + { + return values; + } + } } diff --git a/processing/src/main/java/org/apache/druid/query/filter/LikeDimFilter.java b/processing/src/main/java/org/apache/druid/query/filter/LikeDimFilter.java index 50a06b2392c..f878edc9ce4 100644 --- a/processing/src/main/java/org/apache/druid/query/filter/LikeDimFilter.java +++ b/processing/src/main/java/org/apache/druid/query/filter/LikeDimFilter.java @@ -294,9 +294,8 @@ public class LikeDimFilter extends AbstractOptimizableDimFilter implements DimFi } /** - * Checks if the suffix of strings.get(i) matches the suffix of this matcher. The first prefix.length characters - * of s are ignored. This method is useful if you've already independently verified the prefix. This method - * evalutes strings.get(i) lazily to save time when it isn't necessary to actually look at the string. + * Checks if the suffix of "value" matches the suffix of this matcher. The first prefix.length() characters + * of "value" are ignored. This method is useful if you've already independently verified the prefix. */ public boolean matchesSuffixOnly(@Nullable String value) { diff --git a/processing/src/main/java/org/apache/druid/query/filter/SelectorDimFilter.java b/processing/src/main/java/org/apache/druid/query/filter/SelectorDimFilter.java index c717dbead31..340943a0842 100644 --- a/processing/src/main/java/org/apache/druid/query/filter/SelectorDimFilter.java +++ b/processing/src/main/java/org/apache/druid/query/filter/SelectorDimFilter.java @@ -34,7 +34,6 @@ import org.apache.druid.segment.filter.DimensionPredicateFilter; import org.apache.druid.segment.filter.SelectorFilter; import javax.annotation.Nullable; -import java.util.Collections; import java.util.Objects; import java.util.Set; @@ -96,7 +95,9 @@ public class SelectorDimFilter extends AbstractOptimizableDimFilter implements D @Override public DimFilter optimize() { - return new InDimFilter(dimension, Collections.singleton(value), extractionFn, filterTuning).optimize(); + final InDimFilter.ValuesSet valuesSet = new InDimFilter.ValuesSet(); + valuesSet.add(value); + return new InDimFilter(dimension, valuesSet, extractionFn, filterTuning).optimize(); } @Override diff --git a/processing/src/main/java/org/apache/druid/query/topn/TopNQueryBuilder.java b/processing/src/main/java/org/apache/druid/query/topn/TopNQueryBuilder.java index 6699085a1e1..437f9fab101 100644 --- a/processing/src/main/java/org/apache/druid/query/topn/TopNQueryBuilder.java +++ b/processing/src/main/java/org/apache/druid/query/topn/TopNQueryBuilder.java @@ -20,9 +20,9 @@ package org.apache.druid.query.topn; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Sets; import org.apache.druid.java.util.common.granularity.Granularities; import org.apache.druid.java.util.common.granularity.Granularity; +import org.apache.druid.java.util.common.guava.Comparators; import org.apache.druid.query.BaseQuery; import org.apache.druid.query.DataSource; import org.apache.druid.query.TableDataSource; @@ -45,6 +45,7 @@ import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeSet; import java.util.UUID; /** @@ -234,7 +235,8 @@ public class TopNQueryBuilder public TopNQueryBuilder filters(String dimensionName, String value, String... values) { - final Set filterValues = Sets.newHashSet(values); + final Set filterValues = new TreeSet<>(Comparators.naturalNullsFirst()); + filterValues.addAll(Arrays.asList(values)); filterValues.add(value); dimFilter = new InDimFilter(dimensionName, filterValues); return this; diff --git a/processing/src/main/java/org/apache/druid/segment/IndexIO.java b/processing/src/main/java/org/apache/druid/segment/IndexIO.java index 6c5b1e8235e..6068e8c748b 100644 --- a/processing/src/main/java/org/apache/druid/segment/IndexIO.java +++ b/processing/src/main/java/org/apache/druid/segment/IndexIO.java @@ -460,6 +460,7 @@ public class IndexIO new DictionaryEncodedStringIndexSupplier( new ConciseBitmapFactory(), index.getDimValueLookup(dimension), + index.getDimValueUtf8Lookup(dimension), bitmaps, spatialIndex ), diff --git a/processing/src/main/java/org/apache/druid/segment/column/ColumnIndexSupplier.java b/processing/src/main/java/org/apache/druid/segment/column/ColumnIndexSupplier.java index f5ec1688b45..7e2b21a25d0 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/ColumnIndexSupplier.java +++ b/processing/src/main/java/org/apache/druid/segment/column/ColumnIndexSupplier.java @@ -39,6 +39,8 @@ public interface ColumnIndexSupplier * {@link org.apache.druid.segment.data.Offset} to form the basis of a {@link org.apache.druid.segment.Cursor} * (or {@link org.apache.druid.segment.vector.VectorOffset} and {@link org.apache.druid.segment.vector.VectorCursor}) * which can greatly reduce the total number of rows which need to be scanned and processed. + * + * Objects returned by this method are not thread-safe. */ @Nullable T as(Class clazz); diff --git a/processing/src/main/java/org/apache/druid/segment/column/LexicographicalRangeIndex.java b/processing/src/main/java/org/apache/druid/segment/column/LexicographicalRangeIndex.java index d506ebc1c11..9568f671a43 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/LexicographicalRangeIndex.java +++ b/processing/src/main/java/org/apache/druid/segment/column/LexicographicalRangeIndex.java @@ -30,21 +30,21 @@ import javax.annotation.Nullable; public interface LexicographicalRangeIndex { /** - * Get an {@link BitmapColumnIndex} corresponding to the values supplied in the specified range + * Get a {@link BitmapColumnIndex} corresponding to the values supplied in the specified range. */ - default BitmapColumnIndex forRange( + BitmapColumnIndex forRange( @Nullable String startValue, boolean startStrict, @Nullable String endValue, boolean endStrict - ) - { - return forRange(startValue, startStrict, endValue, endStrict, (index) -> true); - } + ); /** - * Get an {@link BitmapColumnIndex} corresponding to the values supplied in the specified range whose dictionary ids - * also match some predicate, such as to match a prefix + * Get a {@link BitmapColumnIndex} corresponding to the values supplied in the specified range whose dictionary ids + * also match some predicate, such as to match a prefix. + * + * If the provided {@code} matcher is always true, it's better to use the other + * {@link #forRange(String, boolean, String, boolean)} method. */ BitmapColumnIndex forRange( @Nullable String startValue, diff --git a/processing/src/main/java/org/apache/druid/segment/column/StringValueSetIndex.java b/processing/src/main/java/org/apache/druid/segment/column/StringValueSetIndex.java index 72d3e63ce84..c53c1a9be98 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/StringValueSetIndex.java +++ b/processing/src/main/java/org/apache/druid/segment/column/StringValueSetIndex.java @@ -22,7 +22,7 @@ package org.apache.druid.segment.column; import org.apache.druid.collections.bitmap.ImmutableBitmap; import javax.annotation.Nullable; -import java.util.Set; +import java.util.SortedSet; /** * Index on individual values, and provides bitmaps for the rows which contain these values @@ -36,7 +36,8 @@ public interface StringValueSetIndex /** * Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the specified set of values (if they are - * contained in the underlying column) + * contained in the underlying column). The set must be sorted using + * {@link org.apache.druid.java.util.common.guava.Comparators#naturalNullsFirst()}. */ - BitmapColumnIndex forValues(Set values); + BitmapColumnIndex forSortedValues(SortedSet values); } diff --git a/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java b/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java new file mode 100644 index 00000000000..ef0d08ee0a3 --- /dev/null +++ b/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.druid.segment.column; + +import org.apache.druid.collections.bitmap.ImmutableBitmap; + +import java.nio.ByteBuffer; +import java.util.SortedSet; + +public interface Utf8ValueSetIndex +{ + /** + * Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the specified set of values (if they are + * contained in the underlying column). The set must be sorted using + * {@link org.apache.druid.java.util.common.ByteBufferUtils#unsignedComparator()}. + */ + BitmapColumnIndex forSortedValuesUtf8(SortedSet valuesUtf8); +} diff --git a/processing/src/main/java/org/apache/druid/segment/data/ConciseBitmapSerdeFactory.java b/processing/src/main/java/org/apache/druid/segment/data/ConciseBitmapSerdeFactory.java index b81c971b370..70a1fcc6cc5 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/ConciseBitmapSerdeFactory.java +++ b/processing/src/main/java/org/apache/druid/segment/data/ConciseBitmapSerdeFactory.java @@ -28,6 +28,7 @@ import org.apache.druid.extendedset.intset.ImmutableConciseSet; import java.nio.ByteBuffer; /** + * */ public class ConciseBitmapSerdeFactory implements BitmapSerdeFactory { @@ -70,6 +71,12 @@ public class ConciseBitmapSerdeFactory implements BitmapSerdeFactory return val.toBytes(); } + @Override + public boolean canCompare() + { + return false; + } + @Override public int compare(ImmutableBitmap o1, ImmutableBitmap o2) { diff --git a/processing/src/main/java/org/apache/druid/segment/data/DecompressingByteBufferObjectStrategy.java b/processing/src/main/java/org/apache/druid/segment/data/DecompressingByteBufferObjectStrategy.java index 8ae83ea7511..9c77d884b71 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/DecompressingByteBufferObjectStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/data/DecompressingByteBufferObjectStrategy.java @@ -70,6 +70,12 @@ public class DecompressingByteBufferObjectStrategy implements ObjectStrategy o1, ResourceHolder o2) { diff --git a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java index 982f7b07672..4fd6417f0fb 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java +++ b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java @@ -24,6 +24,7 @@ import org.apache.druid.collections.ResourceHolder; import org.apache.druid.common.config.NullHandling; import org.apache.druid.common.utils.SerializerUtils; import org.apache.druid.io.Channels; +import org.apache.druid.java.util.common.ByteBufferUtils; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.guava.Comparators; @@ -132,9 +133,9 @@ public class GenericIndexed implements CloseableIndexed, Serializer } @Override - public int compare(ByteBuffer o1, ByteBuffer o2) + public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2) { - return o1.compareTo(o2); + return ByteBufferUtils.unsignedComparator().compare(o1, o2); } }; @@ -217,7 +218,7 @@ public class GenericIndexed implements CloseableIndexed, Serializer public static GenericIndexed fromIterable(Iterable objectsIterable, ObjectStrategy strategy) { - return fromIterableVersionOne(objectsIterable, strategy, true, strategy); + return fromIterableVersionOne(objectsIterable, strategy, strategy.canCompare(), strategy); } static int getNumberOfFilesRequired(int bagSize, long numWritten) @@ -350,17 +351,13 @@ public class GenericIndexed implements CloseableIndexed, Serializer if (!allowReverseLookup) { throw new UnsupportedOperationException("Reverse lookup not allowed."); } - return indexOf(this, value); - } - private int indexOf(Indexed indexed, @Nullable T value) - { int minIndex = 0; int maxIndex = size - 1; while (minIndex <= maxIndex) { int currIndex = (minIndex + maxIndex) >>> 1; - T currValue = indexed.get(currIndex); + T currValue = get(currIndex); int comparison = strategy.compare(currValue, value); if (comparison == 0) { return currIndex; @@ -458,6 +455,9 @@ public class GenericIndexed implements CloseableIndexed, Serializer return sb.toString(); } + /** + * Single-threaded view. + */ abstract class BufferIndexed implements Indexed { int lastReadSize; @@ -468,8 +468,23 @@ public class GenericIndexed implements CloseableIndexed, Serializer return size; } + @Override + public T get(final int index) + { + final ByteBuffer buf = getByteBuffer(index); + if (buf == null) { + return null; + } + + // Traditionally, ObjectStrategy.fromByteBuffer() is given a buffer with limit set to capacity, and the + // actual limit is passed along as an extra parameter. + final int len = buf.remaining(); + buf.limit(buf.capacity()); + return strategy.fromByteBuffer(buf, len); + } + @Nullable - T bufferedIndexedGet(ByteBuffer copyValueBuffer, int startOffset, int endOffset) + ByteBuffer bufferedIndexedGetByteBuffer(ByteBuffer copyValueBuffer, int startOffset, int endOffset) { int size = endOffset - startOffset; // When size is 0 and SQL compatibility is enabled also check for null marker before returning null. @@ -481,14 +496,22 @@ public class GenericIndexed implements CloseableIndexed, Serializer lastReadSize = size; // ObjectStrategy.fromByteBuffer() is allowed to reset the limit of the buffer. So if the limit is changed, - // position() call in the next line could throw an exception, if the position is set beyond the new limit. clear() - // sets the limit to the maximum possible, the capacity. It is safe to reset the limit to capacity, because the - // value buffer(s) initial limit equals to capacity. - copyValueBuffer.clear(); + // position() call could throw an exception, if the position is set beyond the new limit. Calling limit() + // followed by position() is safe, because limit() resets position if needed. + copyValueBuffer.limit(endOffset); copyValueBuffer.position(startOffset); - return strategy.fromByteBuffer(copyValueBuffer, size); + return copyValueBuffer; } + /** + * Like {@link #get(int)}, but returns a {@link ByteBuffer} instead of using the {@link ObjectStrategy}. + * + * The returned ByteBuffer is reused by future calls. Callers must discard it before calling another method + * on this BufferedIndexed object that may want to reuse the buffer. + */ + @Nullable + protected abstract ByteBuffer getByteBuffer(int index); + /** * This method makes no guarantees with respect to thread safety * @@ -502,7 +525,41 @@ public class GenericIndexed implements CloseableIndexed, Serializer @Override public int indexOf(@Nullable T value) { - return GenericIndexed.this.indexOf(this, value); + if (!allowReverseLookup) { + throw new UnsupportedOperationException("Reverse lookup not allowed."); + } + + //noinspection ObjectEquality + final boolean isByteBufferStrategy = strategy == BYTE_BUFFER_STRATEGY; + + int minIndex = 0; + int maxIndex = size - 1; + while (minIndex <= maxIndex) { + int currIndex = (minIndex + maxIndex) >>> 1; + + int comparison; + + if (isByteBufferStrategy) { + // Specialization avoids ByteBuffer allocation in strategy.fromByteBuffer. + ByteBuffer currValue = getByteBuffer(currIndex); + comparison = ByteBufferUtils.compareByteBuffers(currValue, (ByteBuffer) value); + } else { + T currValue = get(currIndex); + comparison = strategy.compare(currValue, value); + } + + if (comparison == 0) { + return currIndex; + } + + if (comparison < 0) { + minIndex = currIndex + 1; + } else { + maxIndex = currIndex - 1; + } + } + + return -(minIndex + 1); } @Override @@ -625,8 +682,9 @@ public class GenericIndexed implements CloseableIndexed, Serializer final ByteBuffer copyBuffer = firstValueBuffer.asReadOnlyBuffer(); return new BufferIndexed() { + @Nullable @Override - public T get(final int index) + protected ByteBuffer getByteBuffer(final int index) { checkIndex(index); @@ -641,7 +699,7 @@ public class GenericIndexed implements CloseableIndexed, Serializer startOffset = headerBuffer.getInt(headerPosition) + Integer.BYTES; endOffset = headerBuffer.getInt(headerPosition + Integer.BYTES); } - return bufferedIndexedGet(copyBuffer, startOffset, endOffset); + return bufferedIndexedGetByteBuffer(copyBuffer, startOffset, endOffset); } @Override @@ -734,8 +792,9 @@ public class GenericIndexed implements CloseableIndexed, Serializer return new BufferIndexed() { + @Nullable @Override - public T get(final int index) + protected ByteBuffer getByteBuffer(int index) { checkIndex(index); @@ -753,7 +812,7 @@ public class GenericIndexed implements CloseableIndexed, Serializer endOffset = headerBuffer.getInt(headerPosition + Integer.BYTES); } int fileNum = index >> logBaseTwoOfElementsPerValueFile; - return bufferedIndexedGet(copyValueBuffers[fileNum], startOffset, endOffset); + return bufferedIndexedGetByteBuffer(copyValueBuffers[fileNum], startOffset, endOffset); } @Override diff --git a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexedWriter.java b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexedWriter.java index 6db18d0c128..449ed791f32 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexedWriter.java +++ b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexedWriter.java @@ -124,6 +124,12 @@ public class GenericIndexedWriter implements Serializer val.position(valPos); } + @Override + public boolean canCompare() + { + return false; + } + @Override public int compare(ByteBuffer o1, ByteBuffer o2) { diff --git a/processing/src/main/java/org/apache/druid/segment/data/ObjectStrategy.java b/processing/src/main/java/org/apache/druid/segment/data/ObjectStrategy.java index ed4410b8218..8a53fc57a7d 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/ObjectStrategy.java +++ b/processing/src/main/java/org/apache/druid/segment/data/ObjectStrategy.java @@ -51,6 +51,14 @@ public interface ObjectStrategy extends Comparator @Nullable byte[] toBytes(@Nullable T val); + /** + * Whether {@link #compare} is valid or not. + */ + default boolean canCompare() + { + return true; + } + /** * Reads 4-bytes numBytes from the given buffer, and then delegates to {@link #fromByteBuffer(ByteBuffer, int)}. */ diff --git a/processing/src/main/java/org/apache/druid/segment/data/RoaringBitmapSerdeFactory.java b/processing/src/main/java/org/apache/druid/segment/data/RoaringBitmapSerdeFactory.java index 92b58d91e99..035511981b4 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/RoaringBitmapSerdeFactory.java +++ b/processing/src/main/java/org/apache/druid/segment/data/RoaringBitmapSerdeFactory.java @@ -95,6 +95,12 @@ public class RoaringBitmapSerdeFactory implements BitmapSerdeFactory return val.toBytes(); } + @Override + public boolean canCompare() + { + return false; + } + @Override public int compare(ImmutableBitmap o1, ImmutableBitmap o2) { diff --git a/processing/src/main/java/org/apache/druid/segment/filter/LikeFilter.java b/processing/src/main/java/org/apache/druid/segment/filter/LikeFilter.java index 203c8e6a0e7..e4ffb7f428f 100644 --- a/processing/src/main/java/org/apache/druid/segment/filter/LikeFilter.java +++ b/processing/src/main/java/org/apache/druid/segment/filter/LikeFilter.java @@ -94,7 +94,12 @@ public class LikeFilter implements Filter if (rangeIndex != null) { final String lower = NullHandling.nullToEmptyIfNeeded(likeMatcher.getPrefix()); final String upper = NullHandling.nullToEmptyIfNeeded(likeMatcher.getPrefix()) + Character.MAX_VALUE; - return rangeIndex.forRange(lower, false, upper, false, likeMatcher::matchesSuffixOnly); + + if (likeMatcher.getSuffixMatch() == LikeDimFilter.LikeMatcher.SuffixMatch.MATCH_ANY) { + return rangeIndex.forRange(lower, false, upper, false); + } else { + return rangeIndex.forRange(lower, false, upper, false, likeMatcher::matchesSuffixOnly); + } } } diff --git a/processing/src/main/java/org/apache/druid/segment/join/Joinable.java b/processing/src/main/java/org/apache/druid/segment/join/Joinable.java index 25957f7e9a8..a09bac2f24a 100644 --- a/processing/src/main/java/org/apache/druid/segment/join/Joinable.java +++ b/processing/src/main/java/org/apache/druid/segment/join/Joinable.java @@ -71,10 +71,10 @@ public interface Joinable extends ReferenceCountedObject * @param condition join condition for the matcher * @param remainderNeeded whether or not {@link JoinMatcher#matchRemainder()} will ever be called on the * matcher. If we know it will not, additional optimizations are often possible. - * * @param descending true if join cursor is iterated in descending order * @param closer closer that will run after join cursor has completed to clean up any per query * resources the joinable uses + * * @return the matcher */ JoinMatcher makeJoinMatcher( @@ -89,6 +89,10 @@ public interface Joinable extends ReferenceCountedObject * Returns all nonnull values from a particular column if they are all unique, if there are "maxNumValues" or fewer, * and if the column exists and supports this operation. Otherwise, returns an empty Optional. * + * The returned set may be passed to {@link org.apache.druid.query.filter.InDimFilter}. For efficiency, + * implementations should prefer creating the returned set with + * {@code new TreeSet(Comparators.naturalNullsFirst()}}. This avoids a copy in the filter's constructor. + * * @param columnName name of the column * @param maxNumValues maximum number of values to return */ @@ -98,13 +102,18 @@ public interface Joinable extends ReferenceCountedObject * Searches a column from this Joinable for a particular value, finds rows that match, * and returns values of a second column for those rows. * - * @param searchColumnName Name of the search column. This is the column that is being used in the filter - * @param searchColumnValue Target value of the search column. This is the value that is being filtered on. - * @param retrievalColumnName The column to retrieve values from. This is the column that is being joined against. - * @param maxCorrelationSetSize Maximum number of values to retrieve. If we detect that more values would be - * returned than this limit, return absent. + * The returned set may be passed to {@link org.apache.druid.query.filter.InDimFilter}. For efficiency, + * implementations should prefer creating the returned set with + * {@code new TreeSet(Comparators.naturalNullsFirst()}}. This avoids a copy in the filter's constructor. + * + * @param searchColumnName Name of the search column. This is the column that is being used in the filter + * @param searchColumnValue Target value of the search column. This is the value that is being filtered on. + * @param retrievalColumnName The column to retrieve values from. This is the column that is being joined against. + * @param maxCorrelationSetSize Maximum number of values to retrieve. If we detect that more values would be + * returned than this limit, return absent. * @param allowNonKeyColumnSearch If true, allow searchs on non-key columns. If this is false, * a search on a non-key column returns absent. + * * @return The set of correlated column values. If we cannot determine correlated values, return absent. * * In case either the search or retrieval column names are not found, this will return absent. diff --git a/processing/src/main/java/org/apache/druid/segment/join/table/IndexedTableJoinable.java b/processing/src/main/java/org/apache/druid/segment/join/table/IndexedTableJoinable.java index c230b253fc2..840728c7849 100644 --- a/processing/src/main/java/org/apache/druid/segment/join/table/IndexedTableJoinable.java +++ b/processing/src/main/java/org/apache/druid/segment/join/table/IndexedTableJoinable.java @@ -33,7 +33,6 @@ import org.apache.druid.segment.join.Joinable; import javax.annotation.Nullable; import java.io.Closeable; import java.io.IOException; -import java.util.HashSet; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -104,10 +103,7 @@ public class IndexedTableJoinable implements Joinable try (final IndexedTable.Reader reader = table.columnReader(columnPosition)) { // Sorted set to encourage "in" filters that result from this method to do dictionary lookups in order. // The hopes are that this will improve locality and therefore improve performance. - // - // Note: we are using Comparators.naturalNullsFirst() because it prevents the need for lambda-wrapping in - // InDimFilter's "createStringPredicate" method. - final Set allValues = new TreeSet<>(Comparators.naturalNullsFirst()); + final Set allValues = createValuesSet(); for (int i = 0; i < table.numRows(); i++) { final String s = DimensionHandlerUtils.convertObjectToString(reader.read(i)); @@ -147,7 +143,7 @@ public class IndexedTableJoinable implements Joinable return Optional.empty(); } try (final Closer closer = Closer.create()) { - Set correlatedValues = new HashSet<>(); + Set correlatedValues = createValuesSet(); if (table.keyColumns().contains(searchColumnName)) { IndexedTable.Index index = table.columnIndex(filterColumnPosition); IndexedTable.Reader reader = table.columnReader(correlatedColumnPosition); @@ -196,4 +192,12 @@ public class IndexedTableJoinable implements Joinable { return table.acquireReferences(); } + + /** + * Create a Set that InDimFilter will accept without incurring a copy. + */ + private static Set createValuesSet() + { + return new TreeSet<>(Comparators.naturalNullsFirst()); + } } diff --git a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java index 286e3702cc1..bf1e66df355 100644 --- a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java +++ b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java @@ -365,6 +365,7 @@ public class DictionaryEncodedColumnPartSerde implements ColumnPartSerde new DictionaryEncodedStringIndexSupplier( bitmapSerdeFactory.getBitmapFactory(), rDictionary, + rDictionaryUtf8, rBitmaps, rSpatialIndex ), diff --git a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplier.java b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplier.java index dffd624d904..0a1ee09d516 100644 --- a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplier.java +++ b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplier.java @@ -20,6 +20,7 @@ package org.apache.druid.segment.serde; import com.google.common.base.Predicate; +import com.google.common.collect.Iterables; import it.unimi.dsi.fastutil.ints.IntIntImmutablePair; import it.unimi.dsi.fastutil.ints.IntIntPair; import it.unimi.dsi.fastutil.ints.IntIterator; @@ -27,6 +28,7 @@ import org.apache.druid.collections.bitmap.BitmapFactory; import org.apache.druid.collections.bitmap.ImmutableBitmap; import org.apache.druid.collections.spatial.ImmutableRTree; import org.apache.druid.common.config.NullHandling; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.BitmapResultFactory; import org.apache.druid.query.filter.DruidPredicateFactory; import org.apache.druid.segment.IntListUtils; @@ -39,13 +41,16 @@ import org.apache.druid.segment.column.LexicographicalRangeIndex; import org.apache.druid.segment.column.SimpleColumnIndexCapabilities; import org.apache.druid.segment.column.SpatialIndex; import org.apache.druid.segment.column.StringValueSetIndex; +import org.apache.druid.segment.column.Utf8ValueSetIndex; import org.apache.druid.segment.data.GenericIndexed; +import org.apache.druid.segment.data.Indexed; import org.apache.druid.segment.filter.Filters; import javax.annotation.Nullable; +import java.nio.ByteBuffer; import java.util.Iterator; import java.util.NoSuchElementException; -import java.util.Set; +import java.util.SortedSet; public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier { @@ -53,6 +58,7 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier private final BitmapFactory bitmapFactory; private final GenericIndexed dictionary; + private final GenericIndexed dictionaryUtf8; @Nullable private final GenericIndexed bitmaps; @Nullable @@ -61,29 +67,38 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier public DictionaryEncodedStringIndexSupplier( BitmapFactory bitmapFactory, GenericIndexed dictionary, - GenericIndexed bitmaps, - ImmutableRTree indexedTree + GenericIndexed dictionaryUtf8, + @Nullable GenericIndexed bitmaps, + @Nullable ImmutableRTree indexedTree ) { this.bitmapFactory = bitmapFactory; - this.bitmaps = bitmaps; this.dictionary = dictionary; + this.dictionaryUtf8 = dictionaryUtf8; + this.bitmaps = bitmaps; this.indexedTree = indexedTree; } @Nullable @Override + @SuppressWarnings("unchecked") public T as(Class clazz) { - if (clazz.equals(StringValueSetIndex.class)) { - return (T) new GenericIndexedDictionaryEncodedStringValueSetIndex(bitmapFactory, dictionary, bitmaps); - } else if (clazz.equals(DruidPredicateIndex.class)) { + if (bitmaps != null && clazz.equals(StringValueSetIndex.class)) { + return (T) new GenericIndexedDictionaryEncodedStringValueSetIndex(bitmapFactory, dictionaryUtf8, bitmaps); + } else if (bitmaps != null && clazz.equals(Utf8ValueSetIndex.class)) { + return (T) new GenericIndexedDictionaryEncodedStringValueSetIndex(bitmapFactory, dictionaryUtf8, bitmaps); + } else if (bitmaps != null && clazz.equals(DruidPredicateIndex.class)) { return (T) new GenericIndexedDictionaryEncodedStringDruidPredicateIndex(bitmapFactory, dictionary, bitmaps); - } else if (clazz.equals(LexicographicalRangeIndex.class)) { - return (T) new GenericIndexedDictionaryEncodedColumnLexicographicalRangeIndex(bitmapFactory, dictionary, bitmaps); - } else if (clazz.equals(DictionaryEncodedStringValueIndex.class)) { + } else if (bitmaps != null && clazz.equals(LexicographicalRangeIndex.class)) { + return (T) new GenericIndexedDictionaryEncodedColumnLexicographicalRangeIndex( + bitmapFactory, + dictionaryUtf8, + bitmaps + ); + } else if (bitmaps != null && clazz.equals(DictionaryEncodedStringValueIndex.class)) { return (T) new GenericIndexedDictionaryEncodedStringValueIndex(bitmapFactory, dictionary, bitmaps); - } else if (clazz.equals(SpatialIndex.class)) { + } else if (indexedTree != null && clazz.equals(SpatialIndex.class)) { return (T) (SpatialIndex) () -> indexedTree; } return null; @@ -98,21 +113,21 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier } } - private abstract static class BaseGenericIndexedDictionaryEncodedStringIndex + private abstract static class BaseGenericIndexedDictionaryEncodedIndex { protected final BitmapFactory bitmapFactory; - protected final GenericIndexed dictionary; - protected final GenericIndexed bitmaps; + protected final Indexed dictionary; + protected final Indexed bitmaps; - protected BaseGenericIndexedDictionaryEncodedStringIndex( + protected BaseGenericIndexedDictionaryEncodedIndex( BitmapFactory bitmapFactory, - GenericIndexed dictionary, + GenericIndexed dictionary, GenericIndexed bitmaps ) { this.bitmapFactory = bitmapFactory; - this.dictionary = dictionary; - this.bitmaps = bitmaps; + this.dictionary = dictionary.singleThreaded(); + this.bitmaps = bitmaps.singleThreaded(); } public ImmutableBitmap getBitmap(int idx) @@ -127,7 +142,7 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier } public static final class GenericIndexedDictionaryEncodedStringValueIndex - extends BaseGenericIndexedDictionaryEncodedStringIndex implements DictionaryEncodedStringValueIndex + extends BaseGenericIndexedDictionaryEncodedIndex implements DictionaryEncodedStringValueIndex { public GenericIndexedDictionaryEncodedStringValueIndex( BitmapFactory bitmapFactory, @@ -165,12 +180,13 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier } public static final class GenericIndexedDictionaryEncodedStringValueSetIndex - extends BaseGenericIndexedDictionaryEncodedStringIndex implements StringValueSetIndex + extends BaseGenericIndexedDictionaryEncodedIndex implements StringValueSetIndex, Utf8ValueSetIndex { + private static final int SIZE_WORTH_CHECKING_MIN = 8; public GenericIndexedDictionaryEncodedStringValueSetIndex( BitmapFactory bitmapFactory, - GenericIndexed dictionary, + GenericIndexed dictionary, GenericIndexed bitmaps ) { @@ -197,14 +213,43 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier private ImmutableBitmap getBitmapForValue() { - final int idx = dictionary.indexOf(value); + final ByteBuffer valueUtf8 = value == null ? null : ByteBuffer.wrap(StringUtils.toUtf8(value)); + final int idx = dictionary.indexOf(valueUtf8); return getBitmap(idx); } }; } @Override - public BitmapColumnIndex forValues(Set values) + public BitmapColumnIndex forSortedValues(SortedSet values) + { + return getBitmapColumnIndexForSortedIterableUtf8( + Iterables.transform( + values, + input -> ByteBuffer.wrap(StringUtils.toUtf8(input)) + ) + ); + } + + @Override + public BitmapColumnIndex forSortedValuesUtf8(SortedSet valuesUtf8) + { + final SortedSet tailSet; + + if (valuesUtf8.size() >= SIZE_WORTH_CHECKING_MIN) { + final ByteBuffer minValueInColumn = dictionary.get(0); + tailSet = valuesUtf8.tailSet(minValueInColumn); + } else { + tailSet = valuesUtf8; + } + + return getBitmapColumnIndexForSortedIterableUtf8(tailSet); + } + + /** + * Helper used by {@link #forSortedValues} and {@link #forSortedValuesUtf8}. + */ + private BitmapColumnIndex getBitmapColumnIndexForSortedIterableUtf8(Iterable valuesUtf8) { return new DictionaryEncodedStringBitmapColumnIndex() { @@ -222,9 +267,11 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier private Iterable getBitmapsIterable() { + final int dictionarySize = dictionary.size(); + return () -> new Iterator() { - final Iterator iterator = values.iterator(); + final Iterator iterator = valuesUtf8.iterator(); int next = -1; @Override @@ -253,8 +300,15 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier private void findNext() { while (next < 0 && iterator.hasNext()) { - String nextValue = iterator.next(); + ByteBuffer nextValue = iterator.next(); next = dictionary.indexOf(nextValue); + + if (next == -dictionarySize - 1) { + // nextValue is past the end of the dictionary. + // Note: we can rely on indexOf returning (-(insertion point) - 1), even though Indexed doesn't + // guarantee it, because "dictionary" comes from GenericIndexed singleThreaded(). + break; + } } } }; @@ -264,7 +318,7 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier } public static final class GenericIndexedDictionaryEncodedStringDruidPredicateIndex - extends BaseGenericIndexedDictionaryEncodedStringIndex implements DruidPredicateIndex + extends BaseGenericIndexedDictionaryEncodedIndex implements DruidPredicateIndex { public GenericIndexedDictionaryEncodedStringDruidPredicateIndex( BitmapFactory bitmapFactory, @@ -350,12 +404,12 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier } public static final class GenericIndexedDictionaryEncodedColumnLexicographicalRangeIndex - extends BaseGenericIndexedDictionaryEncodedStringIndex implements LexicographicalRangeIndex + extends BaseGenericIndexedDictionaryEncodedIndex implements LexicographicalRangeIndex { public GenericIndexedDictionaryEncodedColumnLexicographicalRangeIndex( BitmapFactory bitmapFactory, - GenericIndexed dictionary, + GenericIndexed dictionary, GenericIndexed bitmaps ) { @@ -447,7 +501,7 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier private int findNext() { - while (currIndex < end && !matcher.apply(dictionary.get(currIndex))) { + while (currIndex < end && !applyMatcher(dictionary.get(currIndex))) { currIndex++; } @@ -478,16 +532,32 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier } }; } + + private boolean applyMatcher(@Nullable final ByteBuffer valueUtf8) + { + if (valueUtf8 == null) { + return matcher.apply(null); + } else { + // Duplicate buffer, because StringUtils.fromUtf8 advances the position, and we do not want to do that. + return matcher.apply(StringUtils.fromUtf8(valueUtf8.duplicate())); + } + } }; } - private IntIntPair getRange(@Nullable String startValue, boolean startStrict, @Nullable String endValue, boolean endStrict) + private IntIntPair getRange( + @Nullable String startValue, + boolean startStrict, + @Nullable String endValue, + boolean endStrict + ) { int startIndex, endIndex; if (startValue == null) { startIndex = 0; } else { - final int found = dictionary.indexOf(NullHandling.emptyToNullIfNeeded(startValue)); + final String startValueToUse = NullHandling.emptyToNullIfNeeded(startValue); + final int found = dictionary.indexOf(StringUtils.toUtf8ByteBuffer(startValueToUse)); if (found >= 0) { startIndex = startStrict ? found + 1 : found; } else { @@ -498,7 +568,8 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier if (endValue == null) { endIndex = dictionary.size(); } else { - final int found = dictionary.indexOf(NullHandling.emptyToNullIfNeeded(endValue)); + final String endValueToUse = NullHandling.emptyToNullIfNeeded(endValue); + final int found = dictionary.indexOf(StringUtils.toUtf8ByteBuffer(endValueToUse)); if (found >= 0) { endIndex = endStrict ? found : found + 1; } else { diff --git a/processing/src/main/java/org/apache/druid/segment/virtual/ListFilteredVirtualColumn.java b/processing/src/main/java/org/apache/druid/segment/virtual/ListFilteredVirtualColumn.java index f6bb879b66c..032542efd1b 100644 --- a/processing/src/main/java/org/apache/druid/segment/virtual/ListFilteredVirtualColumn.java +++ b/processing/src/main/java/org/apache/druid/segment/virtual/ListFilteredVirtualColumn.java @@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; +import com.google.common.base.Predicates; import org.apache.druid.collections.bitmap.ImmutableBitmap; import org.apache.druid.common.config.NullHandling; import org.apache.druid.query.BitmapResultFactory; @@ -60,6 +61,7 @@ import java.util.List; import java.util.NoSuchElementException; import java.util.Objects; import java.util.Set; +import java.util.SortedSet; /** * {@link VirtualColumn} form of {@link ListFilteredDimensionSpec}, powered by @@ -397,7 +399,7 @@ public class ListFilteredVirtualColumn implements VirtualColumn } @Override - public BitmapColumnIndex forValues(Set values) + public BitmapColumnIndex forSortedValues(SortedSet values) { return new BaseVirtualIndex() { @@ -505,6 +507,17 @@ public class ListFilteredVirtualColumn implements VirtualColumn super(delegate, idMapping); } + @Override + public BitmapColumnIndex forRange( + @Nullable String startValue, + boolean startStrict, + @Nullable String endValue, + boolean endStrict + ) + { + return forRange(startValue, startStrict, endValue, endStrict, Predicates.alwaysTrue()); + } + @Override public BitmapColumnIndex forRange( @Nullable String startValue, diff --git a/processing/src/test/java/org/apache/druid/query/filter/InDimFilterTest.java b/processing/src/test/java/org/apache/druid/query/filter/InDimFilterTest.java index 0c933b67e91..ddcf9785289 100644 --- a/processing/src/test/java/org/apache/druid/query/filter/InDimFilterTest.java +++ b/processing/src/test/java/org/apache/druid/query/filter/InDimFilterTest.java @@ -31,11 +31,20 @@ import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.query.extraction.RegexDimExtractionFn; import org.apache.druid.segment.RowAdapters; import org.apache.druid.segment.RowBasedColumnSelectorFactory; +import org.apache.druid.segment.column.BitmapColumnIndex; +import org.apache.druid.segment.column.ColumnIndexSupplier; import org.apache.druid.segment.column.ColumnType; import org.apache.druid.segment.column.RowSignature; +import org.apache.druid.segment.column.StringValueSetIndex; +import org.apache.druid.segment.column.Utf8ValueSetIndex; import org.apache.druid.testing.InitializedNullHandlingTest; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.mockito.Mockito; +import org.mockito.junit.MockitoJUnit; +import org.mockito.junit.MockitoRule; +import org.mockito.quality.Strictness; import java.io.IOException; import java.util.Arrays; @@ -52,6 +61,9 @@ public class InDimFilterTest extends InitializedNullHandlingTest private final String serializedFilter = "{\"type\":\"in\",\"dimension\":\"dimTest\",\"values\":[\"bad\",\"good\"]}"; + @Rule + public MockitoRule mockitoRule = MockitoJUnit.rule().strictness(Strictness.STRICT_STUBS); + @Test public void testDeserialization() throws IOException { @@ -71,21 +83,23 @@ public class InDimFilterTest extends InitializedNullHandlingTest @Test public void testGetValuesWithValuesSetOfNonEmptyStringsUseTheGivenSet() { - final Set values = ImmutableSet.of("v1", "v2", "v3"); - final InDimFilter filter = new InDimFilter("dim", values, null, null); + final Set values = new InDimFilter.ValuesSet(); + values.addAll(Arrays.asList("v1", "v2", "v3")); + final InDimFilter filter = new InDimFilter("dim", values); Assert.assertSame(values, filter.getValues()); } @Test public void testGetValuesWithValuesSetIncludingEmptyString() { - final Set values = Sets.newHashSet("v1", "", "v3"); - final InDimFilter filter = new InDimFilter("dim", values, null, null); + final InDimFilter.ValuesSet values = new InDimFilter.ValuesSet(ImmutableSet.of("v1", "", "v3")); + final InDimFilter filter = new InDimFilter("dim", values); if (NullHandling.replaceWithDefault()) { - Assert.assertNotSame(values, filter.getValues()); + Assert.assertSame(values, filter.getValues()); Assert.assertEquals(Sets.newHashSet("v1", null, "v3"), filter.getValues()); } else { Assert.assertSame(values, filter.getValues()); + Assert.assertEquals(Sets.newHashSet("v1", "", "v3"), filter.getValues()); } } @@ -234,4 +248,53 @@ public class InDimFilterTest extends InitializedNullHandlingTest // Now it *shouldn't* match. Assert.assertFalse(matcher.matches()); } + + @Test + public void testUsesUtf8SetIndex() + { + // An implementation test. + // This test confirms that "in" filters use utf8 index lookups when available. + + final Filter inFilter = new InDimFilter("dim0", ImmutableSet.of("v1", "v2")).toFilter(); + + final ColumnIndexSelector indexSelector = Mockito.mock(ColumnIndexSelector.class); + final ColumnIndexSupplier indexSupplier = Mockito.mock(ColumnIndexSupplier.class); + final Utf8ValueSetIndex valueIndex = Mockito.mock(Utf8ValueSetIndex.class); + final BitmapColumnIndex bitmapColumnIndex = Mockito.mock(BitmapColumnIndex.class); + + final InDimFilter.ValuesSet expectedValuesSet = new InDimFilter.ValuesSet(); + expectedValuesSet.addAll(Arrays.asList("v1", "v2")); + + Mockito.when(indexSelector.getIndexSupplier("dim0")).thenReturn(indexSupplier); + Mockito.when(indexSupplier.as(Utf8ValueSetIndex.class)).thenReturn(valueIndex); + Mockito.when(valueIndex.forSortedValuesUtf8(expectedValuesSet.toUtf8())).thenReturn(bitmapColumnIndex); + + final BitmapColumnIndex retVal = inFilter.getBitmapColumnIndex(indexSelector); + Assert.assertSame("inFilter returns the intended bitmapColumnIndex", bitmapColumnIndex, retVal); + } + + @Test + public void testUsesStringSetIndex() + { + // An implementation test. + // This test confirms that "in" filters use non-utf8 string index lookups when utf8 indexes are not available. + + final Filter inFilter = new InDimFilter("dim0", ImmutableSet.of("v1", "v2")).toFilter(); + + final ColumnIndexSelector indexSelector = Mockito.mock(ColumnIndexSelector.class); + final ColumnIndexSupplier indexSupplier = Mockito.mock(ColumnIndexSupplier.class); + final StringValueSetIndex valueIndex = Mockito.mock(StringValueSetIndex.class); + final BitmapColumnIndex bitmapColumnIndex = Mockito.mock(BitmapColumnIndex.class); + + final InDimFilter.ValuesSet expectedValuesSet = new InDimFilter.ValuesSet(); + expectedValuesSet.addAll(Arrays.asList("v1", "v2")); + + Mockito.when(indexSelector.getIndexSupplier("dim0")).thenReturn(indexSupplier); + Mockito.when(indexSupplier.as(Utf8ValueSetIndex.class)).thenReturn(null); // Will check for UTF-8 first. + Mockito.when(indexSupplier.as(StringValueSetIndex.class)).thenReturn(valueIndex); + Mockito.when(valueIndex.forSortedValues(expectedValuesSet)).thenReturn(bitmapColumnIndex); + + final BitmapColumnIndex retVal = inFilter.getBitmapColumnIndex(indexSelector); + Assert.assertSame("inFilter returns the intended bitmapColumnIndex", bitmapColumnIndex, retVal); + } } diff --git a/processing/src/test/java/org/apache/druid/query/filter/LikeDimFilterTest.java b/processing/src/test/java/org/apache/druid/query/filter/LikeDimFilterTest.java index ef1a97005cb..c51ec817b73 100644 --- a/processing/src/test/java/org/apache/druid/query/filter/LikeDimFilterTest.java +++ b/processing/src/test/java/org/apache/druid/query/filter/LikeDimFilterTest.java @@ -24,15 +24,27 @@ import com.google.common.collect.Sets; import nl.jqno.equalsverifier.EqualsVerifier; import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.query.extraction.SubstringDimExtractionFn; +import org.apache.druid.segment.column.BitmapColumnIndex; +import org.apache.druid.segment.column.ColumnIndexSupplier; +import org.apache.druid.segment.column.LexicographicalRangeIndex; +import org.apache.druid.segment.column.StringValueSetIndex; import org.apache.druid.testing.InitializedNullHandlingTest; import org.junit.Assert; +import org.junit.Rule; import org.junit.Test; +import org.mockito.Mockito; +import org.mockito.junit.MockitoJUnit; +import org.mockito.junit.MockitoRule; +import org.mockito.quality.Strictness; import java.io.IOException; import java.util.Arrays; public class LikeDimFilterTest extends InitializedNullHandlingTest { + @Rule + public MockitoRule mockitoRule = MockitoJUnit.rule().strictness(Strictness.STRICT_STUBS); + @Test public void testSerde() throws IOException { @@ -88,4 +100,49 @@ public class LikeDimFilterTest extends InitializedNullHandlingTest .withNonnullFields("suffixMatch", "prefix", "pattern") .verify(); } + + @Test + public void testPrefixMatchUsesRangeIndex() + { + // An implementation test. + // This test confirms that "like" filters with prefix matchers use index-range lookups without matcher predicates. + + final Filter likeFilter = new LikeDimFilter("dim0", "f%", null, null, null).toFilter(); + + final ColumnIndexSelector indexSelector = Mockito.mock(ColumnIndexSelector.class); + final ColumnIndexSupplier indexSupplier = Mockito.mock(ColumnIndexSupplier.class); + final LexicographicalRangeIndex rangeIndex = Mockito.mock(LexicographicalRangeIndex.class); + final BitmapColumnIndex bitmapColumnIndex = Mockito.mock(BitmapColumnIndex.class); + + Mockito.when(indexSelector.getIndexSupplier("dim0")).thenReturn(indexSupplier); + Mockito.when(indexSupplier.as(LexicographicalRangeIndex.class)).thenReturn(rangeIndex); + Mockito.when( + // Verify that likeFilter uses forRange without a matcher predicate; it's unnecessary and slows things down + rangeIndex.forRange("f", false, "f" + Character.MAX_VALUE, false) + ).thenReturn(bitmapColumnIndex); + + final BitmapColumnIndex retVal = likeFilter.getBitmapColumnIndex(indexSelector); + Assert.assertSame("likeFilter returns the intended bitmapColumnIndex", bitmapColumnIndex, retVal); + } + + @Test + public void testExactMatchUsesValueIndex() + { + // An implementation test. + // This test confirms that "like" filters with exact matchers use index lookups. + + final Filter likeFilter = new LikeDimFilter("dim0", "f", null, null, null).toFilter(); + + final ColumnIndexSelector indexSelector = Mockito.mock(ColumnIndexSelector.class); + final ColumnIndexSupplier indexSupplier = Mockito.mock(ColumnIndexSupplier.class); + final StringValueSetIndex valueIndex = Mockito.mock(StringValueSetIndex.class); + final BitmapColumnIndex bitmapColumnIndex = Mockito.mock(BitmapColumnIndex.class); + + Mockito.when(indexSelector.getIndexSupplier("dim0")).thenReturn(indexSupplier); + Mockito.when(indexSupplier.as(StringValueSetIndex.class)).thenReturn(valueIndex); + Mockito.when(valueIndex.forValue("f")).thenReturn(bitmapColumnIndex); + + final BitmapColumnIndex retVal = likeFilter.getBitmapColumnIndex(indexSelector); + Assert.assertSame("likeFilter returns the intended bitmapColumnIndex", bitmapColumnIndex, retVal); + } } diff --git a/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java b/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java index c37996d0351..268a4a2c971 100644 --- a/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java +++ b/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java @@ -26,6 +26,7 @@ import org.apache.druid.collections.bitmap.ConciseBitmapFactory; import org.apache.druid.collections.bitmap.ImmutableBitmap; import org.apache.druid.collections.bitmap.MutableBitmap; import org.apache.druid.collections.bitmap.RoaringBitmapFactory; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.extraction.DimExtractionFn; import org.apache.druid.query.extraction.ExtractionFn; import org.apache.druid.query.filter.ColumnIndexSelector; @@ -49,6 +50,7 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import javax.annotation.Nullable; +import java.nio.ByteBuffer; import java.util.Collections; import java.util.Map; @@ -117,6 +119,10 @@ public class ExtractionDimFilterTest extends InitializedNullHandlingTest return new DictionaryEncodedStringIndexSupplier( factory, GenericIndexed.fromIterable(Collections.singletonList("foo1"), GenericIndexed.STRING_STRATEGY), + GenericIndexed.fromIterable( + Collections.singletonList(ByteBuffer.wrap(StringUtils.toUtf8("foo1"))), + GenericIndexed.BYTE_BUFFER_STRATEGY + ), GenericIndexed.fromIterable(Collections.singletonList(foo1BitMap), serdeFactory.getObjectStrategy()), null ); diff --git a/processing/src/test/java/org/apache/druid/segment/filter/InFilterTest.java b/processing/src/test/java/org/apache/druid/segment/filter/InFilterTest.java index 0d02cd9b1f7..7981fdf3d52 100644 --- a/processing/src/test/java/org/apache/druid/segment/filter/InFilterTest.java +++ b/processing/src/test/java/org/apache/druid/segment/filter/InFilterTest.java @@ -386,7 +386,7 @@ public class InFilterTest extends BaseFilterTest EqualsVerifier.forClass(InDimFilter.class) .usingGetClass() .withNonnullFields("dimension", "values") - .withIgnoredFields("cacheKeySupplier", "predicateFactory", "cachedOptimizedFilter") + .withIgnoredFields("cacheKeySupplier", "predicateFactory", "cachedOptimizedFilter", "valuesUtf8") .verify(); } diff --git a/processing/src/test/java/org/apache/druid/segment/join/JoinFilterAnalyzerTest.java b/processing/src/test/java/org/apache/druid/segment/join/JoinFilterAnalyzerTest.java index 813db7d4cda..b6fd9f4f0e0 100644 --- a/processing/src/test/java/org/apache/druid/segment/join/JoinFilterAnalyzerTest.java +++ b/processing/src/test/java/org/apache/druid/segment/join/JoinFilterAnalyzerTest.java @@ -242,7 +242,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#en.wikipedia"), - new InDimFilter("countryIsoCode", ImmutableSet.of("US"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("US")).toFilter() ) ), new SelectorFilter("rtc.countryName", "United States"), @@ -649,7 +649,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes )) ) ), - new InDimFilter("countryIsoCode", ImmutableSet.of("CA"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("CA")).toFilter() ) ), new AndFilter( @@ -751,7 +751,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#en.wikipedia"), - new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("SU"), null, null).toFilter() + new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("SU")).toFilter() ) ), new SelectorFilter("rtc.countryName", "States United"), @@ -898,7 +898,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new OrFilter( ImmutableList.of( new SelectorFilter("channel", "#ko.wikipedia"), - new InDimFilter("countryIsoCode", ImmutableSet.of("US"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("US")).toFilter() ) ), new OrFilter( @@ -906,8 +906,8 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new SelectorFilter("channel", "#ko.wikipedia"), new AndFilter( ImmutableList.of( - new InDimFilter("countryIsoCode", ImmutableSet.of("US"), null, null).toFilter(), - new InDimFilter("regionIsoCode", ImmutableSet.of("VA"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("US")).toFilter(), + new InDimFilter("regionIsoCode", ImmutableSet.of("VA")).toFilter() ) ) ) @@ -1004,7 +1004,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#en.wikipedia"), - new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("USCA"), null, null).toFilter() + new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("USCA")).toFilter() ) ), new SelectorFilter("c1.countryName", "Usca"), @@ -1094,7 +1094,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#en.wikipedia"), - new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("USCA"), null, null).toFilter() + new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("USCA")).toFilter() ) ), new SelectorFilter("c1.v", "Usca"), @@ -1164,7 +1164,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#de.wikipedia"), - new InDimFilter("countryIsoCode", ImmutableSet.of("DE"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("DE")).toFilter() ) ), new SelectorFilter(FACT_TO_COUNTRY_ON_ISO_CODE_PREFIX + "countryName", "Germany"), @@ -1221,7 +1221,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#de.wikipedia"), - new InDimFilter("countryIsoCode", ImmutableSet.of("DE"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("DE")).toFilter() ) ), new SelectorFilter(FACT_TO_COUNTRY_ON_ISO_CODE_PREFIX + "v", "Germany"), @@ -1453,7 +1453,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#en.wikipedia"), - new InDimFilter("countryNumber", ImmutableSet.of("0"), null, null).toFilter() + new InDimFilter("countryNumber", ImmutableSet.of("0")).toFilter() ) ), new SelectorFilter(FACT_TO_COUNTRY_ON_NUMBER_PREFIX + "countryName", "Australia"), @@ -1516,7 +1516,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#en.wikipedia"), - new InDimFilter("countryNumber", ImmutableSet.of("0"), null, null).toFilter() + new InDimFilter("countryNumber", ImmutableSet.of("0")).toFilter() ) ), new SelectorFilter(FACT_TO_COUNTRY_ON_NUMBER_PREFIX + "v", "Australia"), @@ -1683,7 +1683,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#es.wikipedia"), - new InDimFilter("countryIsoCode", ImmutableSet.of("SV"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("SV")).toFilter() ) ), new SelectorFilter(FACT_TO_COUNTRY_ON_ISO_CODE_PREFIX + "countryName", "El Salvador"), @@ -1740,7 +1740,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new AndFilter( ImmutableList.of( new SelectorFilter("channel", "#es.wikipedia"), - new InDimFilter("countryIsoCode", ImmutableSet.of("SV"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("SV")).toFilter() ) ), new SelectorFilter(FACT_TO_COUNTRY_ON_ISO_CODE_PREFIX + "v", "El Salvador"), @@ -1916,8 +1916,8 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes JoinFilterSplit expectedFilterSplit = new JoinFilterSplit( new AndFilter( ImmutableList.of( - new InDimFilter("countryIsoCode", ImmutableSet.of("MMMM"), null, null).toFilter(), - new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("MMMM")).toFilter(), + new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM")).toFilter() ) ), new SelectorFilter("r1.regionName", "Fourems Province"), @@ -1987,7 +1987,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes JoinFilterSplit expectedFilterSplit = new JoinFilterSplit( new OrFilter( ImmutableList.of( - new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM"), null, null).toFilter(), + new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM")).toFilter(), new SelectorFilter("regionIsoCode", "AAAA") ) ), @@ -2288,7 +2288,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes null )), expressionFilter, - new InDimFilter("rtc.countryIsoCode", ImmutableSet.of("CA", "CA2", "CA3"), null, null).toFilter(), + new InDimFilter("rtc.countryIsoCode", ImmutableSet.of("CA", "CA2", "CA3")).toFilter(), new OrFilter( ImmutableList.of( new SelectorFilter("channel", "#fr.wikipedia"), @@ -2414,7 +2414,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes ImmutableList.of( new SelectorFilter(rewrittenRegionIsoCodeColumnName, "ON"), new SelectorFilter(rewrittenCountryIsoCodeColumnName, "CA"), - new InDimFilter(rewrittenCountryIsoCodeColumnName, ImmutableSet.of("CA"), null, null).toFilter(), + new InDimFilter(rewrittenCountryIsoCodeColumnName, ImmutableSet.of("CA")).toFilter(), new BoundFilter(new BoundDimFilter( rewrittenCountryIsoCodeColumnName, "CA", @@ -2427,9 +2427,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes )), new InDimFilter( rewrittenCountryIsoCodeColumnName, - ImmutableSet.of("CA", "CA2", "CA3"), - null, - null + ImmutableSet.of("CA", "CA2", "CA3") ).toFilter(), new OrFilter( ImmutableList.of( @@ -2453,9 +2451,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes new SelectorFilter(rewrittenCountryIsoCodeColumnName, "ABCDEF"), new InDimFilter( rewrittenCountryIsoCodeColumnName, - ImmutableSet.of("CA"), - null, - null + ImmutableSet.of("CA") ).toFilter(), new BoundFilter(new BoundDimFilter( rewrittenCountryIsoCodeColumnName, @@ -2549,10 +2545,10 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes JoinFilterSplit expectedFilterSplit = new JoinFilterSplit( new AndFilter( ImmutableList.of( - new InDimFilter("countryIsoCode", ImmutableSet.of("US"), null, null).toFilter(), - new InDimFilter("regionIsoCode", ImmutableSet.of("CA"), null, null).toFilter(), - new InDimFilter("countryIsoCode", ImmutableSet.of("MMMM", "AAAA"), null, null).toFilter(), - new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM", "AAAA"), null, null).toFilter() + new InDimFilter("countryIsoCode", ImmutableSet.of("US")).toFilter(), + new InDimFilter("regionIsoCode", ImmutableSet.of("CA")).toFilter(), + new InDimFilter("countryIsoCode", ImmutableSet.of("MMMM", "AAAA")).toFilter(), + new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM", "AAAA")).toFilter() ) ), originalFilter, diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/expression/builtin/ArrayOverlapOperatorConversion.java b/sql/src/main/java/org/apache/druid/sql/calcite/expression/builtin/ArrayOverlapOperatorConversion.java index 78edea3a617..dbb5ab4fbfa 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/expression/builtin/ArrayOverlapOperatorConversion.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/expression/builtin/ArrayOverlapOperatorConversion.java @@ -19,7 +19,6 @@ package org.apache.druid.sql.calcite.expression.builtin; -import com.google.common.collect.Sets; import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexNode; import org.apache.calcite.sql.SqlFunction; @@ -40,6 +39,7 @@ import org.apache.druid.sql.calcite.planner.PlannerContext; import org.apache.druid.sql.calcite.rel.VirtualColumnRegistry; import javax.annotation.Nullable; +import java.util.Arrays; import java.util.List; public class ArrayOverlapOperatorConversion extends BaseExpressionDimFilterOperatorConversion @@ -123,7 +123,7 @@ public class ArrayOverlapOperatorConversion extends BaseExpressionDimFilterOpera } else { return new InDimFilter( simpleExtractionExpr.getSimpleExtraction().getColumn(), - Sets.newHashSet(arrayElements), + new InDimFilter.ValuesSet(Arrays.asList(arrayElements)), simpleExtractionExpr.getSimpleExtraction().getExtractionFn(), null ); diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/filtration/ConvertSelectorsToIns.java b/sql/src/main/java/org/apache/druid/sql/calcite/filtration/ConvertSelectorsToIns.java index 631fd937766..29ea3712989 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/filtration/ConvertSelectorsToIns.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/filtration/ConvertSelectorsToIns.java @@ -20,7 +20,6 @@ package org.apache.druid.sql.calcite.filtration; import com.google.common.collect.Lists; -import com.google.common.collect.Sets; import org.apache.druid.java.util.common.ISE; import org.apache.druid.query.filter.DimFilter; import org.apache.druid.query.filter.InDimFilter; @@ -34,7 +33,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Set; public class ConvertSelectorsToIns extends BottomUpTransform { @@ -80,7 +78,7 @@ public class ConvertSelectorsToIns extends BottomUpTransform final List filterList = entry.getValue(); if (filterList.size() > 1) { // We found a simplification. Remove the old filters and add new ones. - final Set values = Sets.newHashSetWithExpectedSize(filterList.size()); + final InDimFilter.ValuesSet values = new InDimFilter.ValuesSet(); for (final SelectorDimFilter selector : filterList) { values.add(selector.getValue());