Direct UTF-8 access for "in" filters. (#12517)

* Direct UTF-8 access for "in" filters.

Directly related:

1) InDimFilter: Store stored Strings (in ValuesSet) plus sorted UTF-8
   ByteBuffers (in valuesUtf8). Use valuesUtf8 whenever possible. If
   necessary, the input set is copied into a ValuesSet. Much logic is
   simplified, because we always know what type the values set will be.
   I think that there won't even be an efficiency loss in most cases.
   InDimFilter is most frequently created by deserialization, and this
   patch updates the JsonCreator constructor to deserialize
   directly into a ValuesSet.

2) Add Utf8ValueSetIndex, which InDimFilter uses to avoid UTF-8 decodes
   during index lookups.

3) Add unsigned comparator to ByteBufferUtils and use it in
   GenericIndexed.BYTE_BUFFER_STRATEGY. This is important because UTF-8
   bytes can be compared as bytes if, and only if, the comparison
   is unsigned.

4) Add specialization to GenericIndexed.singleThreaded().indexOf that
   avoids needless ByteBuffer allocations.

5) Clarify that objects returned by ColumnIndexSupplier.as are not
   thread-safe. DictionaryEncodedStringIndexSupplier now calls
   singleThreaded() on all relevant GenericIndexed objects, saving
   a ByteBuffer allocation per access.

Also:

1) Fix performance regression in LikeFilter: since #12315, it applied
   the suffix matcher to all values in range even for type MATCH_ALL.

2) Add ObjectStrategy.canCompare() method. This fixes LikeFilterBenchmark,
   which was broken due to calls to strategy.compare in
   GenericIndexed.fromIterable.

* Add like-filter implementation tests.

* Add in-filter implementation tests.

* Add tests, fix issues.

* Fix style.

* Adjustments from review.
This commit is contained in:
Gian Merlino 2022-05-20 01:51:28 -07:00 committed by GitHub
parent a235aca2b3
commit 69aac6c8dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
37 changed files with 888 additions and 220 deletions

View File

@ -27,6 +27,7 @@ import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.collections.bitmap.RoaringBitmapFactory;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.extendedset.intset.ConciseSetUtils;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.filter.BoundDimFilter;
import org.apache.druid.query.filter.ColumnIndexSelector;
import org.apache.druid.query.ordering.StringComparators;
@ -48,6 +49,7 @@ import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
@ -160,7 +162,7 @@ public class BoundFilterBenchmark
final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null);
final List<Integer> ints = generateInts();
final GenericIndexed<String> dictionary = GenericIndexed.fromIterable(
FluentIterable.from(ints).transform(i -> i.toString()),
FluentIterable.from(ints).transform(Object::toString),
GenericIndexed.STRING_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
@ -174,9 +176,14 @@ public class BoundFilterBenchmark
),
serdeFactory.getObjectStrategy()
);
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
);
selector = new MockColumnIndexSelector(
bitmapFactory,
new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, bitmaps, null)
new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null)
);
}

View File

@ -19,7 +19,6 @@
package org.apache.druid.benchmark;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
@ -28,6 +27,7 @@ import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.collections.bitmap.RoaringBitmapFactory;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.filter.ColumnIndexSelector;
import org.apache.druid.query.filter.DruidDoublePredicate;
import org.apache.druid.query.filter.DruidFloatPredicate;
@ -51,6 +51,7 @@ import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
@ -123,18 +124,14 @@ public class DimensionPredicateFilterBenchmark
final List<Integer> ints = generateInts();
final GenericIndexed<String> dictionary = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(
new Function<Integer, String>()
{
@Override
public String apply(Integer i)
{
return i.toString();
}
}
),
.transform(Object::toString),
GenericIndexed.STRING_STRATEGY
);
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(
@ -148,7 +145,7 @@ public class DimensionPredicateFilterBenchmark
);
selector = new MockColumnIndexSelector(
bitmapFactory,
new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, bitmaps, null)
new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null)
);
}

View File

@ -0,0 +1,133 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.benchmark;
import com.google.common.collect.FluentIterable;
import org.apache.druid.collections.bitmap.BitmapFactory;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.collections.bitmap.RoaringBitmapFactory;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.filter.ColumnIndexSelector;
import org.apache.druid.query.filter.InDimFilter;
import org.apache.druid.segment.data.BitmapSerdeFactory;
import org.apache.druid.segment.data.GenericIndexed;
import org.apache.druid.segment.data.RoaringBitmapSerdeFactory;
import org.apache.druid.segment.filter.Filters;
import org.apache.druid.segment.serde.DictionaryEncodedStringIndexSupplier;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import java.nio.ByteBuffer;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@State(Scope.Benchmark)
@Fork(value = 1)
@Warmup(iterations = 10)
@Measurement(iterations = 10)
public class InFilterBenchmark
{
static {
NullHandling.initializeForTests();
}
private static final int START_INT = 10_000_000;
private InDimFilter inFilter;
// cardinality of the dictionary. it will contain this many ints (as strings, of course), starting at START_INT,
// even numbers only.
@Param({"1000000"})
int dictionarySize;
// cardinality of the "in" filter. half of its values will be in the dictionary, half will not.
@Param({"10000"})
int filterSize;
// selector will contain a "dictionarySize" number of bitmaps; each one contains a single int.
// this benchmark is not about bitmap union speed, so no need for that part to be realistic.
ColumnIndexSelector selector;
@Setup
public void setup()
{
final BitmapFactory bitmapFactory = new RoaringBitmapFactory();
final BitmapSerdeFactory serdeFactory = new RoaringBitmapSerdeFactory(null);
final Iterable<Integer> ints = intGenerator();
final GenericIndexed<String> dictionary = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(Object::toString),
GenericIndexed.STRING_STRATEGY
);
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
() -> IntStream.range(0, dictionarySize)
.mapToObj(
i -> {
final MutableBitmap mutableBitmap = bitmapFactory.makeEmptyMutableBitmap();
mutableBitmap.add(i);
return bitmapFactory.makeImmutableBitmap(mutableBitmap);
}
)
.iterator(),
serdeFactory.getObjectStrategy()
);
selector = new MockColumnIndexSelector(
bitmapFactory,
new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null)
);
inFilter = new InDimFilter(
"dummy",
IntStream.range(START_INT, START_INT + filterSize).mapToObj(String::valueOf).collect(Collectors.toSet())
);
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public void doFilter(Blackhole blackhole)
{
final ImmutableBitmap bitmapIndex = Filters.computeDefaultBitmapResults(inFilter, selector);
blackhole.consume(bitmapIndex);
}
private Iterable<Integer> intGenerator()
{
// i * 2 => half of these values will be present in the inFilter, half won't.
return () -> IntStream.range(0, dictionarySize).map(i -> START_INT + i * 2).boxed().iterator();
}
}

View File

@ -25,6 +25,7 @@ import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.collections.bitmap.RoaringBitmapFactory;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.filter.BoundDimFilter;
import org.apache.druid.query.filter.ColumnIndexSelector;
import org.apache.druid.query.filter.Filter;
@ -50,6 +51,7 @@ import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
@ -122,11 +124,14 @@ public class LikeFilterBenchmark
final List<Integer> ints = generateInts();
final GenericIndexed<String> dictionary = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(
i -> i.toString()
),
.transform(Object::toString),
GenericIndexed.STRING_STRATEGY
);
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(
@ -140,7 +145,7 @@ public class LikeFilterBenchmark
);
selector = new MockColumnIndexSelector(
bitmapFactory,
new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, bitmaps, null)
new DictionaryEncodedStringIndexSupplier(bitmapFactory, dictionary, dictionaryUtf8, bitmaps, null)
);
}

View File

@ -23,12 +23,14 @@ import org.apache.druid.collections.ResourceHolder;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.utils.JvmUtils;
import javax.annotation.Nullable;
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.lang.reflect.Method;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.util.Comparator;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicBoolean;
@ -48,6 +50,8 @@ public class ByteBufferUtils
// null if unmap is supported
private static final RuntimeException UNMAP_NOT_SUPPORTED_EXCEPTION;
private static final Comparator<ByteBuffer> COMPARATOR_UNSIGNED = new UnsignedByteBufferComparator();
static {
Object unmap = null;
RuntimeException exception = null;
@ -211,4 +215,79 @@ public class ByteBufferUtils
{
free(buffer);
}
/**
* Compares two ByteBuffer ranges using unsigned byte ordering.
*
* Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
*/
public static int compareByteBuffers(
final ByteBuffer buf1,
final int position1,
final int length1,
final ByteBuffer buf2,
final int position2,
final int length2
)
{
final int commonLength = Math.min(length1, length2);
for (int i = 0; i < commonLength; i++) {
final byte byte1 = buf1.get(position1 + i);
final byte byte2 = buf2.get(position2 + i);
final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
if (cmp != 0) {
return cmp;
}
}
return Integer.compare(length1, length2);
}
/**
* Compares two ByteBuffers from their positions to their limits using unsigned byte ordering. Accepts null
* buffers, which are ordered earlier than any nonnull buffer.
*
* Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
*/
public static int compareByteBuffers(
@Nullable final ByteBuffer buf1,
@Nullable final ByteBuffer buf2
)
{
if (buf1 == null) {
return buf2 == null ? 0 : -1;
}
if (buf2 == null) {
return 1;
}
return ByteBufferUtils.compareByteBuffers(
buf1,
buf1.position(),
buf1.remaining(),
buf2,
buf2.position(),
buf2.remaining()
);
}
/**
* Comparator that compares two {@link ByteBuffer} using unsigned ordering. Null buffers are accepted, and
* are ordered earlier than any nonnull buffer.
*/
public static Comparator<ByteBuffer> unsignedComparator()
{
return COMPARATOR_UNSIGNED;
}
private static class UnsignedByteBufferComparator implements Comparator<ByteBuffer>
{
@Override
public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
{
return ByteBufferUtils.compareByteBuffers(o1, o2);
}
}
}

View File

@ -108,6 +108,11 @@ public class StringUtils
return StringUtils.fromUtf8(buffer, buffer.remaining());
}
/**
* Converts a string to a UTF-8 byte array.
*
* @throws NullPointerException if "string" is null
*/
public static byte[] toUtf8(final String string)
{
try {
@ -119,6 +124,16 @@ public class StringUtils
}
}
/**
* Converts a string to UTF-8 bytes, returning them as a newly-allocated on-heap {@link ByteBuffer}.
* If "string" is null, returns null.
*/
@Nullable
public static ByteBuffer toUtf8ByteBuffer(@Nullable final String string)
{
return string == null ? null : ByteBuffer.wrap(toUtf8(string));
}
/**
* Encodes "string" into the buffer "byteBuffer", using no more than the number of bytes remaining in the buffer.
* Will only encode whole characters. The byteBuffer's position and limit may be changed during operation, but will

View File

@ -23,7 +23,10 @@ import org.apache.druid.java.util.common.StringUtils;
import org.junit.Assert;
import org.junit.Test;
import java.nio.ByteBuffer;
/**
*
*/
public class StringUtilsTest
{
@ -76,4 +79,12 @@ public class StringUtilsTest
Assert.assertEquals(4, StringUtils.estimatedBinaryLengthAsUTF8(invalid));
}
@Test
public void testToUtf8ByteBuffer()
{
Assert.assertNull(StringUtils.toUtf8ByteBuffer(null));
Assert.assertEquals(ByteBuffer.allocate(0), StringUtils.toUtf8ByteBuffer(""));
Assert.assertEquals(ByteBuffer.wrap(StringUtils.toUtf8("foo")), StringUtils.toUtf8ByteBuffer("foo"));
Assert.assertEquals(ByteBuffer.wrap(StringUtils.toUtf8("🙂")), StringUtils.toUtf8ByteBuffer("🙂"));
}
}

View File

@ -21,6 +21,8 @@ package org.apache.druid.java.util.common;
import com.google.common.io.Files;
import org.apache.druid.collections.ResourceHolder;
import org.hamcrest.MatcherAssert;
import org.hamcrest.Matchers;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
@ -33,6 +35,7 @@ import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.util.Arrays;
import java.util.Comparator;
public class ByteBufferUtilsTest
{
@ -76,4 +79,53 @@ public class ByteBufferUtilsTest
final ByteBuffer heapBuffer = ByteBuffer.allocate(4096);
ByteBufferUtils.free(heapBuffer);
}
@Test
@SuppressWarnings("EqualsWithItself")
public void testUnsignedComparator()
{
final Comparator<ByteBuffer> comparator = ByteBufferUtils.unsignedComparator();
// Tests involving null
MatcherAssert.assertThat(comparator.compare(null, null), Matchers.equalTo(0));
MatcherAssert.assertThat(comparator.compare(null, ByteBuffer.allocate(0)), Matchers.lessThan(0));
MatcherAssert.assertThat(comparator.compare(ByteBuffer.allocate(0), null), Matchers.greaterThan(0));
MatcherAssert.assertThat(comparator.compare(null, ByteBuffer.allocate(1)), Matchers.lessThan(0));
MatcherAssert.assertThat(comparator.compare(ByteBuffer.allocate(1), null), Matchers.greaterThan(0));
MatcherAssert.assertThat(comparator.compare(null, ByteBuffer.wrap(new byte[]{-1})), Matchers.lessThan(0));
MatcherAssert.assertThat(comparator.compare(ByteBuffer.wrap(new byte[]{-1}), null), Matchers.greaterThan(0));
// Tests involving buffers of different lengths
MatcherAssert.assertThat(
comparator.compare(
ByteBuffer.wrap(new byte[]{1, 2, 3}),
ByteBuffer.wrap(new byte[]{1, 2, 3, 4})
),
Matchers.lessThan(0)
);
MatcherAssert.assertThat(
comparator.compare(
ByteBuffer.wrap(new byte[]{1, 2, 3, 4}),
ByteBuffer.wrap(new byte[]{1, 2, 3})
),
Matchers.greaterThan(0)
);
// Tests involving the full range of bytes
for (byte i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) {
for (byte j = Byte.MIN_VALUE; j < Byte.MAX_VALUE; j++) {
final int cmp = Integer.compare(Byte.toUnsignedInt(i), Byte.toUnsignedInt(j));
MatcherAssert.assertThat(
StringUtils.format("comparison of %s to %s", Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)),
comparator.compare(
ByteBuffer.wrap(new byte[]{i}),
ByteBuffer.wrap(new byte[]{j})
),
cmp < 0 ? Matchers.lessThan(0) : cmp > 0 ? Matchers.greaterThan(0) : Matchers.equalTo(0)
);
}
}
}
}

View File

@ -210,7 +210,7 @@ public class Druids
{
final Set<String> filterValues = Sets.newHashSet(values);
filterValues.add(value);
dimFilter = new InDimFilter(dimensionName, filterValues, null, null);
dimFilter = new InDimFilter(dimensionName, filterValues);
return this;
}

View File

@ -29,11 +29,11 @@ import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.collect.ForwardingSortedSet;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeRangeSet;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;
@ -42,6 +42,7 @@ import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.ByteBufferUtils;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.guava.Comparators;
@ -59,25 +60,26 @@ import org.apache.druid.segment.DimensionHandlerUtils;
import org.apache.druid.segment.column.BitmapColumnIndex;
import org.apache.druid.segment.column.ColumnIndexSupplier;
import org.apache.druid.segment.column.StringValueSetIndex;
import org.apache.druid.segment.column.Utf8ValueSetIndex;
import org.apache.druid.segment.filter.Filters;
import org.apache.druid.segment.vector.VectorColumnSelectorFactory;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
{
// Values can contain `null` object
private final Set<String> values;
// Values can contain `null` object. Values are sorted (nulls-first).
private final ValuesSet values;
// Computed eagerly, not lazily, because lazy computations would block all processing threads for a given query.
private final SortedSet<ByteBuffer> valuesUtf8;
private final String dimension;
@Nullable
private final ExtractionFn extractionFn;
@ -103,7 +105,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
@JsonProperty("dimension") String dimension,
// This 'values' collection instance can be reused if possible to avoid copying a big collection.
// Callers should _not_ modify the collection after it is passed to this constructor.
@JsonProperty("values") Set<String> values,
@JsonProperty("values") ValuesSet values,
@JsonProperty("extractionFn") @Nullable ExtractionFn extractionFn,
@JsonProperty("filterTuning") @Nullable FilterTuning filterTuning
)
@ -121,32 +123,35 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
* Creates a new filter without an extraction function or any special filter tuning.
*
* @param dimension column to search
* @param values set of values to match. This collection may be reused to avoid copying a big collection.
* Therefore, callers should <b>not</b> modify the collection after it is passed to this
* constructor.
* @param values set of values to match. If this collection is a {@link SortedSet}, it may be reused to avoid
* copying a big collection. Therefore, callers should <b>not</b> modify the collection after it
* is passed to this constructor.
*/
public InDimFilter(
String dimension,
Set<String> values
)
public InDimFilter(String dimension, Set<String> values)
{
this(
dimension,
values,
values instanceof ValuesSet ? (ValuesSet) values : new ValuesSet(values),
null,
null,
null
);
}
/**
* This constructor should be called only in unit tests since accepting a Collection makes copying more likely.
* Creates a new filter without an extraction function or any special filter tuning.
*
* @param dimension column to search
* @param values set of values to match. If this collection is a {@link SortedSet}, it may be reused to avoid
* copying a big collection. Therefore, callers should <b>not</b> modify the collection after it
* is passed to this constructor.
* @param extractionFn extraction function to apply to the column before checking against "values"
*/
@VisibleForTesting
public InDimFilter(String dimension, Collection<String> values, @Nullable ExtractionFn extractionFn)
{
this(
dimension,
values instanceof Set ? (Set<String>) values : new HashSet<>(values),
values instanceof ValuesSet ? (ValuesSet) values : new ValuesSet(values),
extractionFn,
null,
null
@ -158,7 +163,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
*/
private InDimFilter(
final String dimension,
final Set<String> values,
final ValuesSet values,
@Nullable final ExtractionFn extractionFn,
@Nullable final FilterTuning filterTuning,
@Nullable final DruidPredicateFactory predicateFactory
@ -166,19 +171,15 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
{
Preconditions.checkNotNull(values, "values cannot be null");
// The values set can be huge. Try to avoid copying the set if possible.
// Note that we may still need to copy values to a list for caching. See getCacheKey().
this.values = values;
if (!NullHandling.sqlCompatible() && values.contains("")) {
// In Non sql compatible mode, empty strings should be converted to nulls for the filter.
// In sql compatible mode, empty strings and nulls should be treated differently
this.values = Sets.newHashSetWithExpectedSize(values.size());
for (String v : values) {
this.values.add(NullHandling.emptyToNullIfNeeded(v));
}
} else {
this.values = values;
// In non-SQL-compatible mode, empty strings must be converted to nulls for the filter.
this.values.remove("");
this.values.add(null);
}
this.valuesUtf8 = this.values.toUtf8();
this.dimension = Preconditions.checkNotNull(dimension, "dimension cannot be null");
this.extractionFn = extractionFn;
this.filterTuning = filterTuning;
@ -199,7 +200,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
}
@JsonProperty
public Set<String> getValues()
public SortedSet<String> getValues()
{
return values;
}
@ -296,9 +297,15 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
selector
);
}
final StringValueSetIndex valueSetIndex = indexSupplier.as(StringValueSetIndex.class);
if (valueSetIndex != null) {
return valueSetIndex.forValues(values);
final Utf8ValueSetIndex utf8ValueSetIndex = indexSupplier.as(Utf8ValueSetIndex.class);
if (utf8ValueSetIndex != null) {
return utf8ValueSetIndex.forSortedValuesUtf8(valuesUtf8);
}
final StringValueSetIndex stringValueSetIndex = indexSupplier.as(StringValueSetIndex.class);
if (stringValueSetIndex != null) {
return stringValueSetIndex.forSortedValues(values);
}
}
return Filters.makePredicateIndex(
@ -399,20 +406,9 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
private byte[] computeCacheKey()
{
final Collection<String> sortedValues;
if (values instanceof SortedSet && isNaturalOrder(((SortedSet<String>) values).comparator())) {
// Avoid copying "values" when it is already in the order we need for cache key computation.
sortedValues = values;
} else {
final List<String> sortedValuesList = new ArrayList<>(values);
sortedValuesList.sort(Comparators.naturalNullsFirst());
sortedValues = sortedValuesList;
}
// Hash all values, in sorted order, as their length followed by their content.
final Hasher hasher = Hashing.sha256().newHasher();
for (String v : sortedValues) {
for (String v : values) {
if (v == null) {
// Encode null as length -1, no content.
hasher.putInt(-1);
@ -438,7 +434,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
LookupExtractionFn exFn = (LookupExtractionFn) extractionFn;
LookupExtractor lookup = exFn.getLookup();
final Set<String> keys = new HashSet<>();
final ValuesSet keys = new ValuesSet();
for (String value : values) {
// We cannot do an unapply()-based optimization if the selector value
@ -468,40 +464,11 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
return this;
}
/**
* Returns true if the comparator is null or the singleton {@link Comparators#naturalNullsFirst()}. Useful for
* detecting if a sorted set is in natural order or not.
*
* May return false negatives (i.e. there are naturally-ordered comparators that will return false here).
*/
private static <T> boolean isNaturalOrder(@Nullable final Comparator<T> comparator)
{
return comparator == null || Comparators.naturalNullsFirst().equals(comparator);
}
@SuppressWarnings("ReturnValueIgnored")
private static Predicate<String> createStringPredicate(final Set<String> values)
{
Preconditions.checkNotNull(values, "values");
try {
// Check to see if values.contains(null) will throw a NullPointerException. Jackson JSON deserialization won't
// lead to this (it will create a HashSet, which can accept nulls). But when InDimFilters are created
// programmatically as a result of optimizations like rewriting inner joins as filters, the passed-in Set may
// not be able to accept nulls. We don't want to copy the Sets (since they may be large) so instead we'll wrap
// it in a null-checking lambda if needed.
values.contains(null);
// Safe to do values.contains(null).
return values::contains;
}
catch (NullPointerException ignored) {
// Fall through
}
// Not safe to do values.contains(null); must return a wrapper.
// Return false for null, since an exception means the set cannot accept null (and therefore does not include it).
return value -> value != null && values.contains(value);
return values::contains;
}
private static DruidLongPredicate createLongPredicate(final Set<String> values)
@ -558,7 +525,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
InFilterDruidPredicateFactory(
final ExtractionFn extractionFn,
final Set<String> values
final ValuesSet values
)
{
this.extractionFn = extractionFn;
@ -638,4 +605,50 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
return Objects.hash(extractionFn, values);
}
}
public static class ValuesSet extends ForwardingSortedSet<String>
{
private final SortedSet<String> values;
public ValuesSet()
{
this.values = new TreeSet<>(Comparators.naturalNullsFirst());
}
/**
* Create a ValuesSet from another Collection. The Collection will be reused if it is a {@link SortedSet} with
* an appropriate comparator.
*/
public ValuesSet(final Collection<String> values)
{
if (values instanceof SortedSet && Comparators.naturalNullsFirst()
.equals(((SortedSet<String>) values).comparator())) {
this.values = (SortedSet<String>) values;
} else {
this.values = new TreeSet<>(Comparators.naturalNullsFirst());
this.values.addAll(values);
}
}
public SortedSet<ByteBuffer> toUtf8()
{
final TreeSet<ByteBuffer> valuesUtf8 = new TreeSet<>(ByteBufferUtils.unsignedComparator());
for (final String value : values) {
if (value == null) {
valuesUtf8.add(null);
} else {
valuesUtf8.add(ByteBuffer.wrap(StringUtils.toUtf8(value)));
}
}
return valuesUtf8;
}
@Override
protected SortedSet<String> delegate()
{
return values;
}
}
}

View File

@ -294,9 +294,8 @@ public class LikeDimFilter extends AbstractOptimizableDimFilter implements DimFi
}
/**
* Checks if the suffix of strings.get(i) matches the suffix of this matcher. The first prefix.length characters
* of s are ignored. This method is useful if you've already independently verified the prefix. This method
* evalutes strings.get(i) lazily to save time when it isn't necessary to actually look at the string.
* Checks if the suffix of "value" matches the suffix of this matcher. The first prefix.length() characters
* of "value" are ignored. This method is useful if you've already independently verified the prefix.
*/
public boolean matchesSuffixOnly(@Nullable String value)
{

View File

@ -34,7 +34,6 @@ import org.apache.druid.segment.filter.DimensionPredicateFilter;
import org.apache.druid.segment.filter.SelectorFilter;
import javax.annotation.Nullable;
import java.util.Collections;
import java.util.Objects;
import java.util.Set;
@ -96,7 +95,9 @@ public class SelectorDimFilter extends AbstractOptimizableDimFilter implements D
@Override
public DimFilter optimize()
{
return new InDimFilter(dimension, Collections.singleton(value), extractionFn, filterTuning).optimize();
final InDimFilter.ValuesSet valuesSet = new InDimFilter.ValuesSet();
valuesSet.add(value);
return new InDimFilter(dimension, valuesSet, extractionFn, filterTuning).optimize();
}
@Override

View File

@ -20,9 +20,9 @@
package org.apache.druid.query.topn;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.Comparators;
import org.apache.druid.query.BaseQuery;
import org.apache.druid.query.DataSource;
import org.apache.druid.query.TableDataSource;
@ -45,6 +45,7 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;
/**
@ -234,7 +235,8 @@ public class TopNQueryBuilder
public TopNQueryBuilder filters(String dimensionName, String value, String... values)
{
final Set<String> filterValues = Sets.newHashSet(values);
final Set<String> filterValues = new TreeSet<>(Comparators.naturalNullsFirst());
filterValues.addAll(Arrays.asList(values));
filterValues.add(value);
dimFilter = new InDimFilter(dimensionName, filterValues);
return this;

View File

@ -460,6 +460,7 @@ public class IndexIO
new DictionaryEncodedStringIndexSupplier(
new ConciseBitmapFactory(),
index.getDimValueLookup(dimension),
index.getDimValueUtf8Lookup(dimension),
bitmaps,
spatialIndex
),

View File

@ -39,6 +39,8 @@ public interface ColumnIndexSupplier
* {@link org.apache.druid.segment.data.Offset} to form the basis of a {@link org.apache.druid.segment.Cursor}
* (or {@link org.apache.druid.segment.vector.VectorOffset} and {@link org.apache.druid.segment.vector.VectorCursor})
* which can greatly reduce the total number of rows which need to be scanned and processed.
*
* Objects returned by this method are not thread-safe.
*/
@Nullable
<T> T as(Class<T> clazz);

View File

@ -30,21 +30,21 @@ import javax.annotation.Nullable;
public interface LexicographicalRangeIndex
{
/**
* Get an {@link BitmapColumnIndex} corresponding to the values supplied in the specified range
* Get a {@link BitmapColumnIndex} corresponding to the values supplied in the specified range.
*/
default BitmapColumnIndex forRange(
BitmapColumnIndex forRange(
@Nullable String startValue,
boolean startStrict,
@Nullable String endValue,
boolean endStrict
)
{
return forRange(startValue, startStrict, endValue, endStrict, (index) -> true);
}
);
/**
* Get an {@link BitmapColumnIndex} corresponding to the values supplied in the specified range whose dictionary ids
* also match some predicate, such as to match a prefix
* Get a {@link BitmapColumnIndex} corresponding to the values supplied in the specified range whose dictionary ids
* also match some predicate, such as to match a prefix.
*
* If the provided {@code} matcher is always true, it's better to use the other
* {@link #forRange(String, boolean, String, boolean)} method.
*/
BitmapColumnIndex forRange(
@Nullable String startValue,

View File

@ -22,7 +22,7 @@ package org.apache.druid.segment.column;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import javax.annotation.Nullable;
import java.util.Set;
import java.util.SortedSet;
/**
* Index on individual values, and provides bitmaps for the rows which contain these values
@ -36,7 +36,8 @@ public interface StringValueSetIndex
/**
* Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the specified set of values (if they are
* contained in the underlying column)
* contained in the underlying column). The set must be sorted using
* {@link org.apache.druid.java.util.common.guava.Comparators#naturalNullsFirst()}.
*/
BitmapColumnIndex forValues(Set<String> values);
BitmapColumnIndex forSortedValues(SortedSet<String> values);
}

View File

@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.segment.column;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import java.nio.ByteBuffer;
import java.util.SortedSet;
public interface Utf8ValueSetIndex
{
/**
* Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the specified set of values (if they are
* contained in the underlying column). The set must be sorted using
* {@link org.apache.druid.java.util.common.ByteBufferUtils#unsignedComparator()}.
*/
BitmapColumnIndex forSortedValuesUtf8(SortedSet<ByteBuffer> valuesUtf8);
}

View File

@ -28,6 +28,7 @@ import org.apache.druid.extendedset.intset.ImmutableConciseSet;
import java.nio.ByteBuffer;
/**
*
*/
public class ConciseBitmapSerdeFactory implements BitmapSerdeFactory
{
@ -70,6 +71,12 @@ public class ConciseBitmapSerdeFactory implements BitmapSerdeFactory
return val.toBytes();
}
@Override
public boolean canCompare()
{
return false;
}
@Override
public int compare(ImmutableBitmap o1, ImmutableBitmap o2)
{

View File

@ -70,6 +70,12 @@ public class DecompressingByteBufferObjectStrategy implements ObjectStrategy<Res
};
}
@Override
public boolean canCompare()
{
return false;
}
@Override
public int compare(ResourceHolder<ByteBuffer> o1, ResourceHolder<ByteBuffer> o2)
{

View File

@ -24,6 +24,7 @@ import org.apache.druid.collections.ResourceHolder;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.common.utils.SerializerUtils;
import org.apache.druid.io.Channels;
import org.apache.druid.java.util.common.ByteBufferUtils;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.guava.Comparators;
@ -132,9 +133,9 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
}
@Override
public int compare(ByteBuffer o1, ByteBuffer o2)
public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
{
return o1.compareTo(o2);
return ByteBufferUtils.unsignedComparator().compare(o1, o2);
}
};
@ -217,7 +218,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
public static <T> GenericIndexed<T> fromIterable(Iterable<T> objectsIterable, ObjectStrategy<T> strategy)
{
return fromIterableVersionOne(objectsIterable, strategy, true, strategy);
return fromIterableVersionOne(objectsIterable, strategy, strategy.canCompare(), strategy);
}
static int getNumberOfFilesRequired(int bagSize, long numWritten)
@ -350,17 +351,13 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
if (!allowReverseLookup) {
throw new UnsupportedOperationException("Reverse lookup not allowed.");
}
return indexOf(this, value);
}
private int indexOf(Indexed<T> indexed, @Nullable T value)
{
int minIndex = 0;
int maxIndex = size - 1;
while (minIndex <= maxIndex) {
int currIndex = (minIndex + maxIndex) >>> 1;
T currValue = indexed.get(currIndex);
T currValue = get(currIndex);
int comparison = strategy.compare(currValue, value);
if (comparison == 0) {
return currIndex;
@ -458,6 +455,9 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
return sb.toString();
}
/**
* Single-threaded view.
*/
abstract class BufferIndexed implements Indexed<T>
{
int lastReadSize;
@ -468,8 +468,23 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
return size;
}
@Override
public T get(final int index)
{
final ByteBuffer buf = getByteBuffer(index);
if (buf == null) {
return null;
}
// Traditionally, ObjectStrategy.fromByteBuffer() is given a buffer with limit set to capacity, and the
// actual limit is passed along as an extra parameter.
final int len = buf.remaining();
buf.limit(buf.capacity());
return strategy.fromByteBuffer(buf, len);
}
@Nullable
T bufferedIndexedGet(ByteBuffer copyValueBuffer, int startOffset, int endOffset)
ByteBuffer bufferedIndexedGetByteBuffer(ByteBuffer copyValueBuffer, int startOffset, int endOffset)
{
int size = endOffset - startOffset;
// When size is 0 and SQL compatibility is enabled also check for null marker before returning null.
@ -481,14 +496,22 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
lastReadSize = size;
// ObjectStrategy.fromByteBuffer() is allowed to reset the limit of the buffer. So if the limit is changed,
// position() call in the next line could throw an exception, if the position is set beyond the new limit. clear()
// sets the limit to the maximum possible, the capacity. It is safe to reset the limit to capacity, because the
// value buffer(s) initial limit equals to capacity.
copyValueBuffer.clear();
// position() call could throw an exception, if the position is set beyond the new limit. Calling limit()
// followed by position() is safe, because limit() resets position if needed.
copyValueBuffer.limit(endOffset);
copyValueBuffer.position(startOffset);
return strategy.fromByteBuffer(copyValueBuffer, size);
return copyValueBuffer;
}
/**
* Like {@link #get(int)}, but returns a {@link ByteBuffer} instead of using the {@link ObjectStrategy}.
*
* The returned ByteBuffer is reused by future calls. Callers must discard it before calling another method
* on this BufferedIndexed object that may want to reuse the buffer.
*/
@Nullable
protected abstract ByteBuffer getByteBuffer(int index);
/**
* This method makes no guarantees with respect to thread safety
*
@ -502,7 +525,41 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
@Override
public int indexOf(@Nullable T value)
{
return GenericIndexed.this.indexOf(this, value);
if (!allowReverseLookup) {
throw new UnsupportedOperationException("Reverse lookup not allowed.");
}
//noinspection ObjectEquality
final boolean isByteBufferStrategy = strategy == BYTE_BUFFER_STRATEGY;
int minIndex = 0;
int maxIndex = size - 1;
while (minIndex <= maxIndex) {
int currIndex = (minIndex + maxIndex) >>> 1;
int comparison;
if (isByteBufferStrategy) {
// Specialization avoids ByteBuffer allocation in strategy.fromByteBuffer.
ByteBuffer currValue = getByteBuffer(currIndex);
comparison = ByteBufferUtils.compareByteBuffers(currValue, (ByteBuffer) value);
} else {
T currValue = get(currIndex);
comparison = strategy.compare(currValue, value);
}
if (comparison == 0) {
return currIndex;
}
if (comparison < 0) {
minIndex = currIndex + 1;
} else {
maxIndex = currIndex - 1;
}
}
return -(minIndex + 1);
}
@Override
@ -625,8 +682,9 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
final ByteBuffer copyBuffer = firstValueBuffer.asReadOnlyBuffer();
return new BufferIndexed()
{
@Nullable
@Override
public T get(final int index)
protected ByteBuffer getByteBuffer(final int index)
{
checkIndex(index);
@ -641,7 +699,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
startOffset = headerBuffer.getInt(headerPosition) + Integer.BYTES;
endOffset = headerBuffer.getInt(headerPosition + Integer.BYTES);
}
return bufferedIndexedGet(copyBuffer, startOffset, endOffset);
return bufferedIndexedGetByteBuffer(copyBuffer, startOffset, endOffset);
}
@Override
@ -734,8 +792,9 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
return new BufferIndexed()
{
@Nullable
@Override
public T get(final int index)
protected ByteBuffer getByteBuffer(int index)
{
checkIndex(index);
@ -753,7 +812,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
endOffset = headerBuffer.getInt(headerPosition + Integer.BYTES);
}
int fileNum = index >> logBaseTwoOfElementsPerValueFile;
return bufferedIndexedGet(copyValueBuffers[fileNum], startOffset, endOffset);
return bufferedIndexedGetByteBuffer(copyValueBuffers[fileNum], startOffset, endOffset);
}
@Override

View File

@ -124,6 +124,12 @@ public class GenericIndexedWriter<T> implements Serializer
val.position(valPos);
}
@Override
public boolean canCompare()
{
return false;
}
@Override
public int compare(ByteBuffer o1, ByteBuffer o2)
{

View File

@ -51,6 +51,14 @@ public interface ObjectStrategy<T> extends Comparator<T>
@Nullable
byte[] toBytes(@Nullable T val);
/**
* Whether {@link #compare} is valid or not.
*/
default boolean canCompare()
{
return true;
}
/**
* Reads 4-bytes numBytes from the given buffer, and then delegates to {@link #fromByteBuffer(ByteBuffer, int)}.
*/

View File

@ -95,6 +95,12 @@ public class RoaringBitmapSerdeFactory implements BitmapSerdeFactory
return val.toBytes();
}
@Override
public boolean canCompare()
{
return false;
}
@Override
public int compare(ImmutableBitmap o1, ImmutableBitmap o2)
{

View File

@ -94,7 +94,12 @@ public class LikeFilter implements Filter
if (rangeIndex != null) {
final String lower = NullHandling.nullToEmptyIfNeeded(likeMatcher.getPrefix());
final String upper = NullHandling.nullToEmptyIfNeeded(likeMatcher.getPrefix()) + Character.MAX_VALUE;
return rangeIndex.forRange(lower, false, upper, false, likeMatcher::matchesSuffixOnly);
if (likeMatcher.getSuffixMatch() == LikeDimFilter.LikeMatcher.SuffixMatch.MATCH_ANY) {
return rangeIndex.forRange(lower, false, upper, false);
} else {
return rangeIndex.forRange(lower, false, upper, false, likeMatcher::matchesSuffixOnly);
}
}
}

View File

@ -71,10 +71,10 @@ public interface Joinable extends ReferenceCountedObject
* @param condition join condition for the matcher
* @param remainderNeeded whether or not {@link JoinMatcher#matchRemainder()} will ever be called on the
* matcher. If we know it will not, additional optimizations are often possible.
*
* @param descending true if join cursor is iterated in descending order
* @param closer closer that will run after join cursor has completed to clean up any per query
* resources the joinable uses
*
* @return the matcher
*/
JoinMatcher makeJoinMatcher(
@ -89,6 +89,10 @@ public interface Joinable extends ReferenceCountedObject
* Returns all nonnull values from a particular column if they are all unique, if there are "maxNumValues" or fewer,
* and if the column exists and supports this operation. Otherwise, returns an empty Optional.
*
* The returned set may be passed to {@link org.apache.druid.query.filter.InDimFilter}. For efficiency,
* implementations should prefer creating the returned set with
* {@code new TreeSet<String>(Comparators.naturalNullsFirst()}}. This avoids a copy in the filter's constructor.
*
* @param columnName name of the column
* @param maxNumValues maximum number of values to return
*/
@ -98,13 +102,18 @@ public interface Joinable extends ReferenceCountedObject
* Searches a column from this Joinable for a particular value, finds rows that match,
* and returns values of a second column for those rows.
*
* @param searchColumnName Name of the search column. This is the column that is being used in the filter
* @param searchColumnValue Target value of the search column. This is the value that is being filtered on.
* @param retrievalColumnName The column to retrieve values from. This is the column that is being joined against.
* @param maxCorrelationSetSize Maximum number of values to retrieve. If we detect that more values would be
* returned than this limit, return absent.
* The returned set may be passed to {@link org.apache.druid.query.filter.InDimFilter}. For efficiency,
* implementations should prefer creating the returned set with
* {@code new TreeSet<String>(Comparators.naturalNullsFirst()}}. This avoids a copy in the filter's constructor.
*
* @param searchColumnName Name of the search column. This is the column that is being used in the filter
* @param searchColumnValue Target value of the search column. This is the value that is being filtered on.
* @param retrievalColumnName The column to retrieve values from. This is the column that is being joined against.
* @param maxCorrelationSetSize Maximum number of values to retrieve. If we detect that more values would be
* returned than this limit, return absent.
* @param allowNonKeyColumnSearch If true, allow searchs on non-key columns. If this is false,
* a search on a non-key column returns absent.
*
* @return The set of correlated column values. If we cannot determine correlated values, return absent.
*
* In case either the search or retrieval column names are not found, this will return absent.

View File

@ -33,7 +33,6 @@ import org.apache.druid.segment.join.Joinable;
import javax.annotation.Nullable;
import java.io.Closeable;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@ -104,10 +103,7 @@ public class IndexedTableJoinable implements Joinable
try (final IndexedTable.Reader reader = table.columnReader(columnPosition)) {
// Sorted set to encourage "in" filters that result from this method to do dictionary lookups in order.
// The hopes are that this will improve locality and therefore improve performance.
//
// Note: we are using Comparators.naturalNullsFirst() because it prevents the need for lambda-wrapping in
// InDimFilter's "createStringPredicate" method.
final Set<String> allValues = new TreeSet<>(Comparators.naturalNullsFirst());
final Set<String> allValues = createValuesSet();
for (int i = 0; i < table.numRows(); i++) {
final String s = DimensionHandlerUtils.convertObjectToString(reader.read(i));
@ -147,7 +143,7 @@ public class IndexedTableJoinable implements Joinable
return Optional.empty();
}
try (final Closer closer = Closer.create()) {
Set<String> correlatedValues = new HashSet<>();
Set<String> correlatedValues = createValuesSet();
if (table.keyColumns().contains(searchColumnName)) {
IndexedTable.Index index = table.columnIndex(filterColumnPosition);
IndexedTable.Reader reader = table.columnReader(correlatedColumnPosition);
@ -196,4 +192,12 @@ public class IndexedTableJoinable implements Joinable
{
return table.acquireReferences();
}
/**
* Create a Set that InDimFilter will accept without incurring a copy.
*/
private static Set<String> createValuesSet()
{
return new TreeSet<>(Comparators.naturalNullsFirst());
}
}

View File

@ -365,6 +365,7 @@ public class DictionaryEncodedColumnPartSerde implements ColumnPartSerde
new DictionaryEncodedStringIndexSupplier(
bitmapSerdeFactory.getBitmapFactory(),
rDictionary,
rDictionaryUtf8,
rBitmaps,
rSpatialIndex
),

View File

@ -20,6 +20,7 @@
package org.apache.druid.segment.serde;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import it.unimi.dsi.fastutil.ints.IntIntImmutablePair;
import it.unimi.dsi.fastutil.ints.IntIntPair;
import it.unimi.dsi.fastutil.ints.IntIterator;
@ -27,6 +28,7 @@ import org.apache.druid.collections.bitmap.BitmapFactory;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.collections.spatial.ImmutableRTree;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.BitmapResultFactory;
import org.apache.druid.query.filter.DruidPredicateFactory;
import org.apache.druid.segment.IntListUtils;
@ -39,13 +41,16 @@ import org.apache.druid.segment.column.LexicographicalRangeIndex;
import org.apache.druid.segment.column.SimpleColumnIndexCapabilities;
import org.apache.druid.segment.column.SpatialIndex;
import org.apache.druid.segment.column.StringValueSetIndex;
import org.apache.druid.segment.column.Utf8ValueSetIndex;
import org.apache.druid.segment.data.GenericIndexed;
import org.apache.druid.segment.data.Indexed;
import org.apache.druid.segment.filter.Filters;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.SortedSet;
public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
{
@ -53,6 +58,7 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
private final BitmapFactory bitmapFactory;
private final GenericIndexed<String> dictionary;
private final GenericIndexed<ByteBuffer> dictionaryUtf8;
@Nullable
private final GenericIndexed<ImmutableBitmap> bitmaps;
@Nullable
@ -61,29 +67,38 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
public DictionaryEncodedStringIndexSupplier(
BitmapFactory bitmapFactory,
GenericIndexed<String> dictionary,
GenericIndexed<ImmutableBitmap> bitmaps,
ImmutableRTree indexedTree
GenericIndexed<ByteBuffer> dictionaryUtf8,
@Nullable GenericIndexed<ImmutableBitmap> bitmaps,
@Nullable ImmutableRTree indexedTree
)
{
this.bitmapFactory = bitmapFactory;
this.bitmaps = bitmaps;
this.dictionary = dictionary;
this.dictionaryUtf8 = dictionaryUtf8;
this.bitmaps = bitmaps;
this.indexedTree = indexedTree;
}
@Nullable
@Override
@SuppressWarnings("unchecked")
public <T> T as(Class<T> clazz)
{
if (clazz.equals(StringValueSetIndex.class)) {
return (T) new GenericIndexedDictionaryEncodedStringValueSetIndex(bitmapFactory, dictionary, bitmaps);
} else if (clazz.equals(DruidPredicateIndex.class)) {
if (bitmaps != null && clazz.equals(StringValueSetIndex.class)) {
return (T) new GenericIndexedDictionaryEncodedStringValueSetIndex(bitmapFactory, dictionaryUtf8, bitmaps);
} else if (bitmaps != null && clazz.equals(Utf8ValueSetIndex.class)) {
return (T) new GenericIndexedDictionaryEncodedStringValueSetIndex(bitmapFactory, dictionaryUtf8, bitmaps);
} else if (bitmaps != null && clazz.equals(DruidPredicateIndex.class)) {
return (T) new GenericIndexedDictionaryEncodedStringDruidPredicateIndex(bitmapFactory, dictionary, bitmaps);
} else if (clazz.equals(LexicographicalRangeIndex.class)) {
return (T) new GenericIndexedDictionaryEncodedColumnLexicographicalRangeIndex(bitmapFactory, dictionary, bitmaps);
} else if (clazz.equals(DictionaryEncodedStringValueIndex.class)) {
} else if (bitmaps != null && clazz.equals(LexicographicalRangeIndex.class)) {
return (T) new GenericIndexedDictionaryEncodedColumnLexicographicalRangeIndex(
bitmapFactory,
dictionaryUtf8,
bitmaps
);
} else if (bitmaps != null && clazz.equals(DictionaryEncodedStringValueIndex.class)) {
return (T) new GenericIndexedDictionaryEncodedStringValueIndex(bitmapFactory, dictionary, bitmaps);
} else if (clazz.equals(SpatialIndex.class)) {
} else if (indexedTree != null && clazz.equals(SpatialIndex.class)) {
return (T) (SpatialIndex) () -> indexedTree;
}
return null;
@ -98,21 +113,21 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
}
}
private abstract static class BaseGenericIndexedDictionaryEncodedStringIndex
private abstract static class BaseGenericIndexedDictionaryEncodedIndex<T>
{
protected final BitmapFactory bitmapFactory;
protected final GenericIndexed<String> dictionary;
protected final GenericIndexed<ImmutableBitmap> bitmaps;
protected final Indexed<T> dictionary;
protected final Indexed<ImmutableBitmap> bitmaps;
protected BaseGenericIndexedDictionaryEncodedStringIndex(
protected BaseGenericIndexedDictionaryEncodedIndex(
BitmapFactory bitmapFactory,
GenericIndexed<String> dictionary,
GenericIndexed<T> dictionary,
GenericIndexed<ImmutableBitmap> bitmaps
)
{
this.bitmapFactory = bitmapFactory;
this.dictionary = dictionary;
this.bitmaps = bitmaps;
this.dictionary = dictionary.singleThreaded();
this.bitmaps = bitmaps.singleThreaded();
}
public ImmutableBitmap getBitmap(int idx)
@ -127,7 +142,7 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
}
public static final class GenericIndexedDictionaryEncodedStringValueIndex
extends BaseGenericIndexedDictionaryEncodedStringIndex implements DictionaryEncodedStringValueIndex
extends BaseGenericIndexedDictionaryEncodedIndex<String> implements DictionaryEncodedStringValueIndex
{
public GenericIndexedDictionaryEncodedStringValueIndex(
BitmapFactory bitmapFactory,
@ -165,12 +180,13 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
}
public static final class GenericIndexedDictionaryEncodedStringValueSetIndex
extends BaseGenericIndexedDictionaryEncodedStringIndex implements StringValueSetIndex
extends BaseGenericIndexedDictionaryEncodedIndex<ByteBuffer> implements StringValueSetIndex, Utf8ValueSetIndex
{
private static final int SIZE_WORTH_CHECKING_MIN = 8;
public GenericIndexedDictionaryEncodedStringValueSetIndex(
BitmapFactory bitmapFactory,
GenericIndexed<String> dictionary,
GenericIndexed<ByteBuffer> dictionary,
GenericIndexed<ImmutableBitmap> bitmaps
)
{
@ -197,14 +213,43 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
private ImmutableBitmap getBitmapForValue()
{
final int idx = dictionary.indexOf(value);
final ByteBuffer valueUtf8 = value == null ? null : ByteBuffer.wrap(StringUtils.toUtf8(value));
final int idx = dictionary.indexOf(valueUtf8);
return getBitmap(idx);
}
};
}
@Override
public BitmapColumnIndex forValues(Set<String> values)
public BitmapColumnIndex forSortedValues(SortedSet<String> values)
{
return getBitmapColumnIndexForSortedIterableUtf8(
Iterables.transform(
values,
input -> ByteBuffer.wrap(StringUtils.toUtf8(input))
)
);
}
@Override
public BitmapColumnIndex forSortedValuesUtf8(SortedSet<ByteBuffer> valuesUtf8)
{
final SortedSet<ByteBuffer> tailSet;
if (valuesUtf8.size() >= SIZE_WORTH_CHECKING_MIN) {
final ByteBuffer minValueInColumn = dictionary.get(0);
tailSet = valuesUtf8.tailSet(minValueInColumn);
} else {
tailSet = valuesUtf8;
}
return getBitmapColumnIndexForSortedIterableUtf8(tailSet);
}
/**
* Helper used by {@link #forSortedValues} and {@link #forSortedValuesUtf8}.
*/
private BitmapColumnIndex getBitmapColumnIndexForSortedIterableUtf8(Iterable<ByteBuffer> valuesUtf8)
{
return new DictionaryEncodedStringBitmapColumnIndex()
{
@ -222,9 +267,11 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
private Iterable<ImmutableBitmap> getBitmapsIterable()
{
final int dictionarySize = dictionary.size();
return () -> new Iterator<ImmutableBitmap>()
{
final Iterator<String> iterator = values.iterator();
final Iterator<ByteBuffer> iterator = valuesUtf8.iterator();
int next = -1;
@Override
@ -253,8 +300,15 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
private void findNext()
{
while (next < 0 && iterator.hasNext()) {
String nextValue = iterator.next();
ByteBuffer nextValue = iterator.next();
next = dictionary.indexOf(nextValue);
if (next == -dictionarySize - 1) {
// nextValue is past the end of the dictionary.
// Note: we can rely on indexOf returning (-(insertion point) - 1), even though Indexed doesn't
// guarantee it, because "dictionary" comes from GenericIndexed singleThreaded().
break;
}
}
}
};
@ -264,7 +318,7 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
}
public static final class GenericIndexedDictionaryEncodedStringDruidPredicateIndex
extends BaseGenericIndexedDictionaryEncodedStringIndex implements DruidPredicateIndex
extends BaseGenericIndexedDictionaryEncodedIndex<String> implements DruidPredicateIndex
{
public GenericIndexedDictionaryEncodedStringDruidPredicateIndex(
BitmapFactory bitmapFactory,
@ -350,12 +404,12 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
}
public static final class GenericIndexedDictionaryEncodedColumnLexicographicalRangeIndex
extends BaseGenericIndexedDictionaryEncodedStringIndex implements LexicographicalRangeIndex
extends BaseGenericIndexedDictionaryEncodedIndex<ByteBuffer> implements LexicographicalRangeIndex
{
public GenericIndexedDictionaryEncodedColumnLexicographicalRangeIndex(
BitmapFactory bitmapFactory,
GenericIndexed<String> dictionary,
GenericIndexed<ByteBuffer> dictionary,
GenericIndexed<ImmutableBitmap> bitmaps
)
{
@ -447,7 +501,7 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
private int findNext()
{
while (currIndex < end && !matcher.apply(dictionary.get(currIndex))) {
while (currIndex < end && !applyMatcher(dictionary.get(currIndex))) {
currIndex++;
}
@ -478,16 +532,32 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
}
};
}
private boolean applyMatcher(@Nullable final ByteBuffer valueUtf8)
{
if (valueUtf8 == null) {
return matcher.apply(null);
} else {
// Duplicate buffer, because StringUtils.fromUtf8 advances the position, and we do not want to do that.
return matcher.apply(StringUtils.fromUtf8(valueUtf8.duplicate()));
}
}
};
}
private IntIntPair getRange(@Nullable String startValue, boolean startStrict, @Nullable String endValue, boolean endStrict)
private IntIntPair getRange(
@Nullable String startValue,
boolean startStrict,
@Nullable String endValue,
boolean endStrict
)
{
int startIndex, endIndex;
if (startValue == null) {
startIndex = 0;
} else {
final int found = dictionary.indexOf(NullHandling.emptyToNullIfNeeded(startValue));
final String startValueToUse = NullHandling.emptyToNullIfNeeded(startValue);
final int found = dictionary.indexOf(StringUtils.toUtf8ByteBuffer(startValueToUse));
if (found >= 0) {
startIndex = startStrict ? found + 1 : found;
} else {
@ -498,7 +568,8 @@ public class DictionaryEncodedStringIndexSupplier implements ColumnIndexSupplier
if (endValue == null) {
endIndex = dictionary.size();
} else {
final int found = dictionary.indexOf(NullHandling.emptyToNullIfNeeded(endValue));
final String endValueToUse = NullHandling.emptyToNullIfNeeded(endValue);
final int found = dictionary.indexOf(StringUtils.toUtf8ByteBuffer(endValueToUse));
if (found >= 0) {
endIndex = endStrict ? found : found + 1;
} else {

View File

@ -23,6 +23,7 @@ import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.query.BitmapResultFactory;
@ -60,6 +61,7 @@ import java.util.List;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
/**
* {@link VirtualColumn} form of {@link ListFilteredDimensionSpec}, powered by
@ -397,7 +399,7 @@ public class ListFilteredVirtualColumn implements VirtualColumn
}
@Override
public BitmapColumnIndex forValues(Set<String> values)
public BitmapColumnIndex forSortedValues(SortedSet<String> values)
{
return new BaseVirtualIndex()
{
@ -505,6 +507,17 @@ public class ListFilteredVirtualColumn implements VirtualColumn
super(delegate, idMapping);
}
@Override
public BitmapColumnIndex forRange(
@Nullable String startValue,
boolean startStrict,
@Nullable String endValue,
boolean endStrict
)
{
return forRange(startValue, startStrict, endValue, endStrict, Predicates.alwaysTrue());
}
@Override
public BitmapColumnIndex forRange(
@Nullable String startValue,

View File

@ -31,11 +31,20 @@ import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.query.extraction.RegexDimExtractionFn;
import org.apache.druid.segment.RowAdapters;
import org.apache.druid.segment.RowBasedColumnSelectorFactory;
import org.apache.druid.segment.column.BitmapColumnIndex;
import org.apache.druid.segment.column.ColumnIndexSupplier;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.RowSignature;
import org.apache.druid.segment.column.StringValueSetIndex;
import org.apache.druid.segment.column.Utf8ValueSetIndex;
import org.apache.druid.testing.InitializedNullHandlingTest;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.junit.MockitoJUnit;
import org.mockito.junit.MockitoRule;
import org.mockito.quality.Strictness;
import java.io.IOException;
import java.util.Arrays;
@ -52,6 +61,9 @@ public class InDimFilterTest extends InitializedNullHandlingTest
private final String serializedFilter =
"{\"type\":\"in\",\"dimension\":\"dimTest\",\"values\":[\"bad\",\"good\"]}";
@Rule
public MockitoRule mockitoRule = MockitoJUnit.rule().strictness(Strictness.STRICT_STUBS);
@Test
public void testDeserialization() throws IOException
{
@ -71,21 +83,23 @@ public class InDimFilterTest extends InitializedNullHandlingTest
@Test
public void testGetValuesWithValuesSetOfNonEmptyStringsUseTheGivenSet()
{
final Set<String> values = ImmutableSet.of("v1", "v2", "v3");
final InDimFilter filter = new InDimFilter("dim", values, null, null);
final Set<String> values = new InDimFilter.ValuesSet();
values.addAll(Arrays.asList("v1", "v2", "v3"));
final InDimFilter filter = new InDimFilter("dim", values);
Assert.assertSame(values, filter.getValues());
}
@Test
public void testGetValuesWithValuesSetIncludingEmptyString()
{
final Set<String> values = Sets.newHashSet("v1", "", "v3");
final InDimFilter filter = new InDimFilter("dim", values, null, null);
final InDimFilter.ValuesSet values = new InDimFilter.ValuesSet(ImmutableSet.of("v1", "", "v3"));
final InDimFilter filter = new InDimFilter("dim", values);
if (NullHandling.replaceWithDefault()) {
Assert.assertNotSame(values, filter.getValues());
Assert.assertSame(values, filter.getValues());
Assert.assertEquals(Sets.newHashSet("v1", null, "v3"), filter.getValues());
} else {
Assert.assertSame(values, filter.getValues());
Assert.assertEquals(Sets.newHashSet("v1", "", "v3"), filter.getValues());
}
}
@ -234,4 +248,53 @@ public class InDimFilterTest extends InitializedNullHandlingTest
// Now it *shouldn't* match.
Assert.assertFalse(matcher.matches());
}
@Test
public void testUsesUtf8SetIndex()
{
// An implementation test.
// This test confirms that "in" filters use utf8 index lookups when available.
final Filter inFilter = new InDimFilter("dim0", ImmutableSet.of("v1", "v2")).toFilter();
final ColumnIndexSelector indexSelector = Mockito.mock(ColumnIndexSelector.class);
final ColumnIndexSupplier indexSupplier = Mockito.mock(ColumnIndexSupplier.class);
final Utf8ValueSetIndex valueIndex = Mockito.mock(Utf8ValueSetIndex.class);
final BitmapColumnIndex bitmapColumnIndex = Mockito.mock(BitmapColumnIndex.class);
final InDimFilter.ValuesSet expectedValuesSet = new InDimFilter.ValuesSet();
expectedValuesSet.addAll(Arrays.asList("v1", "v2"));
Mockito.when(indexSelector.getIndexSupplier("dim0")).thenReturn(indexSupplier);
Mockito.when(indexSupplier.as(Utf8ValueSetIndex.class)).thenReturn(valueIndex);
Mockito.when(valueIndex.forSortedValuesUtf8(expectedValuesSet.toUtf8())).thenReturn(bitmapColumnIndex);
final BitmapColumnIndex retVal = inFilter.getBitmapColumnIndex(indexSelector);
Assert.assertSame("inFilter returns the intended bitmapColumnIndex", bitmapColumnIndex, retVal);
}
@Test
public void testUsesStringSetIndex()
{
// An implementation test.
// This test confirms that "in" filters use non-utf8 string index lookups when utf8 indexes are not available.
final Filter inFilter = new InDimFilter("dim0", ImmutableSet.of("v1", "v2")).toFilter();
final ColumnIndexSelector indexSelector = Mockito.mock(ColumnIndexSelector.class);
final ColumnIndexSupplier indexSupplier = Mockito.mock(ColumnIndexSupplier.class);
final StringValueSetIndex valueIndex = Mockito.mock(StringValueSetIndex.class);
final BitmapColumnIndex bitmapColumnIndex = Mockito.mock(BitmapColumnIndex.class);
final InDimFilter.ValuesSet expectedValuesSet = new InDimFilter.ValuesSet();
expectedValuesSet.addAll(Arrays.asList("v1", "v2"));
Mockito.when(indexSelector.getIndexSupplier("dim0")).thenReturn(indexSupplier);
Mockito.when(indexSupplier.as(Utf8ValueSetIndex.class)).thenReturn(null); // Will check for UTF-8 first.
Mockito.when(indexSupplier.as(StringValueSetIndex.class)).thenReturn(valueIndex);
Mockito.when(valueIndex.forSortedValues(expectedValuesSet)).thenReturn(bitmapColumnIndex);
final BitmapColumnIndex retVal = inFilter.getBitmapColumnIndex(indexSelector);
Assert.assertSame("inFilter returns the intended bitmapColumnIndex", bitmapColumnIndex, retVal);
}
}

View File

@ -24,15 +24,27 @@ import com.google.common.collect.Sets;
import nl.jqno.equalsverifier.EqualsVerifier;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.query.extraction.SubstringDimExtractionFn;
import org.apache.druid.segment.column.BitmapColumnIndex;
import org.apache.druid.segment.column.ColumnIndexSupplier;
import org.apache.druid.segment.column.LexicographicalRangeIndex;
import org.apache.druid.segment.column.StringValueSetIndex;
import org.apache.druid.testing.InitializedNullHandlingTest;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.junit.MockitoJUnit;
import org.mockito.junit.MockitoRule;
import org.mockito.quality.Strictness;
import java.io.IOException;
import java.util.Arrays;
public class LikeDimFilterTest extends InitializedNullHandlingTest
{
@Rule
public MockitoRule mockitoRule = MockitoJUnit.rule().strictness(Strictness.STRICT_STUBS);
@Test
public void testSerde() throws IOException
{
@ -88,4 +100,49 @@ public class LikeDimFilterTest extends InitializedNullHandlingTest
.withNonnullFields("suffixMatch", "prefix", "pattern")
.verify();
}
@Test
public void testPrefixMatchUsesRangeIndex()
{
// An implementation test.
// This test confirms that "like" filters with prefix matchers use index-range lookups without matcher predicates.
final Filter likeFilter = new LikeDimFilter("dim0", "f%", null, null, null).toFilter();
final ColumnIndexSelector indexSelector = Mockito.mock(ColumnIndexSelector.class);
final ColumnIndexSupplier indexSupplier = Mockito.mock(ColumnIndexSupplier.class);
final LexicographicalRangeIndex rangeIndex = Mockito.mock(LexicographicalRangeIndex.class);
final BitmapColumnIndex bitmapColumnIndex = Mockito.mock(BitmapColumnIndex.class);
Mockito.when(indexSelector.getIndexSupplier("dim0")).thenReturn(indexSupplier);
Mockito.when(indexSupplier.as(LexicographicalRangeIndex.class)).thenReturn(rangeIndex);
Mockito.when(
// Verify that likeFilter uses forRange without a matcher predicate; it's unnecessary and slows things down
rangeIndex.forRange("f", false, "f" + Character.MAX_VALUE, false)
).thenReturn(bitmapColumnIndex);
final BitmapColumnIndex retVal = likeFilter.getBitmapColumnIndex(indexSelector);
Assert.assertSame("likeFilter returns the intended bitmapColumnIndex", bitmapColumnIndex, retVal);
}
@Test
public void testExactMatchUsesValueIndex()
{
// An implementation test.
// This test confirms that "like" filters with exact matchers use index lookups.
final Filter likeFilter = new LikeDimFilter("dim0", "f", null, null, null).toFilter();
final ColumnIndexSelector indexSelector = Mockito.mock(ColumnIndexSelector.class);
final ColumnIndexSupplier indexSupplier = Mockito.mock(ColumnIndexSupplier.class);
final StringValueSetIndex valueIndex = Mockito.mock(StringValueSetIndex.class);
final BitmapColumnIndex bitmapColumnIndex = Mockito.mock(BitmapColumnIndex.class);
Mockito.when(indexSelector.getIndexSupplier("dim0")).thenReturn(indexSupplier);
Mockito.when(indexSupplier.as(StringValueSetIndex.class)).thenReturn(valueIndex);
Mockito.when(valueIndex.forValue("f")).thenReturn(bitmapColumnIndex);
final BitmapColumnIndex retVal = likeFilter.getBitmapColumnIndex(indexSelector);
Assert.assertSame("likeFilter returns the intended bitmapColumnIndex", bitmapColumnIndex, retVal);
}
}

View File

@ -26,6 +26,7 @@ import org.apache.druid.collections.bitmap.ConciseBitmapFactory;
import org.apache.druid.collections.bitmap.ImmutableBitmap;
import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.collections.bitmap.RoaringBitmapFactory;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.extraction.DimExtractionFn;
import org.apache.druid.query.extraction.ExtractionFn;
import org.apache.druid.query.filter.ColumnIndexSelector;
@ -49,6 +50,7 @@ import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.Map;
@ -117,6 +119,10 @@ public class ExtractionDimFilterTest extends InitializedNullHandlingTest
return new DictionaryEncodedStringIndexSupplier(
factory,
GenericIndexed.fromIterable(Collections.singletonList("foo1"), GenericIndexed.STRING_STRATEGY),
GenericIndexed.fromIterable(
Collections.singletonList(ByteBuffer.wrap(StringUtils.toUtf8("foo1"))),
GenericIndexed.BYTE_BUFFER_STRATEGY
),
GenericIndexed.fromIterable(Collections.singletonList(foo1BitMap), serdeFactory.getObjectStrategy()),
null
);

View File

@ -386,7 +386,7 @@ public class InFilterTest extends BaseFilterTest
EqualsVerifier.forClass(InDimFilter.class)
.usingGetClass()
.withNonnullFields("dimension", "values")
.withIgnoredFields("cacheKeySupplier", "predicateFactory", "cachedOptimizedFilter")
.withIgnoredFields("cacheKeySupplier", "predicateFactory", "cachedOptimizedFilter", "valuesUtf8")
.verify();
}

View File

@ -242,7 +242,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#en.wikipedia"),
new InDimFilter("countryIsoCode", ImmutableSet.of("US"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("US")).toFilter()
)
),
new SelectorFilter("rtc.countryName", "United States"),
@ -649,7 +649,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
))
)
),
new InDimFilter("countryIsoCode", ImmutableSet.of("CA"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("CA")).toFilter()
)
),
new AndFilter(
@ -751,7 +751,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#en.wikipedia"),
new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("SU"), null, null).toFilter()
new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("SU")).toFilter()
)
),
new SelectorFilter("rtc.countryName", "States United"),
@ -898,7 +898,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new OrFilter(
ImmutableList.of(
new SelectorFilter("channel", "#ko.wikipedia"),
new InDimFilter("countryIsoCode", ImmutableSet.of("US"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("US")).toFilter()
)
),
new OrFilter(
@ -906,8 +906,8 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new SelectorFilter("channel", "#ko.wikipedia"),
new AndFilter(
ImmutableList.of(
new InDimFilter("countryIsoCode", ImmutableSet.of("US"), null, null).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("VA"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("US")).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("VA")).toFilter()
)
)
)
@ -1004,7 +1004,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#en.wikipedia"),
new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("USCA"), null, null).toFilter()
new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("USCA")).toFilter()
)
),
new SelectorFilter("c1.countryName", "Usca"),
@ -1094,7 +1094,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#en.wikipedia"),
new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("USCA"), null, null).toFilter()
new InDimFilter("JOIN-FILTER-PUSHDOWN-VIRTUAL-COLUMN-0", ImmutableSet.of("USCA")).toFilter()
)
),
new SelectorFilter("c1.v", "Usca"),
@ -1164,7 +1164,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#de.wikipedia"),
new InDimFilter("countryIsoCode", ImmutableSet.of("DE"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("DE")).toFilter()
)
),
new SelectorFilter(FACT_TO_COUNTRY_ON_ISO_CODE_PREFIX + "countryName", "Germany"),
@ -1221,7 +1221,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#de.wikipedia"),
new InDimFilter("countryIsoCode", ImmutableSet.of("DE"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("DE")).toFilter()
)
),
new SelectorFilter(FACT_TO_COUNTRY_ON_ISO_CODE_PREFIX + "v", "Germany"),
@ -1453,7 +1453,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#en.wikipedia"),
new InDimFilter("countryNumber", ImmutableSet.of("0"), null, null).toFilter()
new InDimFilter("countryNumber", ImmutableSet.of("0")).toFilter()
)
),
new SelectorFilter(FACT_TO_COUNTRY_ON_NUMBER_PREFIX + "countryName", "Australia"),
@ -1516,7 +1516,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#en.wikipedia"),
new InDimFilter("countryNumber", ImmutableSet.of("0"), null, null).toFilter()
new InDimFilter("countryNumber", ImmutableSet.of("0")).toFilter()
)
),
new SelectorFilter(FACT_TO_COUNTRY_ON_NUMBER_PREFIX + "v", "Australia"),
@ -1683,7 +1683,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#es.wikipedia"),
new InDimFilter("countryIsoCode", ImmutableSet.of("SV"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("SV")).toFilter()
)
),
new SelectorFilter(FACT_TO_COUNTRY_ON_ISO_CODE_PREFIX + "countryName", "El Salvador"),
@ -1740,7 +1740,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new AndFilter(
ImmutableList.of(
new SelectorFilter("channel", "#es.wikipedia"),
new InDimFilter("countryIsoCode", ImmutableSet.of("SV"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("SV")).toFilter()
)
),
new SelectorFilter(FACT_TO_COUNTRY_ON_ISO_CODE_PREFIX + "v", "El Salvador"),
@ -1916,8 +1916,8 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
JoinFilterSplit expectedFilterSplit = new JoinFilterSplit(
new AndFilter(
ImmutableList.of(
new InDimFilter("countryIsoCode", ImmutableSet.of("MMMM"), null, null).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("MMMM")).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM")).toFilter()
)
),
new SelectorFilter("r1.regionName", "Fourems Province"),
@ -1987,7 +1987,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
JoinFilterSplit expectedFilterSplit = new JoinFilterSplit(
new OrFilter(
ImmutableList.of(
new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM"), null, null).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM")).toFilter(),
new SelectorFilter("regionIsoCode", "AAAA")
)
),
@ -2288,7 +2288,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
null
)),
expressionFilter,
new InDimFilter("rtc.countryIsoCode", ImmutableSet.of("CA", "CA2", "CA3"), null, null).toFilter(),
new InDimFilter("rtc.countryIsoCode", ImmutableSet.of("CA", "CA2", "CA3")).toFilter(),
new OrFilter(
ImmutableList.of(
new SelectorFilter("channel", "#fr.wikipedia"),
@ -2414,7 +2414,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
ImmutableList.of(
new SelectorFilter(rewrittenRegionIsoCodeColumnName, "ON"),
new SelectorFilter(rewrittenCountryIsoCodeColumnName, "CA"),
new InDimFilter(rewrittenCountryIsoCodeColumnName, ImmutableSet.of("CA"), null, null).toFilter(),
new InDimFilter(rewrittenCountryIsoCodeColumnName, ImmutableSet.of("CA")).toFilter(),
new BoundFilter(new BoundDimFilter(
rewrittenCountryIsoCodeColumnName,
"CA",
@ -2427,9 +2427,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
)),
new InDimFilter(
rewrittenCountryIsoCodeColumnName,
ImmutableSet.of("CA", "CA2", "CA3"),
null,
null
ImmutableSet.of("CA", "CA2", "CA3")
).toFilter(),
new OrFilter(
ImmutableList.of(
@ -2453,9 +2451,7 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
new SelectorFilter(rewrittenCountryIsoCodeColumnName, "ABCDEF"),
new InDimFilter(
rewrittenCountryIsoCodeColumnName,
ImmutableSet.of("CA"),
null,
null
ImmutableSet.of("CA")
).toFilter(),
new BoundFilter(new BoundDimFilter(
rewrittenCountryIsoCodeColumnName,
@ -2549,10 +2545,10 @@ public class JoinFilterAnalyzerTest extends BaseHashJoinSegmentStorageAdapterTes
JoinFilterSplit expectedFilterSplit = new JoinFilterSplit(
new AndFilter(
ImmutableList.of(
new InDimFilter("countryIsoCode", ImmutableSet.of("US"), null, null).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("CA"), null, null).toFilter(),
new InDimFilter("countryIsoCode", ImmutableSet.of("MMMM", "AAAA"), null, null).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM", "AAAA"), null, null).toFilter()
new InDimFilter("countryIsoCode", ImmutableSet.of("US")).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("CA")).toFilter(),
new InDimFilter("countryIsoCode", ImmutableSet.of("MMMM", "AAAA")).toFilter(),
new InDimFilter("regionIsoCode", ImmutableSet.of("MMMM", "AAAA")).toFilter()
)
),
originalFilter,

View File

@ -19,7 +19,6 @@
package org.apache.druid.sql.calcite.expression.builtin;
import com.google.common.collect.Sets;
import org.apache.calcite.rex.RexCall;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.sql.SqlFunction;
@ -40,6 +39,7 @@ import org.apache.druid.sql.calcite.planner.PlannerContext;
import org.apache.druid.sql.calcite.rel.VirtualColumnRegistry;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.List;
public class ArrayOverlapOperatorConversion extends BaseExpressionDimFilterOperatorConversion
@ -123,7 +123,7 @@ public class ArrayOverlapOperatorConversion extends BaseExpressionDimFilterOpera
} else {
return new InDimFilter(
simpleExtractionExpr.getSimpleExtraction().getColumn(),
Sets.newHashSet(arrayElements),
new InDimFilter.ValuesSet(Arrays.asList(arrayElements)),
simpleExtractionExpr.getSimpleExtraction().getExtractionFn(),
null
);

View File

@ -20,7 +20,6 @@
package org.apache.druid.sql.calcite.filtration;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.query.filter.DimFilter;
import org.apache.druid.query.filter.InDimFilter;
@ -34,7 +33,6 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class ConvertSelectorsToIns extends BottomUpTransform
{
@ -80,7 +78,7 @@ public class ConvertSelectorsToIns extends BottomUpTransform
final List<SelectorDimFilter> filterList = entry.getValue();
if (filterList.size() > 1) {
// We found a simplification. Remove the old filters and add new ones.
final Set<String> values = Sets.newHashSetWithExpectedSize(filterList.size());
final InDimFilter.ValuesSet values = new InDimFilter.ValuesSet();
for (final SelectorDimFilter selector : filterList) {
values.add(selector.getValue());