From 78d0b0abce2b27b88a53790684d81530a2318fc9 Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Wed, 16 Nov 2022 07:15:00 -0800 Subject: [PATCH] Add string comparison methods to StringUtils, fix dictionary comparisons. (#13364) * Add string comparison methods to StringUtils, fix dictionary comparisons. There are various places in Druid code where we assume that String.compareTo is consistent with Unicode code-point ordering. Sadly this is not the case. To help deal with this, this patch introduces the following helpers: 1) compareUnicode: Compares two Strings in Unicode code-point order. 2) compareUtf8: Compares two UTF-8 byte arrays in Unicode code-point order. Equivalent to comparison as unsigned bytes. 3) compareUtf8UsingJavaStringOrdering: Compares two UTF-8 byte arrays, or ByteBuffers, in a manner consistent with String.compareTo. There is no helper for comparing two Strings in a manner consistent with String.compareTo, because for that we can use compareTo directly. The patch also fixes an inconsistency between the String and UTF-8 dictionary GenericIndexed flavors of string-typed columns: they were formerly using incompatible comparators. * Adjust test. * FrontCodedIndexed updates. * Add test. * Fix comments. --- .../druid/benchmark/BoundFilterBenchmark.java | 2 +- ...ryEncodedStringIndexSupplierBenchmark.java | 2 +- .../DimensionPredicateFilterBenchmark.java | 2 +- .../benchmark/FrontCodedIndexedBenchmark.java | 2 +- .../druid/benchmark/InFilterBenchmark.java | 2 +- .../druid/benchmark/LikeFilterBenchmark.java | 2 +- .../java/util/common/ByteBufferUtils.java | 52 ++----- .../druid/java/util/common/StringUtils.java | 142 ++++++++++++++++++ .../java/util/common/ByteBufferUtilsTest.java | 59 ++++++-- .../java/util/common/StringUtilsTest.java | 96 ++++++++++++ .../druid/frame/write/FrameWriterUtils.java | 2 +- .../druid/query/filter/InDimFilter.java | 2 +- .../query/ordering/StringComparators.java | 8 +- .../org/apache/druid/segment/IndexIO.java | 2 +- .../column/IndexedUtf8ValueSetIndex.java | 2 +- .../segment/column/Utf8ValueSetIndex.java | 2 +- .../druid/segment/data/FrontCodedIndexed.java | 20 +-- .../segment/data/FrontCodedIndexedWriter.java | 24 ++- .../druid/segment/data/GenericIndexed.java | 13 +- .../nested/NestedDataColumnSupplier.java | 4 +- .../DictionaryEncodedColumnPartSerde.java | 2 +- .../segment/data/FrontCodedIndexedTest.java | 4 +- .../filter/ExtractionDimFilterTest.java | 2 +- .../PredicateValueMatcherFactoryTest.java | 4 +- .../segment/filter/ValueMatchersTest.java | 6 +- ...edFieldLiteralColumnIndexSupplierTest.java | 2 +- ...tionaryEncodedStringIndexSupplierTest.java | 4 +- 27 files changed, 359 insertions(+), 105 deletions(-) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java index cdb3cf2f7ca..819c528198b 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/BoundFilterBenchmark.java @@ -179,7 +179,7 @@ public class BoundFilterBenchmark final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( FluentIterable.from(ints) .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ); selector = new MockColumnIndexSelector( bitmapFactory, diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java index 282b25e1986..1806f28ad1d 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/DictionaryEncodedStringIndexSupplierBenchmark.java @@ -101,7 +101,7 @@ public class DictionaryEncodedStringIndexSupplierBenchmark final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( FluentIterable.from(ints) .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ); final GenericIndexed bitmaps = GenericIndexed.fromIterable( () -> IntStream.range(0, dictionarySize) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java index f01b09a79fe..34a35b559a0 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/DimensionPredicateFilterBenchmark.java @@ -130,7 +130,7 @@ public class DimensionPredicateFilterBenchmark final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( FluentIterable.from(ints) .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ); final GenericIndexed bitmaps = GenericIndexed.fromIterable( FluentIterable.from(ints) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java index 2dba1ba5c0e..30656630656 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/FrontCodedIndexedBenchmark.java @@ -174,7 +174,7 @@ public class FrontCodedIndexedBenchmark genericIndexed = GenericIndexed.read( byteBufferGeneric, - GenericIndexed.BYTE_BUFFER_STRATEGY, + GenericIndexed.UTF8_STRATEGY, SmooshedFileMapper.load(smooshDirFrontCoded) ); frontCodedIndexed = FrontCodedIndexed.read( diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java index 09c3253af08..0a97367493d 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/InFilterBenchmark.java @@ -93,7 +93,7 @@ public class InFilterBenchmark final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( FluentIterable.from(ints) .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ); final GenericIndexed bitmaps = GenericIndexed.fromIterable( () -> IntStream.range(0, dictionarySize) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java index 1369d69787d..bb85422792f 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/LikeFilterBenchmark.java @@ -130,7 +130,7 @@ public class LikeFilterBenchmark final GenericIndexed dictionaryUtf8 = GenericIndexed.fromIterable( FluentIterable.from(ints) .transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ); final GenericIndexed bitmaps = GenericIndexed.fromIterable( FluentIterable.from(ints) diff --git a/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java b/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java index 8209049b844..fb67c323834 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java +++ b/core/src/main/java/org/apache/druid/java/util/common/ByteBufferUtils.java @@ -47,7 +47,7 @@ public class ByteBufferUtils // null if unmap is supported private static final RuntimeException UNMAP_NOT_SUPPORTED_EXCEPTION; - private static final Comparator COMPARATOR_UNSIGNED = new UnsignedByteBufferComparator(); + private static final Comparator COMPARATOR_UTF8 = new Utf8ByteBufferComparator(); static { Object unmap = null; @@ -214,40 +214,12 @@ public class ByteBufferUtils } /** - * Compares two ByteBuffer ranges using unsigned byte ordering. + * Compares two ByteBuffers from their positions to their limits using ordering consistent with + * {@link String#compareTo(String)}. Null buffers are accepted, and are ordered earlier than any nonnull buffer. * - * Different from {@link ByteBuffer#compareTo}, which uses signed ordering. + * Different from {@link ByteBuffer#compareTo}, which uses signed-bytes ordering. */ - public static int compareByteBuffers( - final ByteBuffer buf1, - final int position1, - final int length1, - final ByteBuffer buf2, - final int position2, - final int length2 - ) - { - final int commonLength = Math.min(length1, length2); - - for (int i = 0; i < commonLength; i++) { - final byte byte1 = buf1.get(position1 + i); - final byte byte2 = buf2.get(position2 + i); - final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison - if (cmp != 0) { - return cmp; - } - } - - return Integer.compare(length1, length2); - } - - /** - * Compares two ByteBuffers from their positions to their limits using unsigned byte ordering. Accepts null - * buffers, which are ordered earlier than any nonnull buffer. - * - * Different from {@link ByteBuffer#compareTo}, which uses signed ordering. - */ - public static int compareByteBuffers( + public static int compareUtf8ByteBuffers( @Nullable final ByteBuffer buf1, @Nullable final ByteBuffer buf2 ) @@ -260,7 +232,7 @@ public class ByteBufferUtils return 1; } - return ByteBufferUtils.compareByteBuffers( + return StringUtils.compareUtf8UsingJavaStringOrdering( buf1, buf1.position(), buf1.remaining(), @@ -271,20 +243,20 @@ public class ByteBufferUtils } /** - * Comparator that compares two {@link ByteBuffer} using unsigned ordering. Null buffers are accepted, and - * are ordered earlier than any nonnull buffer. + * Comparator that compares two {@link ByteBuffer} using ordering consistent with {@link String#compareTo(String)}. + * Null buffers are accepted, and are ordered earlier than any nonnull buffer. */ - public static Comparator unsignedComparator() + public static Comparator utf8Comparator() { - return COMPARATOR_UNSIGNED; + return COMPARATOR_UTF8; } - private static class UnsignedByteBufferComparator implements Comparator + private static class Utf8ByteBufferComparator implements Comparator { @Override public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2) { - return ByteBufferUtils.compareByteBuffers(o1, o2); + return compareUtf8ByteBuffers(o1, o2); } } } diff --git a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java index bd17f42c403..41078961a0c 100644 --- a/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java +++ b/core/src/main/java/org/apache/druid/java/util/common/StringUtils.java @@ -77,6 +77,148 @@ public class StringUtils return string == null ? EMPTY_BYTES : toUtf8(string); } + /** + * Compares two Java Strings in Unicode code-point order. + * + * Order is consistent with {@link #compareUtf8(byte[], byte[])}, but is not consistent with + * {@link String#compareTo(String)}. + */ + public static int compareUnicode(final String a, final String b) + { + final int commonLength = Math.min(a.length(), b.length()); + + for (int i = 0; i < commonLength; i++) { + int char1 = a.charAt(i) & 0xFFFF; // Unsigned + int char2 = b.charAt(i) & 0xFFFF; // Unsigned + + if (char1 != char2 && char1 >= 0xd800 && char2 >= 0xd800) { + // Fixup logic for code units at or above the surrogate range, based on logic described at + // https://www.icu-project.org/docs/papers/utf16_code_point_order.html. + // + // If both code units are at or above the surrogate range (>= 0xd800) then adjust non-surrogates (legitimate + // single-code-unit characters) to be below the surrogate range, so they compare earlier than surrogates. + + if (!Character.isSurrogate((char) char1)) { + char1 -= 0x2800; + } + + if (!Character.isSurrogate((char) char2)) { + char2 -= 0x2800; + } + } + + final int cmp = char1 - char2; + if (cmp != 0) { + return cmp; + } + } + + return Integer.compare(a.length(), b.length()); + } + + /** + * Compares two UTF-8 byte strings in Unicode code-point order. + * + * Equivalent to a comparison of the two byte arrays as if they were unsigned bytes. + * + * Order is consistent with {@link #compareUnicode(String, String)}, but is not consistent with + * {@link String#compareTo(String)}. For an ordering consistent with {@link String#compareTo(String)}, use + * {@link #compareUtf8UsingJavaStringOrdering(byte[], byte[])} instead. + */ + public static int compareUtf8(final byte[] a, final byte[] b) + { + final int commonLength = Math.min(a.length, b.length); + + for (int i = 0; i < commonLength; i++) { + final byte byte1 = a[i]; + final byte byte2 = b[i]; + final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison + if (cmp != 0) { + return cmp; + } + } + + return Integer.compare(a.length, b.length); + } + + /** + * Compares two UTF-8 byte strings in UTF-16 code-unit order. + * + * Order is consistent with {@link String#compareTo(String)}, but is not consistent with + * {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[], byte[])}. + */ + public static int compareUtf8UsingJavaStringOrdering(final byte[] a, final byte[] b) + { + final int commonLength = Math.min(a.length, b.length); + + for (int i = 0; i < commonLength; i++) { + final int cmp = compareUtf8UsingJavaStringOrdering(a[i], b[i]); + if (cmp != 0) { + return cmp; + } + } + + return Integer.compare(a.length, b.length); + } + + /** + * Compares two UTF-8 byte strings in UTF-16 code-unit order. + * + * Order is consistent with {@link String#compareTo(String)}, but is not consistent with + * {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[], byte[])}. + */ + public static int compareUtf8UsingJavaStringOrdering( + final ByteBuffer buf1, + final int position1, + final int length1, + final ByteBuffer buf2, + final int position2, + final int length2 + ) + { + final int commonLength = Math.min(length1, length2); + + for (int i = 0; i < commonLength; i++) { + final int cmp = compareUtf8UsingJavaStringOrdering(buf1.get(position1 + i), buf2.get(position2 + i)); + if (cmp != 0) { + return cmp; + } + } + + return Integer.compare(length1, length2); + } + + /** + * Compares two bytes from UTF-8 strings in such a way that the entire byte arrays are compared in UTF-16 + * code-unit order. + * + * Compatible with {@link #compareUtf8UsingJavaStringOrdering(byte[], byte[])} and + * {@link #compareUtf8UsingJavaStringOrdering(ByteBuffer, int, int, ByteBuffer, int, int)}. + */ + public static int compareUtf8UsingJavaStringOrdering(byte byte1, byte byte2) + { + // Treat as unsigned bytes. + int ubyte1 = byte1 & 0xFF; + int ubyte2 = byte2 & 0xFF; + + if (ubyte1 != ubyte2 && ubyte1 >= 0xEE && ubyte2 >= 0xEE) { + // Fixup logic for lead bytes for U+E000 ... U+FFFF, based on logic described at + // https://www.icu-project.org/docs/papers/utf16_code_point_order.html. + // + // Move possible lead bytes for this range (0xEE and 0xEF) above all other bytes, so they compare later. + + if (ubyte1 == 0xEE || ubyte1 == 0xEF) { + ubyte1 += 0xFF; + } + + if (ubyte2 == 0xEE || ubyte2 == 0xEF) { + ubyte2 += 0xFF; + } + } + + return ubyte1 - ubyte2; + } + public static String fromUtf8(final byte[] bytes) { try { diff --git a/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java b/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java index f5acb6e0308..c32f29bfdd3 100644 --- a/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java +++ b/core/src/test/java/org/apache/druid/java/util/common/ByteBufferUtilsTest.java @@ -19,6 +19,7 @@ package org.apache.druid.java.util.common; +import com.google.common.collect.ImmutableList; import com.google.common.io.Files; import org.apache.druid.collections.ResourceHolder; import org.hamcrest.MatcherAssert; @@ -36,9 +37,28 @@ import java.nio.ByteBuffer; import java.nio.MappedByteBuffer; import java.util.Arrays; import java.util.Comparator; +import java.util.List; public class ByteBufferUtilsTest { + private static final List COMPARE_TEST_STRINGS = ImmutableList.of( + "(請參見已被刪除版本)", + "請參見已被刪除版本", + "שָׁלוֹם", + "+{{[[Template:別名重定向|別名重定向]]}}", + "\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D", + "\uD83D\uDCA9", + "", + "f", + "fo", + "\uD83D\uDE42", + "\uD83E\uDEE5", + "\uD83E\uDD20", + "quick", + "brown", + "fox" + ); + @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); @@ -82,9 +102,9 @@ public class ByteBufferUtilsTest @Test @SuppressWarnings("EqualsWithItself") - public void testUnsignedComparator() + public void testUtf8Comparator() { - final Comparator comparator = ByteBufferUtils.unsignedComparator(); + final Comparator comparator = ByteBufferUtils.utf8Comparator(); // Tests involving null MatcherAssert.assertThat(comparator.compare(null, null), Matchers.equalTo(0)); @@ -112,18 +132,33 @@ public class ByteBufferUtilsTest Matchers.greaterThan(0) ); - // Tests involving the full range of bytes - for (byte i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) { - for (byte j = Byte.MIN_VALUE; j < Byte.MAX_VALUE; j++) { - final int cmp = Integer.compare(Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)); + for (final String string1 : COMPARE_TEST_STRINGS) { + for (final String string2 : COMPARE_TEST_STRINGS) { + final byte[] utf8Bytes1 = StringUtils.toUtf8(string1); + final byte[] utf8Bytes2 = StringUtils.toUtf8(string2); + final ByteBuffer utf8ByteBuffer1 = ByteBuffer.allocate(utf8Bytes1.length + 2); + final ByteBuffer utf8ByteBuffer2 = ByteBuffer.allocate(utf8Bytes2.length + 2); + utf8ByteBuffer1.position(1); + utf8ByteBuffer1.put(utf8Bytes1, 0, utf8Bytes1.length).position(utf8Bytes1.length); + utf8ByteBuffer1.position(1).limit(1 + utf8Bytes1.length); + utf8ByteBuffer2.position(1); + utf8ByteBuffer2.put(utf8Bytes2, 0, utf8Bytes2.length).position(utf8Bytes2.length); + utf8ByteBuffer2.position(1).limit(1 + utf8Bytes2.length); - MatcherAssert.assertThat( - StringUtils.format("comparison of %s to %s", Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)), - comparator.compare( - ByteBuffer.wrap(new byte[]{i}), - ByteBuffer.wrap(new byte[]{j}) + final int compareByteBufferUtilsUtf8 = ByteBufferUtils.utf8Comparator().compare( + utf8ByteBuffer1, + utf8ByteBuffer2 + ); + + Assert.assertEquals( + StringUtils.format( + "compareByteBufferUtilsUtf8(byte[]) (actual) " + + "matches compareJavaString (expected) for [%s] vs [%s]", + string1, + string2 ), - cmp < 0 ? Matchers.lessThan(0) : cmp > 0 ? Matchers.greaterThan(0) : Matchers.equalTo(0) + (int) Math.signum(string1.compareTo(string2)), + (int) Math.signum(compareByteBufferUtilsUtf8) ); } } diff --git a/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java b/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java index 754e7237c76..3f2d5713c2f 100644 --- a/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java +++ b/core/src/test/java/org/apache/druid/java/util/common/StringUtilsTest.java @@ -19,6 +19,7 @@ package org.apache.druid.java.util.common; +import com.google.common.collect.ImmutableList; import org.apache.druid.collections.ResourceHolder; import org.junit.Assert; import org.junit.Rule; @@ -28,12 +29,31 @@ import org.junit.rules.ExpectedException; import java.io.UnsupportedEncodingException; import java.nio.BufferUnderflowException; import java.nio.ByteBuffer; +import java.util.List; /** * */ public class StringUtilsTest { + private static final List COMPARE_TEST_STRINGS = ImmutableList.of( + "(請參見已被刪除版本)", + "請參見已被刪除版本", + "שָׁלוֹם", + "+{{[[Template:別名重定向|別名重定向]]}}", + "\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D", + "\uD83D\uDCA9", + "", + "f", + "fo", + "\uD83D\uDE42", + "\uD83E\uDEE5", + "\uD83E\uDD20", + "quick", + "brown", + "fox" + ); + @Rule public ExpectedException expectedException = ExpectedException.none(); @@ -290,4 +310,80 @@ public class StringUtilsTest Assert.assertEquals("smile ", StringUtils.fastLooseChop("smile 🙂 for the camera", 6)); Assert.assertEquals("smile", StringUtils.fastLooseChop("smile 🙂 for the camera", 5)); } + + @Test + public void testUnicodeStringCompare() + { + for (final String string1 : COMPARE_TEST_STRINGS) { + for (final String string2 : COMPARE_TEST_STRINGS) { + final int compareUnicode = StringUtils.compareUnicode(string1, string2); + final int compareUtf8 = StringUtils.compareUtf8( + StringUtils.toUtf8(string1), + StringUtils.toUtf8(string2) + ); + + Assert.assertEquals( + StringUtils.format( + "compareUnicode (actual) matches compareUtf8 (expected) for [%s] vs [%s]", + string1, + string2 + ), + (int) Math.signum(compareUtf8), + (int) Math.signum(compareUnicode) + ); + } + } + } + + @Test + public void testJavaStringCompare() + { + for (final String string1 : COMPARE_TEST_STRINGS) { + for (final String string2 : COMPARE_TEST_STRINGS) { + final int compareJavaString = string1.compareTo(string2); + + final byte[] utf8Bytes1 = StringUtils.toUtf8(string1); + final byte[] utf8Bytes2 = StringUtils.toUtf8(string2); + final int compareByteArrayUtf8UsingJavaStringOrdering = + StringUtils.compareUtf8UsingJavaStringOrdering(utf8Bytes1, utf8Bytes2); + + final ByteBuffer utf8ByteBuffer1 = ByteBuffer.allocate(utf8Bytes1.length + 2); + final ByteBuffer utf8ByteBuffer2 = ByteBuffer.allocate(utf8Bytes2.length + 2); + utf8ByteBuffer1.position(1); + utf8ByteBuffer1.put(utf8Bytes1, 0, utf8Bytes1.length).position(utf8Bytes1.length); + utf8ByteBuffer2.position(1); + utf8ByteBuffer2.put(utf8Bytes2, 0, utf8Bytes2.length).position(utf8Bytes2.length); + final int compareByteBufferUtf8UsingJavaStringOrdering = StringUtils.compareUtf8UsingJavaStringOrdering( + utf8ByteBuffer1, + 1, + utf8Bytes1.length, + utf8ByteBuffer2, + 1, + utf8Bytes2.length + ); + + Assert.assertEquals( + StringUtils.format( + "compareUtf8UsingJavaStringOrdering(byte[]) (actual) " + + "matches compareJavaString (expected) for [%s] vs [%s]", + string1, + string2 + ), + (int) Math.signum(compareJavaString), + (int) Math.signum(compareByteArrayUtf8UsingJavaStringOrdering) + ); + + Assert.assertEquals( + StringUtils.format( + "compareByteBufferUtf8UsingJavaStringOrdering(ByteBuffer) (actual) " + + "matches compareJavaString (expected) for [%s] vs [%s]", + string1, + string2 + ), + (int) Math.signum(compareJavaString), + (int) Math.signum(compareByteBufferUtf8UsingJavaStringOrdering) + ); + } + } + } } diff --git a/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java b/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java index 961e99a3f0d..ac6d9bfe651 100644 --- a/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java +++ b/processing/src/main/java/org/apache/druid/frame/write/FrameWriterUtils.java @@ -205,7 +205,7 @@ public class FrameWriterUtils /** * Copies "len" bytes from {@code src.position()} to "dstPosition" in "memory". Does not update the position of src. * - * @throws InvalidNullByteException "allowNullBytes" is true and a null byte is encountered + * @throws InvalidNullByteException if "allowNullBytes" is false and a null byte is encountered */ public static void copyByteBufferToMemory( final ByteBuffer src, diff --git a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java index 2dec044cf19..afddb0e42af 100644 --- a/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java +++ b/processing/src/main/java/org/apache/druid/query/filter/InDimFilter.java @@ -674,7 +674,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter public SortedSet toUtf8() { - final TreeSet valuesUtf8 = new TreeSet<>(ByteBufferUtils.unsignedComparator()); + final TreeSet valuesUtf8 = new TreeSet<>(ByteBufferUtils.utf8Comparator()); for (final String value : values) { if (value == null) { diff --git a/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java b/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java index 58e228ad2c6..4fdcc5c6f3a 100644 --- a/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java +++ b/processing/src/main/java/org/apache/druid/query/ordering/StringComparators.java @@ -47,9 +47,15 @@ public class StringComparators public static final int STRLEN_CACHE_ID = 0x04; public static final int VERSION_CACHE_ID = 0x05; + /** + * Comparison using the natural comparator of {@link String}. + * + * Note that this is not equivalent to comparing UTF-8 byte arrays; see javadocs for + * {@link org.apache.druid.java.util.common.StringUtils#compareUnicode(String, String)} and + * {@link org.apache.druid.java.util.common.StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}. + */ public static class LexicographicComparator extends StringComparator { - // Equivalent to comparing UTF-8 encoded strings as byte arrays. private static final Ordering ORDERING = Ordering.from(String::compareTo).nullsFirst(); @Override diff --git a/processing/src/main/java/org/apache/druid/segment/IndexIO.java b/processing/src/main/java/org/apache/druid/segment/IndexIO.java index 9698ebdc2be..9b74f71768b 100644 --- a/processing/src/main/java/org/apache/druid/segment/IndexIO.java +++ b/processing/src/main/java/org/apache/druid/segment/IndexIO.java @@ -379,7 +379,7 @@ public class IndexIO // Duplicate the first buffer since we are reading the dictionary twice. dimValueLookups.put(dimension, GenericIndexed.read(dimBuffer.duplicate(), GenericIndexed.STRING_STRATEGY)); - dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY)); + dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer, GenericIndexed.UTF8_STRATEGY)); dimColumns.put(dimension, VSizeColumnarMultiInts.readFromByteBuffer(dimBuffer)); } diff --git a/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java b/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java index 5680a1400f0..c568e78d9b0 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java +++ b/processing/src/main/java/org/apache/druid/segment/column/IndexedUtf8ValueSetIndex.java @@ -47,7 +47,7 @@ public final class IndexedUtf8ValueSetIndex COMPARATOR = ByteBufferUtils.unsignedComparator(); + private static final Comparator COMPARATOR = ByteBufferUtils.utf8Comparator(); private final BitmapFactory bitmapFactory; private final TDictionary dictionary; diff --git a/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java b/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java index ef0d08ee0a3..6598e36f206 100644 --- a/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java +++ b/processing/src/main/java/org/apache/druid/segment/column/Utf8ValueSetIndex.java @@ -29,7 +29,7 @@ public interface Utf8ValueSetIndex /** * Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the specified set of values (if they are * contained in the underlying column). The set must be sorted using - * {@link org.apache.druid.java.util.common.ByteBufferUtils#unsignedComparator()}. + * {@link org.apache.druid.java.util.common.ByteBufferUtils#utf8Comparator()}. */ BitmapColumnIndex forSortedValuesUtf8(SortedSet valuesUtf8); } diff --git a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java index d2d6c28d340..2596f7ec2bf 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java +++ b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexed.java @@ -23,6 +23,7 @@ import com.google.common.base.Preconditions; import com.google.common.base.Supplier; import org.apache.druid.common.config.NullHandling; import org.apache.druid.java.util.common.ISE; +import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector; import javax.annotation.Nullable; @@ -340,10 +341,13 @@ public final class FrontCodedIndexed implements Indexed /** - * Performs an unsigned byte comparison of the first value in a bucket with the specified value. Note that this method + * Performs byte-by-byte comparison of the first value in a bucket with the specified value. Note that this method * MUST be prepared before calling, as it expects the length of the first value to have already been read externally, * and the buffer position to be at the start of the first bucket value. The final buffer position will be the - * 'shared prefix length' of the first value in the bucket and the value to compare + * 'shared prefix length' of the first value in the bucket and the value to compare. + * + * Bytes are compared using {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte, byte)}. Therefore, when the + * values are UTF-8 encoded strings, the ordering is compatible with {@link String#compareTo(String)}. */ private static int compareBucketFirstValue(ByteBuffer bucketBuffer, int length, ByteBuffer value) { @@ -355,7 +359,7 @@ public final class FrontCodedIndexed implements Indexed int sharedPrefix; int comparison = 0; for (sharedPrefix = 0; sharedPrefix < commonLength; sharedPrefix++) { - comparison = unsignedByteCompare(bucketBuffer.get(), value.get(sharedPrefix)); + comparison = StringUtils.compareUtf8UsingJavaStringOrdering(bucketBuffer.get(), value.get(sharedPrefix)); if (comparison != 0) { bucketBuffer.position(startOffset + sharedPrefix); break; @@ -403,7 +407,10 @@ public final class FrontCodedIndexed implements Indexed final int common = Math.min(fragmentLength, value.remaining() - prefixLength); int fragmentComparison = 0; for (int i = 0; i < common; i++) { - fragmentComparison = unsignedByteCompare(buffer.get(buffer.position() + i), value.get(prefixLength + i)); + fragmentComparison = StringUtils.compareUtf8UsingJavaStringOrdering( + buffer.get(buffer.position() + i), + value.get(prefixLength + i) + ); if (fragmentComparison != 0) { break; } @@ -502,9 +509,4 @@ public final class FrontCodedIndexed implements Indexed } return bucketBuffers; } - - public static int unsignedByteCompare(byte b1, byte b2) - { - return (b1 & 0xFF) - (b2 & 0xFF); - } } diff --git a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java index b6120d6c123..bcbe47db624 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java +++ b/processing/src/main/java/org/apache/druid/segment/data/FrontCodedIndexedWriter.java @@ -44,8 +44,8 @@ import java.nio.channels.WritableByteChannel; * the bucket is written entirely, and remaining values are stored as pairs of an integer which indicates how much * of the first byte array of the bucket to use as a prefix, followed by the remaining value bytes after the prefix. * - * This is valid to use for any values which can be compared byte by byte with unsigned comparison. Otherwise, this - * is not the collection for you. + * This writer is designed for use with UTF-8 encoded strings that are written in an order compatible with + * {@link String#compareTo(String)}. * * @see FrontCodedIndexed for additional details. */ @@ -99,7 +99,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter @Override public void write(@Nullable byte[] value) throws IOException { - if (prevObject != null && unsignedCompare(prevObject, value) >= 0) { + if (prevObject != null && compareNullableUtf8UsingJavaStringOrdering(prevObject, value) >= 0) { throw new ISE( "Values must be sorted and unique. Element [%s] with value [%s] is before or equivalent to [%s]", numWritten, @@ -283,7 +283,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter // all other values must be partitioned into a prefix length and suffix bytes int prefixLength = 0; for (; prefixLength < first.length; prefixLength++) { - final int cmp = FrontCodedIndexed.unsignedByteCompare(first[prefixLength], next[prefixLength]); + final int cmp = StringUtils.compareUtf8UsingJavaStringOrdering(first[prefixLength], next[prefixLength]); if (cmp != 0) { break; } @@ -325,7 +325,11 @@ public class FrontCodedIndexedWriter implements DictionaryWriter return buffer.position() - pos; } - public static int unsignedCompare( + /** + * Same as {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}, but accepts nulls. Nulls are + * sorted first. + */ + private static int compareNullableUtf8UsingJavaStringOrdering( @Nullable final byte[] b1, @Nullable final byte[] b2 ) @@ -337,15 +341,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter if (b2 == null) { return 1; } - final int commonLength = Math.min(b1.length, b2.length); - for (int i = 0; i < commonLength; i++) { - final int cmp = FrontCodedIndexed.unsignedByteCompare(b1[i], b2[i]); - if (cmp != 0) { - return cmp; - } - } - - return Integer.compare(b1.length, b2.length); + return StringUtils.compareUtf8UsingJavaStringOrdering(b1, b2); } } diff --git a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java index ff1c570b03a..62f50b0dc60 100644 --- a/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java +++ b/processing/src/main/java/org/apache/druid/segment/data/GenericIndexed.java @@ -100,13 +100,16 @@ public class GenericIndexed implements CloseableIndexed, Serializer private static final SerializerUtils SERIALIZER_UTILS = new SerializerUtils(); /** - * An ObjectStrategy that returns a big-endian ByteBuffer pointing to the original data. + * An ObjectStrategy that returns a big-endian ByteBuffer pointing to original data. * * The returned ByteBuffer is a fresh read-only instance, so it is OK for callers to modify its position, limit, etc. * However, it does point to the original data, so callers must take care not to use it if the original data may * have been freed. + * + * The compare method of this instance uses {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])} + * so that behavior is consistent with {@link #STRING_STRATEGY}. */ - public static final ObjectStrategy BYTE_BUFFER_STRATEGY = new ObjectStrategy() + public static final ObjectStrategy UTF8_STRATEGY = new ObjectStrategy() { @Override public Class getClazz() @@ -140,7 +143,7 @@ public class GenericIndexed implements CloseableIndexed, Serializer @Override public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2) { - return ByteBufferUtils.unsignedComparator().compare(o1, o2); + return ByteBufferUtils.utf8Comparator().compare(o1, o2); } }; @@ -541,7 +544,7 @@ public class GenericIndexed implements CloseableIndexed, Serializer } //noinspection ObjectEquality - final boolean isByteBufferStrategy = strategy == BYTE_BUFFER_STRATEGY; + final boolean isByteBufferStrategy = strategy == UTF8_STRATEGY; int minIndex = 0; int maxIndex = size - 1; @@ -553,7 +556,7 @@ public class GenericIndexed implements CloseableIndexed, Serializer if (isByteBufferStrategy) { // Specialization avoids ByteBuffer allocation in strategy.fromByteBuffer. ByteBuffer currValue = getByteBuffer(currIndex); - comparison = ByteBufferUtils.compareByteBuffers(currValue, (ByteBuffer) value); + comparison = ByteBufferUtils.compareUtf8ByteBuffers(currValue, (ByteBuffer) value); } else { T currValue = get(currIndex); comparison = strategy.compare(currValue, value); diff --git a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java index 8594cab8cab..39d8cf081bd 100644 --- a/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java +++ b/processing/src/main/java/org/apache/druid/segment/nested/NestedDataColumnSupplier.java @@ -107,7 +107,7 @@ public class NestedDataColumnSupplier implements Supplier // this cannot happen naturally right now since generic indexed is written in the 'legacy' format, but // this provides backwards compatibility should we switch at some point in the future to always // writing dictionaryVersion - dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY, mapper); + dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.UTF8_STRATEGY, mapper); frontCodedDictionarySupplier = null; } else { throw new ISE("impossible, unknown encoding strategy id: %s", encodingId); @@ -117,7 +117,7 @@ public class NestedDataColumnSupplier implements Supplier // as dictionaryVersion is actually also the GenericIndexed version, so we reset start position so the // GenericIndexed version can be correctly read stringDictionaryBuffer.position(dictionaryStartPosition); - dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY, mapper); + dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.UTF8_STRATEGY, mapper); frontCodedDictionarySupplier = null; } final ByteBuffer longDictionaryBuffer = loadInternalFile( diff --git a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java index ef130f5428a..b1eef3307b6 100644 --- a/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java +++ b/processing/src/main/java/org/apache/druid/segment/serde/DictionaryEncodedColumnPartSerde.java @@ -353,7 +353,7 @@ public class DictionaryEncodedColumnPartSerde implements ColumnPartSerde final GenericIndexed rDictionaryUtf8 = GenericIndexed.read( buffer, - GenericIndexed.BYTE_BUFFER_STRATEGY, + GenericIndexed.UTF8_STRATEGY, builder.getFileMapper() ); diff --git a/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java b/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java index f1bd478c954..c9e56135365 100644 --- a/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java +++ b/processing/src/test/java/org/apache/druid/segment/data/FrontCodedIndexedTest.java @@ -241,7 +241,9 @@ public class FrontCodedIndexedTest extends InitializedNullHandlingTest public void testFrontCodedIndexedUnicodes() throws IOException { ByteBuffer buffer = ByteBuffer.allocate(1 << 12).order(order); - List theList = ImmutableList.of("Győ-Moson-Sopron", "Győr"); + + // "\uD83D\uDCA9" and "(請參見已被刪除版本)" are a regression test for https://github.com/apache/druid/pull/13364 + List theList = ImmutableList.of("Győ-Moson-Sopron", "Győr", "\uD83D\uDCA9", "(請參見已被刪除版本)"); fillBuffer(buffer, theList, 4); buffer.position(0); diff --git a/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java b/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java index 268a4a2c971..eaf161036da 100644 --- a/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java +++ b/processing/src/test/java/org/apache/druid/segment/filter/ExtractionDimFilterTest.java @@ -121,7 +121,7 @@ public class ExtractionDimFilterTest extends InitializedNullHandlingTest GenericIndexed.fromIterable(Collections.singletonList("foo1"), GenericIndexed.STRING_STRATEGY), GenericIndexed.fromIterable( Collections.singletonList(ByteBuffer.wrap(StringUtils.toUtf8("foo1"))), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ), GenericIndexed.fromIterable(Collections.singletonList(foo1BitMap), serdeFactory.getObjectStrategy()), null diff --git a/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java b/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java index e10d64241f2..f7525e3f9c4 100644 --- a/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java +++ b/processing/src/test/java/org/apache/druid/segment/filter/PredicateValueMatcherFactoryTest.java @@ -75,7 +75,7 @@ public class PredicateValueMatcherFactoryTest extends InitializedNullHandlingTes ByteBuffer.wrap(StringUtils.toUtf8("v2")), ByteBuffer.wrap(StringUtils.toUtf8("v3")) ), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ), null, () -> VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new int[]{1}))), @@ -98,7 +98,7 @@ public class PredicateValueMatcherFactoryTest extends InitializedNullHandlingTes ByteBuffer.wrap(StringUtils.toUtf8("v2")), ByteBuffer.wrap(StringUtils.toUtf8("v3")) ), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ), null, () -> VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new int[]{1}))), diff --git a/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java b/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java index cb5459226b2..98631d3dc9c 100644 --- a/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java +++ b/processing/src/test/java/org/apache/druid/segment/filter/ValueMatchersTest.java @@ -49,7 +49,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest GenericIndexed.fromIterable(ImmutableList.of("value"), GenericIndexed.STRING_STRATEGY), GenericIndexed.fromIterable( ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ), () -> VSizeColumnarInts.fromArray(new int[]{0}), null, @@ -62,7 +62,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest ByteBuffer.wrap(StringUtils.toUtf8("value")), ByteBuffer.wrap(StringUtils.toUtf8("value2")) ), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ), () -> VSizeColumnarInts.fromArray(new int[]{0, 0, 1, 0, 1}), null, @@ -72,7 +72,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest GenericIndexed.fromIterable(ImmutableList.of("value"), GenericIndexed.STRING_STRATEGY), GenericIndexed.fromIterable( ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))), - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ), null, () -> VSizeColumnarMultiInts.fromIterable( diff --git a/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java b/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java index 6d66464eee8..15b40b729f0 100644 --- a/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java +++ b/processing/src/test/java/org/apache/druid/segment/nested/NestedFieldLiteralColumnIndexSupplierTest.java @@ -127,7 +127,7 @@ public class NestedFieldLiteralColumnIndexSupplierTest extends InitializedNullHa doubleWriter.write(9.9); writeToBuffer(doubleBuffer, doubleWriter); - GenericIndexed strings = GenericIndexed.read(stringBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY); + GenericIndexed strings = GenericIndexed.read(stringBuffer, GenericIndexed.UTF8_STRATEGY); globalStrings = () -> strings.singleThreaded(); globalLongs = FixedIndexed.read(longBuffer, TypeStrategies.LONG, ByteOrder.nativeOrder(), Long.BYTES); globalDoubles = FixedIndexed.read(doubleBuffer, TypeStrategies.DOUBLE, ByteOrder.nativeOrder(), Double.BYTES); diff --git a/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java b/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java index 36d5ba76a05..a9ab64a378a 100644 --- a/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java +++ b/processing/src/test/java/org/apache/druid/segment/serde/DictionaryEncodedStringIndexSupplierTest.java @@ -115,7 +115,7 @@ public class DictionaryEncodedStringIndexSupplierTest extends InitializedNullHan GenericIndexedWriter byteBufferWriter = new GenericIndexedWriter<>( new OnHeapMemorySegmentWriteOutMedium(), "byteBuffers", - GenericIndexed.BYTE_BUFFER_STRATEGY + GenericIndexed.UTF8_STRATEGY ); stringWriter.open(); @@ -167,7 +167,7 @@ public class DictionaryEncodedStringIndexSupplierTest extends InitializedNullHan return new DictionaryEncodedStringIndexSupplier( roaringFactory.getBitmapFactory(), GenericIndexed.read(stringBuffer, GenericIndexed.STRING_STRATEGY), - GenericIndexed.read(byteBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY), + GenericIndexed.read(byteBuffer, GenericIndexed.UTF8_STRATEGY), bitmaps, null );