mirror of https://github.com/apache/druid.git
Add string comparison methods to StringUtils, fix dictionary comparisons. (#13364)
* Add string comparison methods to StringUtils, fix dictionary comparisons. There are various places in Druid code where we assume that String.compareTo is consistent with Unicode code-point ordering. Sadly this is not the case. To help deal with this, this patch introduces the following helpers: 1) compareUnicode: Compares two Strings in Unicode code-point order. 2) compareUtf8: Compares two UTF-8 byte arrays in Unicode code-point order. Equivalent to comparison as unsigned bytes. 3) compareUtf8UsingJavaStringOrdering: Compares two UTF-8 byte arrays, or ByteBuffers, in a manner consistent with String.compareTo. There is no helper for comparing two Strings in a manner consistent with String.compareTo, because for that we can use compareTo directly. The patch also fixes an inconsistency between the String and UTF-8 dictionary GenericIndexed flavors of string-typed columns: they were formerly using incompatible comparators. * Adjust test. * FrontCodedIndexed updates. * Add test. * Fix comments.
This commit is contained in:
parent
71b133f3ff
commit
78d0b0abce
|
@ -179,7 +179,7 @@ public class BoundFilterBenchmark
|
|||
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
|
||||
FluentIterable.from(ints)
|
||||
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
);
|
||||
selector = new MockColumnIndexSelector(
|
||||
bitmapFactory,
|
||||
|
|
|
@ -101,7 +101,7 @@ public class DictionaryEncodedStringIndexSupplierBenchmark
|
|||
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
|
||||
FluentIterable.from(ints)
|
||||
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
);
|
||||
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
|
||||
() -> IntStream.range(0, dictionarySize)
|
||||
|
|
|
@ -130,7 +130,7 @@ public class DimensionPredicateFilterBenchmark
|
|||
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
|
||||
FluentIterable.from(ints)
|
||||
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
);
|
||||
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
|
||||
FluentIterable.from(ints)
|
||||
|
|
|
@ -174,7 +174,7 @@ public class FrontCodedIndexedBenchmark
|
|||
|
||||
genericIndexed = GenericIndexed.read(
|
||||
byteBufferGeneric,
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY,
|
||||
GenericIndexed.UTF8_STRATEGY,
|
||||
SmooshedFileMapper.load(smooshDirFrontCoded)
|
||||
);
|
||||
frontCodedIndexed = FrontCodedIndexed.read(
|
||||
|
|
|
@ -93,7 +93,7 @@ public class InFilterBenchmark
|
|||
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
|
||||
FluentIterable.from(ints)
|
||||
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
);
|
||||
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
|
||||
() -> IntStream.range(0, dictionarySize)
|
||||
|
|
|
@ -130,7 +130,7 @@ public class LikeFilterBenchmark
|
|||
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
|
||||
FluentIterable.from(ints)
|
||||
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
);
|
||||
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
|
||||
FluentIterable.from(ints)
|
||||
|
|
|
@ -47,7 +47,7 @@ public class ByteBufferUtils
|
|||
// null if unmap is supported
|
||||
private static final RuntimeException UNMAP_NOT_SUPPORTED_EXCEPTION;
|
||||
|
||||
private static final Comparator<ByteBuffer> COMPARATOR_UNSIGNED = new UnsignedByteBufferComparator();
|
||||
private static final Comparator<ByteBuffer> COMPARATOR_UTF8 = new Utf8ByteBufferComparator();
|
||||
|
||||
static {
|
||||
Object unmap = null;
|
||||
|
@ -214,40 +214,12 @@ public class ByteBufferUtils
|
|||
}
|
||||
|
||||
/**
|
||||
* Compares two ByteBuffer ranges using unsigned byte ordering.
|
||||
* Compares two ByteBuffers from their positions to their limits using ordering consistent with
|
||||
* {@link String#compareTo(String)}. Null buffers are accepted, and are ordered earlier than any nonnull buffer.
|
||||
*
|
||||
* Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
|
||||
* Different from {@link ByteBuffer#compareTo}, which uses signed-bytes ordering.
|
||||
*/
|
||||
public static int compareByteBuffers(
|
||||
final ByteBuffer buf1,
|
||||
final int position1,
|
||||
final int length1,
|
||||
final ByteBuffer buf2,
|
||||
final int position2,
|
||||
final int length2
|
||||
)
|
||||
{
|
||||
final int commonLength = Math.min(length1, length2);
|
||||
|
||||
for (int i = 0; i < commonLength; i++) {
|
||||
final byte byte1 = buf1.get(position1 + i);
|
||||
final byte byte2 = buf2.get(position2 + i);
|
||||
final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
|
||||
if (cmp != 0) {
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
|
||||
return Integer.compare(length1, length2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two ByteBuffers from their positions to their limits using unsigned byte ordering. Accepts null
|
||||
* buffers, which are ordered earlier than any nonnull buffer.
|
||||
*
|
||||
* Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
|
||||
*/
|
||||
public static int compareByteBuffers(
|
||||
public static int compareUtf8ByteBuffers(
|
||||
@Nullable final ByteBuffer buf1,
|
||||
@Nullable final ByteBuffer buf2
|
||||
)
|
||||
|
@ -260,7 +232,7 @@ public class ByteBufferUtils
|
|||
return 1;
|
||||
}
|
||||
|
||||
return ByteBufferUtils.compareByteBuffers(
|
||||
return StringUtils.compareUtf8UsingJavaStringOrdering(
|
||||
buf1,
|
||||
buf1.position(),
|
||||
buf1.remaining(),
|
||||
|
@ -271,20 +243,20 @@ public class ByteBufferUtils
|
|||
}
|
||||
|
||||
/**
|
||||
* Comparator that compares two {@link ByteBuffer} using unsigned ordering. Null buffers are accepted, and
|
||||
* are ordered earlier than any nonnull buffer.
|
||||
* Comparator that compares two {@link ByteBuffer} using ordering consistent with {@link String#compareTo(String)}.
|
||||
* Null buffers are accepted, and are ordered earlier than any nonnull buffer.
|
||||
*/
|
||||
public static Comparator<ByteBuffer> unsignedComparator()
|
||||
public static Comparator<ByteBuffer> utf8Comparator()
|
||||
{
|
||||
return COMPARATOR_UNSIGNED;
|
||||
return COMPARATOR_UTF8;
|
||||
}
|
||||
|
||||
private static class UnsignedByteBufferComparator implements Comparator<ByteBuffer>
|
||||
private static class Utf8ByteBufferComparator implements Comparator<ByteBuffer>
|
||||
{
|
||||
@Override
|
||||
public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
|
||||
{
|
||||
return ByteBufferUtils.compareByteBuffers(o1, o2);
|
||||
return compareUtf8ByteBuffers(o1, o2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -77,6 +77,148 @@ public class StringUtils
|
|||
return string == null ? EMPTY_BYTES : toUtf8(string);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two Java Strings in Unicode code-point order.
|
||||
*
|
||||
* Order is consistent with {@link #compareUtf8(byte[], byte[])}, but is not consistent with
|
||||
* {@link String#compareTo(String)}.
|
||||
*/
|
||||
public static int compareUnicode(final String a, final String b)
|
||||
{
|
||||
final int commonLength = Math.min(a.length(), b.length());
|
||||
|
||||
for (int i = 0; i < commonLength; i++) {
|
||||
int char1 = a.charAt(i) & 0xFFFF; // Unsigned
|
||||
int char2 = b.charAt(i) & 0xFFFF; // Unsigned
|
||||
|
||||
if (char1 != char2 && char1 >= 0xd800 && char2 >= 0xd800) {
|
||||
// Fixup logic for code units at or above the surrogate range, based on logic described at
|
||||
// https://www.icu-project.org/docs/papers/utf16_code_point_order.html.
|
||||
//
|
||||
// If both code units are at or above the surrogate range (>= 0xd800) then adjust non-surrogates (legitimate
|
||||
// single-code-unit characters) to be below the surrogate range, so they compare earlier than surrogates.
|
||||
|
||||
if (!Character.isSurrogate((char) char1)) {
|
||||
char1 -= 0x2800;
|
||||
}
|
||||
|
||||
if (!Character.isSurrogate((char) char2)) {
|
||||
char2 -= 0x2800;
|
||||
}
|
||||
}
|
||||
|
||||
final int cmp = char1 - char2;
|
||||
if (cmp != 0) {
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
|
||||
return Integer.compare(a.length(), b.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two UTF-8 byte strings in Unicode code-point order.
|
||||
*
|
||||
* Equivalent to a comparison of the two byte arrays as if they were unsigned bytes.
|
||||
*
|
||||
* Order is consistent with {@link #compareUnicode(String, String)}, but is not consistent with
|
||||
* {@link String#compareTo(String)}. For an ordering consistent with {@link String#compareTo(String)}, use
|
||||
* {@link #compareUtf8UsingJavaStringOrdering(byte[], byte[])} instead.
|
||||
*/
|
||||
public static int compareUtf8(final byte[] a, final byte[] b)
|
||||
{
|
||||
final int commonLength = Math.min(a.length, b.length);
|
||||
|
||||
for (int i = 0; i < commonLength; i++) {
|
||||
final byte byte1 = a[i];
|
||||
final byte byte2 = b[i];
|
||||
final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
|
||||
if (cmp != 0) {
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
|
||||
return Integer.compare(a.length, b.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two UTF-8 byte strings in UTF-16 code-unit order.
|
||||
*
|
||||
* Order is consistent with {@link String#compareTo(String)}, but is not consistent with
|
||||
* {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[], byte[])}.
|
||||
*/
|
||||
public static int compareUtf8UsingJavaStringOrdering(final byte[] a, final byte[] b)
|
||||
{
|
||||
final int commonLength = Math.min(a.length, b.length);
|
||||
|
||||
for (int i = 0; i < commonLength; i++) {
|
||||
final int cmp = compareUtf8UsingJavaStringOrdering(a[i], b[i]);
|
||||
if (cmp != 0) {
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
|
||||
return Integer.compare(a.length, b.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two UTF-8 byte strings in UTF-16 code-unit order.
|
||||
*
|
||||
* Order is consistent with {@link String#compareTo(String)}, but is not consistent with
|
||||
* {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[], byte[])}.
|
||||
*/
|
||||
public static int compareUtf8UsingJavaStringOrdering(
|
||||
final ByteBuffer buf1,
|
||||
final int position1,
|
||||
final int length1,
|
||||
final ByteBuffer buf2,
|
||||
final int position2,
|
||||
final int length2
|
||||
)
|
||||
{
|
||||
final int commonLength = Math.min(length1, length2);
|
||||
|
||||
for (int i = 0; i < commonLength; i++) {
|
||||
final int cmp = compareUtf8UsingJavaStringOrdering(buf1.get(position1 + i), buf2.get(position2 + i));
|
||||
if (cmp != 0) {
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
|
||||
return Integer.compare(length1, length2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares two bytes from UTF-8 strings in such a way that the entire byte arrays are compared in UTF-16
|
||||
* code-unit order.
|
||||
*
|
||||
* Compatible with {@link #compareUtf8UsingJavaStringOrdering(byte[], byte[])} and
|
||||
* {@link #compareUtf8UsingJavaStringOrdering(ByteBuffer, int, int, ByteBuffer, int, int)}.
|
||||
*/
|
||||
public static int compareUtf8UsingJavaStringOrdering(byte byte1, byte byte2)
|
||||
{
|
||||
// Treat as unsigned bytes.
|
||||
int ubyte1 = byte1 & 0xFF;
|
||||
int ubyte2 = byte2 & 0xFF;
|
||||
|
||||
if (ubyte1 != ubyte2 && ubyte1 >= 0xEE && ubyte2 >= 0xEE) {
|
||||
// Fixup logic for lead bytes for U+E000 ... U+FFFF, based on logic described at
|
||||
// https://www.icu-project.org/docs/papers/utf16_code_point_order.html.
|
||||
//
|
||||
// Move possible lead bytes for this range (0xEE and 0xEF) above all other bytes, so they compare later.
|
||||
|
||||
if (ubyte1 == 0xEE || ubyte1 == 0xEF) {
|
||||
ubyte1 += 0xFF;
|
||||
}
|
||||
|
||||
if (ubyte2 == 0xEE || ubyte2 == 0xEF) {
|
||||
ubyte2 += 0xFF;
|
||||
}
|
||||
}
|
||||
|
||||
return ubyte1 - ubyte2;
|
||||
}
|
||||
|
||||
public static String fromUtf8(final byte[] bytes)
|
||||
{
|
||||
try {
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
package org.apache.druid.java.util.common;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.io.Files;
|
||||
import org.apache.druid.collections.ResourceHolder;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
|
@ -36,9 +37,28 @@ import java.nio.ByteBuffer;
|
|||
import java.nio.MappedByteBuffer;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
public class ByteBufferUtilsTest
|
||||
{
|
||||
private static final List<String> COMPARE_TEST_STRINGS = ImmutableList.of(
|
||||
"(請參見已被刪除版本)",
|
||||
"請參見已被刪除版本",
|
||||
"שָׁלוֹם",
|
||||
"+{{[[Template:別名重定向|別名重定向]]}}",
|
||||
"\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D",
|
||||
"\uD83D\uDCA9",
|
||||
"",
|
||||
"f",
|
||||
"fo",
|
||||
"\uD83D\uDE42",
|
||||
"\uD83E\uDEE5",
|
||||
"\uD83E\uDD20",
|
||||
"quick",
|
||||
"brown",
|
||||
"fox"
|
||||
);
|
||||
|
||||
@Rule
|
||||
public TemporaryFolder temporaryFolder = new TemporaryFolder();
|
||||
|
||||
|
@ -82,9 +102,9 @@ public class ByteBufferUtilsTest
|
|||
|
||||
@Test
|
||||
@SuppressWarnings("EqualsWithItself")
|
||||
public void testUnsignedComparator()
|
||||
public void testUtf8Comparator()
|
||||
{
|
||||
final Comparator<ByteBuffer> comparator = ByteBufferUtils.unsignedComparator();
|
||||
final Comparator<ByteBuffer> comparator = ByteBufferUtils.utf8Comparator();
|
||||
|
||||
// Tests involving null
|
||||
MatcherAssert.assertThat(comparator.compare(null, null), Matchers.equalTo(0));
|
||||
|
@ -112,18 +132,33 @@ public class ByteBufferUtilsTest
|
|||
Matchers.greaterThan(0)
|
||||
);
|
||||
|
||||
// Tests involving the full range of bytes
|
||||
for (byte i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) {
|
||||
for (byte j = Byte.MIN_VALUE; j < Byte.MAX_VALUE; j++) {
|
||||
final int cmp = Integer.compare(Byte.toUnsignedInt(i), Byte.toUnsignedInt(j));
|
||||
for (final String string1 : COMPARE_TEST_STRINGS) {
|
||||
for (final String string2 : COMPARE_TEST_STRINGS) {
|
||||
final byte[] utf8Bytes1 = StringUtils.toUtf8(string1);
|
||||
final byte[] utf8Bytes2 = StringUtils.toUtf8(string2);
|
||||
final ByteBuffer utf8ByteBuffer1 = ByteBuffer.allocate(utf8Bytes1.length + 2);
|
||||
final ByteBuffer utf8ByteBuffer2 = ByteBuffer.allocate(utf8Bytes2.length + 2);
|
||||
utf8ByteBuffer1.position(1);
|
||||
utf8ByteBuffer1.put(utf8Bytes1, 0, utf8Bytes1.length).position(utf8Bytes1.length);
|
||||
utf8ByteBuffer1.position(1).limit(1 + utf8Bytes1.length);
|
||||
utf8ByteBuffer2.position(1);
|
||||
utf8ByteBuffer2.put(utf8Bytes2, 0, utf8Bytes2.length).position(utf8Bytes2.length);
|
||||
utf8ByteBuffer2.position(1).limit(1 + utf8Bytes2.length);
|
||||
|
||||
MatcherAssert.assertThat(
|
||||
StringUtils.format("comparison of %s to %s", Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)),
|
||||
comparator.compare(
|
||||
ByteBuffer.wrap(new byte[]{i}),
|
||||
ByteBuffer.wrap(new byte[]{j})
|
||||
final int compareByteBufferUtilsUtf8 = ByteBufferUtils.utf8Comparator().compare(
|
||||
utf8ByteBuffer1,
|
||||
utf8ByteBuffer2
|
||||
);
|
||||
|
||||
Assert.assertEquals(
|
||||
StringUtils.format(
|
||||
"compareByteBufferUtilsUtf8(byte[]) (actual) "
|
||||
+ "matches compareJavaString (expected) for [%s] vs [%s]",
|
||||
string1,
|
||||
string2
|
||||
),
|
||||
cmp < 0 ? Matchers.lessThan(0) : cmp > 0 ? Matchers.greaterThan(0) : Matchers.equalTo(0)
|
||||
(int) Math.signum(string1.compareTo(string2)),
|
||||
(int) Math.signum(compareByteBufferUtilsUtf8)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
package org.apache.druid.java.util.common;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.apache.druid.collections.ResourceHolder;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Rule;
|
||||
|
@ -28,12 +29,31 @@ import org.junit.rules.ExpectedException;
|
|||
import java.io.UnsupportedEncodingException;
|
||||
import java.nio.BufferUnderflowException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class StringUtilsTest
|
||||
{
|
||||
private static final List<String> COMPARE_TEST_STRINGS = ImmutableList.of(
|
||||
"(請參見已被刪除版本)",
|
||||
"請參見已被刪除版本",
|
||||
"שָׁלוֹם",
|
||||
"+{{[[Template:別名重定向|別名重定向]]}}",
|
||||
"\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D",
|
||||
"\uD83D\uDCA9",
|
||||
"",
|
||||
"f",
|
||||
"fo",
|
||||
"\uD83D\uDE42",
|
||||
"\uD83E\uDEE5",
|
||||
"\uD83E\uDD20",
|
||||
"quick",
|
||||
"brown",
|
||||
"fox"
|
||||
);
|
||||
|
||||
@Rule
|
||||
public ExpectedException expectedException = ExpectedException.none();
|
||||
|
||||
|
@ -290,4 +310,80 @@ public class StringUtilsTest
|
|||
Assert.assertEquals("smile ", StringUtils.fastLooseChop("smile 🙂 for the camera", 6));
|
||||
Assert.assertEquals("smile", StringUtils.fastLooseChop("smile 🙂 for the camera", 5));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeStringCompare()
|
||||
{
|
||||
for (final String string1 : COMPARE_TEST_STRINGS) {
|
||||
for (final String string2 : COMPARE_TEST_STRINGS) {
|
||||
final int compareUnicode = StringUtils.compareUnicode(string1, string2);
|
||||
final int compareUtf8 = StringUtils.compareUtf8(
|
||||
StringUtils.toUtf8(string1),
|
||||
StringUtils.toUtf8(string2)
|
||||
);
|
||||
|
||||
Assert.assertEquals(
|
||||
StringUtils.format(
|
||||
"compareUnicode (actual) matches compareUtf8 (expected) for [%s] vs [%s]",
|
||||
string1,
|
||||
string2
|
||||
),
|
||||
(int) Math.signum(compareUtf8),
|
||||
(int) Math.signum(compareUnicode)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJavaStringCompare()
|
||||
{
|
||||
for (final String string1 : COMPARE_TEST_STRINGS) {
|
||||
for (final String string2 : COMPARE_TEST_STRINGS) {
|
||||
final int compareJavaString = string1.compareTo(string2);
|
||||
|
||||
final byte[] utf8Bytes1 = StringUtils.toUtf8(string1);
|
||||
final byte[] utf8Bytes2 = StringUtils.toUtf8(string2);
|
||||
final int compareByteArrayUtf8UsingJavaStringOrdering =
|
||||
StringUtils.compareUtf8UsingJavaStringOrdering(utf8Bytes1, utf8Bytes2);
|
||||
|
||||
final ByteBuffer utf8ByteBuffer1 = ByteBuffer.allocate(utf8Bytes1.length + 2);
|
||||
final ByteBuffer utf8ByteBuffer2 = ByteBuffer.allocate(utf8Bytes2.length + 2);
|
||||
utf8ByteBuffer1.position(1);
|
||||
utf8ByteBuffer1.put(utf8Bytes1, 0, utf8Bytes1.length).position(utf8Bytes1.length);
|
||||
utf8ByteBuffer2.position(1);
|
||||
utf8ByteBuffer2.put(utf8Bytes2, 0, utf8Bytes2.length).position(utf8Bytes2.length);
|
||||
final int compareByteBufferUtf8UsingJavaStringOrdering = StringUtils.compareUtf8UsingJavaStringOrdering(
|
||||
utf8ByteBuffer1,
|
||||
1,
|
||||
utf8Bytes1.length,
|
||||
utf8ByteBuffer2,
|
||||
1,
|
||||
utf8Bytes2.length
|
||||
);
|
||||
|
||||
Assert.assertEquals(
|
||||
StringUtils.format(
|
||||
"compareUtf8UsingJavaStringOrdering(byte[]) (actual) "
|
||||
+ "matches compareJavaString (expected) for [%s] vs [%s]",
|
||||
string1,
|
||||
string2
|
||||
),
|
||||
(int) Math.signum(compareJavaString),
|
||||
(int) Math.signum(compareByteArrayUtf8UsingJavaStringOrdering)
|
||||
);
|
||||
|
||||
Assert.assertEquals(
|
||||
StringUtils.format(
|
||||
"compareByteBufferUtf8UsingJavaStringOrdering(ByteBuffer) (actual) "
|
||||
+ "matches compareJavaString (expected) for [%s] vs [%s]",
|
||||
string1,
|
||||
string2
|
||||
),
|
||||
(int) Math.signum(compareJavaString),
|
||||
(int) Math.signum(compareByteBufferUtf8UsingJavaStringOrdering)
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -205,7 +205,7 @@ public class FrameWriterUtils
|
|||
/**
|
||||
* Copies "len" bytes from {@code src.position()} to "dstPosition" in "memory". Does not update the position of src.
|
||||
*
|
||||
* @throws InvalidNullByteException "allowNullBytes" is true and a null byte is encountered
|
||||
* @throws InvalidNullByteException if "allowNullBytes" is false and a null byte is encountered
|
||||
*/
|
||||
public static void copyByteBufferToMemory(
|
||||
final ByteBuffer src,
|
||||
|
|
|
@ -674,7 +674,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
|
|||
|
||||
public SortedSet<ByteBuffer> toUtf8()
|
||||
{
|
||||
final TreeSet<ByteBuffer> valuesUtf8 = new TreeSet<>(ByteBufferUtils.unsignedComparator());
|
||||
final TreeSet<ByteBuffer> valuesUtf8 = new TreeSet<>(ByteBufferUtils.utf8Comparator());
|
||||
|
||||
for (final String value : values) {
|
||||
if (value == null) {
|
||||
|
|
|
@ -47,9 +47,15 @@ public class StringComparators
|
|||
public static final int STRLEN_CACHE_ID = 0x04;
|
||||
public static final int VERSION_CACHE_ID = 0x05;
|
||||
|
||||
/**
|
||||
* Comparison using the natural comparator of {@link String}.
|
||||
*
|
||||
* Note that this is not equivalent to comparing UTF-8 byte arrays; see javadocs for
|
||||
* {@link org.apache.druid.java.util.common.StringUtils#compareUnicode(String, String)} and
|
||||
* {@link org.apache.druid.java.util.common.StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}.
|
||||
*/
|
||||
public static class LexicographicComparator extends StringComparator
|
||||
{
|
||||
// Equivalent to comparing UTF-8 encoded strings as byte arrays.
|
||||
private static final Ordering<String> ORDERING = Ordering.from(String::compareTo).nullsFirst();
|
||||
|
||||
@Override
|
||||
|
|
|
@ -379,7 +379,7 @@ public class IndexIO
|
|||
|
||||
// Duplicate the first buffer since we are reading the dictionary twice.
|
||||
dimValueLookups.put(dimension, GenericIndexed.read(dimBuffer.duplicate(), GenericIndexed.STRING_STRATEGY));
|
||||
dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY));
|
||||
dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer, GenericIndexed.UTF8_STRATEGY));
|
||||
dimColumns.put(dimension, VSizeColumnarMultiInts.readFromByteBuffer(dimBuffer));
|
||||
}
|
||||
|
||||
|
|
|
@ -47,7 +47,7 @@ public final class IndexedUtf8ValueSetIndex<TDictionary extends Indexed<ByteBuff
|
|||
// sorted merge instead of binary-search based algorithm.
|
||||
private static final double SORTED_MERGE_RATIO_THRESHOLD = 0.12D;
|
||||
private static final int SIZE_WORTH_CHECKING_MIN = 8;
|
||||
private static final Comparator<ByteBuffer> COMPARATOR = ByteBufferUtils.unsignedComparator();
|
||||
private static final Comparator<ByteBuffer> COMPARATOR = ByteBufferUtils.utf8Comparator();
|
||||
|
||||
private final BitmapFactory bitmapFactory;
|
||||
private final TDictionary dictionary;
|
||||
|
|
|
@ -29,7 +29,7 @@ public interface Utf8ValueSetIndex
|
|||
/**
|
||||
* Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the specified set of values (if they are
|
||||
* contained in the underlying column). The set must be sorted using
|
||||
* {@link org.apache.druid.java.util.common.ByteBufferUtils#unsignedComparator()}.
|
||||
* {@link org.apache.druid.java.util.common.ByteBufferUtils#utf8Comparator()}.
|
||||
*/
|
||||
BitmapColumnIndex forSortedValuesUtf8(SortedSet<ByteBuffer> valuesUtf8);
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import com.google.common.base.Preconditions;
|
|||
import com.google.common.base.Supplier;
|
||||
import org.apache.druid.common.config.NullHandling;
|
||||
import org.apache.druid.java.util.common.ISE;
|
||||
import org.apache.druid.java.util.common.StringUtils;
|
||||
import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
@ -340,10 +341,13 @@ public final class FrontCodedIndexed implements Indexed<ByteBuffer>
|
|||
|
||||
|
||||
/**
|
||||
* Performs an unsigned byte comparison of the first value in a bucket with the specified value. Note that this method
|
||||
* Performs byte-by-byte comparison of the first value in a bucket with the specified value. Note that this method
|
||||
* MUST be prepared before calling, as it expects the length of the first value to have already been read externally,
|
||||
* and the buffer position to be at the start of the first bucket value. The final buffer position will be the
|
||||
* 'shared prefix length' of the first value in the bucket and the value to compare
|
||||
* 'shared prefix length' of the first value in the bucket and the value to compare.
|
||||
*
|
||||
* Bytes are compared using {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte, byte)}. Therefore, when the
|
||||
* values are UTF-8 encoded strings, the ordering is compatible with {@link String#compareTo(String)}.
|
||||
*/
|
||||
private static int compareBucketFirstValue(ByteBuffer bucketBuffer, int length, ByteBuffer value)
|
||||
{
|
||||
|
@ -355,7 +359,7 @@ public final class FrontCodedIndexed implements Indexed<ByteBuffer>
|
|||
int sharedPrefix;
|
||||
int comparison = 0;
|
||||
for (sharedPrefix = 0; sharedPrefix < commonLength; sharedPrefix++) {
|
||||
comparison = unsignedByteCompare(bucketBuffer.get(), value.get(sharedPrefix));
|
||||
comparison = StringUtils.compareUtf8UsingJavaStringOrdering(bucketBuffer.get(), value.get(sharedPrefix));
|
||||
if (comparison != 0) {
|
||||
bucketBuffer.position(startOffset + sharedPrefix);
|
||||
break;
|
||||
|
@ -403,7 +407,10 @@ public final class FrontCodedIndexed implements Indexed<ByteBuffer>
|
|||
final int common = Math.min(fragmentLength, value.remaining() - prefixLength);
|
||||
int fragmentComparison = 0;
|
||||
for (int i = 0; i < common; i++) {
|
||||
fragmentComparison = unsignedByteCompare(buffer.get(buffer.position() + i), value.get(prefixLength + i));
|
||||
fragmentComparison = StringUtils.compareUtf8UsingJavaStringOrdering(
|
||||
buffer.get(buffer.position() + i),
|
||||
value.get(prefixLength + i)
|
||||
);
|
||||
if (fragmentComparison != 0) {
|
||||
break;
|
||||
}
|
||||
|
@ -502,9 +509,4 @@ public final class FrontCodedIndexed implements Indexed<ByteBuffer>
|
|||
}
|
||||
return bucketBuffers;
|
||||
}
|
||||
|
||||
public static int unsignedByteCompare(byte b1, byte b2)
|
||||
{
|
||||
return (b1 & 0xFF) - (b2 & 0xFF);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -44,8 +44,8 @@ import java.nio.channels.WritableByteChannel;
|
|||
* the bucket is written entirely, and remaining values are stored as pairs of an integer which indicates how much
|
||||
* of the first byte array of the bucket to use as a prefix, followed by the remaining value bytes after the prefix.
|
||||
*
|
||||
* This is valid to use for any values which can be compared byte by byte with unsigned comparison. Otherwise, this
|
||||
* is not the collection for you.
|
||||
* This writer is designed for use with UTF-8 encoded strings that are written in an order compatible with
|
||||
* {@link String#compareTo(String)}.
|
||||
*
|
||||
* @see FrontCodedIndexed for additional details.
|
||||
*/
|
||||
|
@ -99,7 +99,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter<byte[]>
|
|||
@Override
|
||||
public void write(@Nullable byte[] value) throws IOException
|
||||
{
|
||||
if (prevObject != null && unsignedCompare(prevObject, value) >= 0) {
|
||||
if (prevObject != null && compareNullableUtf8UsingJavaStringOrdering(prevObject, value) >= 0) {
|
||||
throw new ISE(
|
||||
"Values must be sorted and unique. Element [%s] with value [%s] is before or equivalent to [%s]",
|
||||
numWritten,
|
||||
|
@ -283,7 +283,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter<byte[]>
|
|||
// all other values must be partitioned into a prefix length and suffix bytes
|
||||
int prefixLength = 0;
|
||||
for (; prefixLength < first.length; prefixLength++) {
|
||||
final int cmp = FrontCodedIndexed.unsignedByteCompare(first[prefixLength], next[prefixLength]);
|
||||
final int cmp = StringUtils.compareUtf8UsingJavaStringOrdering(first[prefixLength], next[prefixLength]);
|
||||
if (cmp != 0) {
|
||||
break;
|
||||
}
|
||||
|
@ -325,7 +325,11 @@ public class FrontCodedIndexedWriter implements DictionaryWriter<byte[]>
|
|||
return buffer.position() - pos;
|
||||
}
|
||||
|
||||
public static int unsignedCompare(
|
||||
/**
|
||||
* Same as {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}, but accepts nulls. Nulls are
|
||||
* sorted first.
|
||||
*/
|
||||
private static int compareNullableUtf8UsingJavaStringOrdering(
|
||||
@Nullable final byte[] b1,
|
||||
@Nullable final byte[] b2
|
||||
)
|
||||
|
@ -337,15 +341,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter<byte[]>
|
|||
if (b2 == null) {
|
||||
return 1;
|
||||
}
|
||||
final int commonLength = Math.min(b1.length, b2.length);
|
||||
|
||||
for (int i = 0; i < commonLength; i++) {
|
||||
final int cmp = FrontCodedIndexed.unsignedByteCompare(b1[i], b2[i]);
|
||||
if (cmp != 0) {
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
|
||||
return Integer.compare(b1.length, b2.length);
|
||||
return StringUtils.compareUtf8UsingJavaStringOrdering(b1, b2);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -100,13 +100,16 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
|
|||
private static final SerializerUtils SERIALIZER_UTILS = new SerializerUtils();
|
||||
|
||||
/**
|
||||
* An ObjectStrategy that returns a big-endian ByteBuffer pointing to the original data.
|
||||
* An ObjectStrategy that returns a big-endian ByteBuffer pointing to original data.
|
||||
*
|
||||
* The returned ByteBuffer is a fresh read-only instance, so it is OK for callers to modify its position, limit, etc.
|
||||
* However, it does point to the original data, so callers must take care not to use it if the original data may
|
||||
* have been freed.
|
||||
*
|
||||
* The compare method of this instance uses {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}
|
||||
* so that behavior is consistent with {@link #STRING_STRATEGY}.
|
||||
*/
|
||||
public static final ObjectStrategy<ByteBuffer> BYTE_BUFFER_STRATEGY = new ObjectStrategy<ByteBuffer>()
|
||||
public static final ObjectStrategy<ByteBuffer> UTF8_STRATEGY = new ObjectStrategy<ByteBuffer>()
|
||||
{
|
||||
@Override
|
||||
public Class<ByteBuffer> getClazz()
|
||||
|
@ -140,7 +143,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
|
|||
@Override
|
||||
public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
|
||||
{
|
||||
return ByteBufferUtils.unsignedComparator().compare(o1, o2);
|
||||
return ByteBufferUtils.utf8Comparator().compare(o1, o2);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -541,7 +544,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
|
|||
}
|
||||
|
||||
//noinspection ObjectEquality
|
||||
final boolean isByteBufferStrategy = strategy == BYTE_BUFFER_STRATEGY;
|
||||
final boolean isByteBufferStrategy = strategy == UTF8_STRATEGY;
|
||||
|
||||
int minIndex = 0;
|
||||
int maxIndex = size - 1;
|
||||
|
@ -553,7 +556,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
|
|||
if (isByteBufferStrategy) {
|
||||
// Specialization avoids ByteBuffer allocation in strategy.fromByteBuffer.
|
||||
ByteBuffer currValue = getByteBuffer(currIndex);
|
||||
comparison = ByteBufferUtils.compareByteBuffers(currValue, (ByteBuffer) value);
|
||||
comparison = ByteBufferUtils.compareUtf8ByteBuffers(currValue, (ByteBuffer) value);
|
||||
} else {
|
||||
T currValue = get(currIndex);
|
||||
comparison = strategy.compare(currValue, value);
|
||||
|
|
|
@ -107,7 +107,7 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
|
|||
// this cannot happen naturally right now since generic indexed is written in the 'legacy' format, but
|
||||
// this provides backwards compatibility should we switch at some point in the future to always
|
||||
// writing dictionaryVersion
|
||||
dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY, mapper);
|
||||
dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.UTF8_STRATEGY, mapper);
|
||||
frontCodedDictionarySupplier = null;
|
||||
} else {
|
||||
throw new ISE("impossible, unknown encoding strategy id: %s", encodingId);
|
||||
|
@ -117,7 +117,7 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
|
|||
// as dictionaryVersion is actually also the GenericIndexed version, so we reset start position so the
|
||||
// GenericIndexed version can be correctly read
|
||||
stringDictionaryBuffer.position(dictionaryStartPosition);
|
||||
dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY, mapper);
|
||||
dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.UTF8_STRATEGY, mapper);
|
||||
frontCodedDictionarySupplier = null;
|
||||
}
|
||||
final ByteBuffer longDictionaryBuffer = loadInternalFile(
|
||||
|
|
|
@ -353,7 +353,7 @@ public class DictionaryEncodedColumnPartSerde implements ColumnPartSerde
|
|||
|
||||
final GenericIndexed<ByteBuffer> rDictionaryUtf8 = GenericIndexed.read(
|
||||
buffer,
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY,
|
||||
GenericIndexed.UTF8_STRATEGY,
|
||||
builder.getFileMapper()
|
||||
);
|
||||
|
||||
|
|
|
@ -241,7 +241,9 @@ public class FrontCodedIndexedTest extends InitializedNullHandlingTest
|
|||
public void testFrontCodedIndexedUnicodes() throws IOException
|
||||
{
|
||||
ByteBuffer buffer = ByteBuffer.allocate(1 << 12).order(order);
|
||||
List<String> theList = ImmutableList.of("Győ-Moson-Sopron", "Győr");
|
||||
|
||||
// "\uD83D\uDCA9" and "(請參見已被刪除版本)" are a regression test for https://github.com/apache/druid/pull/13364
|
||||
List<String> theList = ImmutableList.of("Győ-Moson-Sopron", "Győr", "\uD83D\uDCA9", "(請參見已被刪除版本)");
|
||||
fillBuffer(buffer, theList, 4);
|
||||
|
||||
buffer.position(0);
|
||||
|
|
|
@ -121,7 +121,7 @@ public class ExtractionDimFilterTest extends InitializedNullHandlingTest
|
|||
GenericIndexed.fromIterable(Collections.singletonList("foo1"), GenericIndexed.STRING_STRATEGY),
|
||||
GenericIndexed.fromIterable(
|
||||
Collections.singletonList(ByteBuffer.wrap(StringUtils.toUtf8("foo1"))),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
),
|
||||
GenericIndexed.fromIterable(Collections.singletonList(foo1BitMap), serdeFactory.getObjectStrategy()),
|
||||
null
|
||||
|
|
|
@ -75,7 +75,7 @@ public class PredicateValueMatcherFactoryTest extends InitializedNullHandlingTes
|
|||
ByteBuffer.wrap(StringUtils.toUtf8("v2")),
|
||||
ByteBuffer.wrap(StringUtils.toUtf8("v3"))
|
||||
),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
),
|
||||
null,
|
||||
() -> VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new int[]{1}))),
|
||||
|
@ -98,7 +98,7 @@ public class PredicateValueMatcherFactoryTest extends InitializedNullHandlingTes
|
|||
ByteBuffer.wrap(StringUtils.toUtf8("v2")),
|
||||
ByteBuffer.wrap(StringUtils.toUtf8("v3"))
|
||||
),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
),
|
||||
null,
|
||||
() -> VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new int[]{1}))),
|
||||
|
|
|
@ -49,7 +49,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest
|
|||
GenericIndexed.fromIterable(ImmutableList.of("value"), GenericIndexed.STRING_STRATEGY),
|
||||
GenericIndexed.fromIterable(
|
||||
ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
),
|
||||
() -> VSizeColumnarInts.fromArray(new int[]{0}),
|
||||
null,
|
||||
|
@ -62,7 +62,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest
|
|||
ByteBuffer.wrap(StringUtils.toUtf8("value")),
|
||||
ByteBuffer.wrap(StringUtils.toUtf8("value2"))
|
||||
),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
),
|
||||
() -> VSizeColumnarInts.fromArray(new int[]{0, 0, 1, 0, 1}),
|
||||
null,
|
||||
|
@ -72,7 +72,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest
|
|||
GenericIndexed.fromIterable(ImmutableList.of("value"), GenericIndexed.STRING_STRATEGY),
|
||||
GenericIndexed.fromIterable(
|
||||
ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))),
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
),
|
||||
null,
|
||||
() -> VSizeColumnarMultiInts.fromIterable(
|
||||
|
|
|
@ -127,7 +127,7 @@ public class NestedFieldLiteralColumnIndexSupplierTest extends InitializedNullHa
|
|||
doubleWriter.write(9.9);
|
||||
writeToBuffer(doubleBuffer, doubleWriter);
|
||||
|
||||
GenericIndexed<ByteBuffer> strings = GenericIndexed.read(stringBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY);
|
||||
GenericIndexed<ByteBuffer> strings = GenericIndexed.read(stringBuffer, GenericIndexed.UTF8_STRATEGY);
|
||||
globalStrings = () -> strings.singleThreaded();
|
||||
globalLongs = FixedIndexed.read(longBuffer, TypeStrategies.LONG, ByteOrder.nativeOrder(), Long.BYTES);
|
||||
globalDoubles = FixedIndexed.read(doubleBuffer, TypeStrategies.DOUBLE, ByteOrder.nativeOrder(), Double.BYTES);
|
||||
|
|
|
@ -115,7 +115,7 @@ public class DictionaryEncodedStringIndexSupplierTest extends InitializedNullHan
|
|||
GenericIndexedWriter<ByteBuffer> byteBufferWriter = new GenericIndexedWriter<>(
|
||||
new OnHeapMemorySegmentWriteOutMedium(),
|
||||
"byteBuffers",
|
||||
GenericIndexed.BYTE_BUFFER_STRATEGY
|
||||
GenericIndexed.UTF8_STRATEGY
|
||||
);
|
||||
|
||||
stringWriter.open();
|
||||
|
@ -167,7 +167,7 @@ public class DictionaryEncodedStringIndexSupplierTest extends InitializedNullHan
|
|||
return new DictionaryEncodedStringIndexSupplier(
|
||||
roaringFactory.getBitmapFactory(),
|
||||
GenericIndexed.read(stringBuffer, GenericIndexed.STRING_STRATEGY),
|
||||
GenericIndexed.read(byteBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY),
|
||||
GenericIndexed.read(byteBuffer, GenericIndexed.UTF8_STRATEGY),
|
||||
bitmaps,
|
||||
null
|
||||
);
|
||||
|
|
Loading…
Reference in New Issue