Add string comparison methods to StringUtils, fix dictionary comparisons. (#13364)

* Add string comparison methods to StringUtils, fix dictionary comparisons.

There are various places in Druid code where we assume that String.compareTo
is consistent with Unicode code-point ordering. Sadly this is not the case.

To help deal with this, this patch introduces the following helpers:

1) compareUnicode: Compares two Strings in Unicode code-point order.
2) compareUtf8: Compares two UTF-8 byte arrays in Unicode code-point order.
   Equivalent to comparison as unsigned bytes.
3) compareUtf8UsingJavaStringOrdering: Compares two UTF-8 byte arrays, or
   ByteBuffers, in a manner consistent with String.compareTo.

There is no helper for comparing two Strings in a manner consistent
with String.compareTo, because for that we can use compareTo directly.

The patch also fixes an inconsistency between the String and UTF-8
dictionary GenericIndexed flavors of string-typed columns: they were
formerly using incompatible comparators.

* Adjust test.

* FrontCodedIndexed updates.

* Add test.

* Fix comments.
This commit is contained in:
Gian Merlino 2022-11-16 07:15:00 -08:00 committed by GitHub
parent 71b133f3ff
commit 78d0b0abce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 359 additions and 105 deletions

View File

@ -179,7 +179,7 @@ public class BoundFilterBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
);
selector = new MockColumnIndexSelector(
bitmapFactory,

View File

@ -101,7 +101,7 @@ public class DictionaryEncodedStringIndexSupplierBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
() -> IntStream.range(0, dictionarySize)

View File

@ -130,7 +130,7 @@ public class DimensionPredicateFilterBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
FluentIterable.from(ints)

View File

@ -174,7 +174,7 @@ public class FrontCodedIndexedBenchmark
genericIndexed = GenericIndexed.read(
byteBufferGeneric,
GenericIndexed.BYTE_BUFFER_STRATEGY,
GenericIndexed.UTF8_STRATEGY,
SmooshedFileMapper.load(smooshDirFrontCoded)
);
frontCodedIndexed = FrontCodedIndexed.read(

View File

@ -93,7 +93,7 @@ public class InFilterBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
() -> IntStream.range(0, dictionarySize)

View File

@ -130,7 +130,7 @@ public class LikeFilterBenchmark
final GenericIndexed<ByteBuffer> dictionaryUtf8 = GenericIndexed.fromIterable(
FluentIterable.from(ints)
.transform(i -> ByteBuffer.wrap(StringUtils.toUtf8(String.valueOf(i)))),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
);
final GenericIndexed<ImmutableBitmap> bitmaps = GenericIndexed.fromIterable(
FluentIterable.from(ints)

View File

@ -47,7 +47,7 @@ public class ByteBufferUtils
// null if unmap is supported
private static final RuntimeException UNMAP_NOT_SUPPORTED_EXCEPTION;
private static final Comparator<ByteBuffer> COMPARATOR_UNSIGNED = new UnsignedByteBufferComparator();
private static final Comparator<ByteBuffer> COMPARATOR_UTF8 = new Utf8ByteBufferComparator();
static {
Object unmap = null;
@ -214,40 +214,12 @@ public class ByteBufferUtils
}
/**
* Compares two ByteBuffer ranges using unsigned byte ordering.
* Compares two ByteBuffers from their positions to their limits using ordering consistent with
* {@link String#compareTo(String)}. Null buffers are accepted, and are ordered earlier than any nonnull buffer.
*
* Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
* Different from {@link ByteBuffer#compareTo}, which uses signed-bytes ordering.
*/
public static int compareByteBuffers(
final ByteBuffer buf1,
final int position1,
final int length1,
final ByteBuffer buf2,
final int position2,
final int length2
)
{
final int commonLength = Math.min(length1, length2);
for (int i = 0; i < commonLength; i++) {
final byte byte1 = buf1.get(position1 + i);
final byte byte2 = buf2.get(position2 + i);
final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
if (cmp != 0) {
return cmp;
}
}
return Integer.compare(length1, length2);
}
/**
* Compares two ByteBuffers from their positions to their limits using unsigned byte ordering. Accepts null
* buffers, which are ordered earlier than any nonnull buffer.
*
* Different from {@link ByteBuffer#compareTo}, which uses signed ordering.
*/
public static int compareByteBuffers(
public static int compareUtf8ByteBuffers(
@Nullable final ByteBuffer buf1,
@Nullable final ByteBuffer buf2
)
@ -260,7 +232,7 @@ public class ByteBufferUtils
return 1;
}
return ByteBufferUtils.compareByteBuffers(
return StringUtils.compareUtf8UsingJavaStringOrdering(
buf1,
buf1.position(),
buf1.remaining(),
@ -271,20 +243,20 @@ public class ByteBufferUtils
}
/**
* Comparator that compares two {@link ByteBuffer} using unsigned ordering. Null buffers are accepted, and
* are ordered earlier than any nonnull buffer.
* Comparator that compares two {@link ByteBuffer} using ordering consistent with {@link String#compareTo(String)}.
* Null buffers are accepted, and are ordered earlier than any nonnull buffer.
*/
public static Comparator<ByteBuffer> unsignedComparator()
public static Comparator<ByteBuffer> utf8Comparator()
{
return COMPARATOR_UNSIGNED;
return COMPARATOR_UTF8;
}
private static class UnsignedByteBufferComparator implements Comparator<ByteBuffer>
private static class Utf8ByteBufferComparator implements Comparator<ByteBuffer>
{
@Override
public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
{
return ByteBufferUtils.compareByteBuffers(o1, o2);
return compareUtf8ByteBuffers(o1, o2);
}
}
}

View File

@ -77,6 +77,148 @@ public class StringUtils
return string == null ? EMPTY_BYTES : toUtf8(string);
}
/**
* Compares two Java Strings in Unicode code-point order.
*
* Order is consistent with {@link #compareUtf8(byte[], byte[])}, but is not consistent with
* {@link String#compareTo(String)}.
*/
public static int compareUnicode(final String a, final String b)
{
final int commonLength = Math.min(a.length(), b.length());
for (int i = 0; i < commonLength; i++) {
int char1 = a.charAt(i) & 0xFFFF; // Unsigned
int char2 = b.charAt(i) & 0xFFFF; // Unsigned
if (char1 != char2 && char1 >= 0xd800 && char2 >= 0xd800) {
// Fixup logic for code units at or above the surrogate range, based on logic described at
// https://www.icu-project.org/docs/papers/utf16_code_point_order.html.
//
// If both code units are at or above the surrogate range (>= 0xd800) then adjust non-surrogates (legitimate
// single-code-unit characters) to be below the surrogate range, so they compare earlier than surrogates.
if (!Character.isSurrogate((char) char1)) {
char1 -= 0x2800;
}
if (!Character.isSurrogate((char) char2)) {
char2 -= 0x2800;
}
}
final int cmp = char1 - char2;
if (cmp != 0) {
return cmp;
}
}
return Integer.compare(a.length(), b.length());
}
/**
* Compares two UTF-8 byte strings in Unicode code-point order.
*
* Equivalent to a comparison of the two byte arrays as if they were unsigned bytes.
*
* Order is consistent with {@link #compareUnicode(String, String)}, but is not consistent with
* {@link String#compareTo(String)}. For an ordering consistent with {@link String#compareTo(String)}, use
* {@link #compareUtf8UsingJavaStringOrdering(byte[], byte[])} instead.
*/
public static int compareUtf8(final byte[] a, final byte[] b)
{
final int commonLength = Math.min(a.length, b.length);
for (int i = 0; i < commonLength; i++) {
final byte byte1 = a[i];
final byte byte2 = b[i];
final int cmp = (byte1 & 0xFF) - (byte2 & 0xFF); // Unsigned comparison
if (cmp != 0) {
return cmp;
}
}
return Integer.compare(a.length, b.length);
}
/**
* Compares two UTF-8 byte strings in UTF-16 code-unit order.
*
* Order is consistent with {@link String#compareTo(String)}, but is not consistent with
* {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[], byte[])}.
*/
public static int compareUtf8UsingJavaStringOrdering(final byte[] a, final byte[] b)
{
final int commonLength = Math.min(a.length, b.length);
for (int i = 0; i < commonLength; i++) {
final int cmp = compareUtf8UsingJavaStringOrdering(a[i], b[i]);
if (cmp != 0) {
return cmp;
}
}
return Integer.compare(a.length, b.length);
}
/**
* Compares two UTF-8 byte strings in UTF-16 code-unit order.
*
* Order is consistent with {@link String#compareTo(String)}, but is not consistent with
* {@link #compareUnicode(String, String)} or {@link #compareUtf8(byte[], byte[])}.
*/
public static int compareUtf8UsingJavaStringOrdering(
final ByteBuffer buf1,
final int position1,
final int length1,
final ByteBuffer buf2,
final int position2,
final int length2
)
{
final int commonLength = Math.min(length1, length2);
for (int i = 0; i < commonLength; i++) {
final int cmp = compareUtf8UsingJavaStringOrdering(buf1.get(position1 + i), buf2.get(position2 + i));
if (cmp != 0) {
return cmp;
}
}
return Integer.compare(length1, length2);
}
/**
* Compares two bytes from UTF-8 strings in such a way that the entire byte arrays are compared in UTF-16
* code-unit order.
*
* Compatible with {@link #compareUtf8UsingJavaStringOrdering(byte[], byte[])} and
* {@link #compareUtf8UsingJavaStringOrdering(ByteBuffer, int, int, ByteBuffer, int, int)}.
*/
public static int compareUtf8UsingJavaStringOrdering(byte byte1, byte byte2)
{
// Treat as unsigned bytes.
int ubyte1 = byte1 & 0xFF;
int ubyte2 = byte2 & 0xFF;
if (ubyte1 != ubyte2 && ubyte1 >= 0xEE && ubyte2 >= 0xEE) {
// Fixup logic for lead bytes for U+E000 ... U+FFFF, based on logic described at
// https://www.icu-project.org/docs/papers/utf16_code_point_order.html.
//
// Move possible lead bytes for this range (0xEE and 0xEF) above all other bytes, so they compare later.
if (ubyte1 == 0xEE || ubyte1 == 0xEF) {
ubyte1 += 0xFF;
}
if (ubyte2 == 0xEE || ubyte2 == 0xEF) {
ubyte2 += 0xFF;
}
}
return ubyte1 - ubyte2;
}
public static String fromUtf8(final byte[] bytes)
{
try {

View File

@ -19,6 +19,7 @@
package org.apache.druid.java.util.common;
import com.google.common.collect.ImmutableList;
import com.google.common.io.Files;
import org.apache.druid.collections.ResourceHolder;
import org.hamcrest.MatcherAssert;
@ -36,9 +37,28 @@ import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
public class ByteBufferUtilsTest
{
private static final List<String> COMPARE_TEST_STRINGS = ImmutableList.of(
"(請參見已被刪除版本)",
"請參見已被刪除版本",
"שָׁלוֹם",
"{{[[Template:別名重定向|別名重定向]]}}",
"\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D",
"\uD83D\uDCA9",
"",
"f",
"fo",
"\uD83D\uDE42",
"\uD83E\uDEE5",
"\uD83E\uDD20",
"quick",
"brown",
"fox"
);
@Rule
public TemporaryFolder temporaryFolder = new TemporaryFolder();
@ -82,9 +102,9 @@ public class ByteBufferUtilsTest
@Test
@SuppressWarnings("EqualsWithItself")
public void testUnsignedComparator()
public void testUtf8Comparator()
{
final Comparator<ByteBuffer> comparator = ByteBufferUtils.unsignedComparator();
final Comparator<ByteBuffer> comparator = ByteBufferUtils.utf8Comparator();
// Tests involving null
MatcherAssert.assertThat(comparator.compare(null, null), Matchers.equalTo(0));
@ -112,18 +132,33 @@ public class ByteBufferUtilsTest
Matchers.greaterThan(0)
);
// Tests involving the full range of bytes
for (byte i = Byte.MIN_VALUE; i < Byte.MAX_VALUE; i++) {
for (byte j = Byte.MIN_VALUE; j < Byte.MAX_VALUE; j++) {
final int cmp = Integer.compare(Byte.toUnsignedInt(i), Byte.toUnsignedInt(j));
for (final String string1 : COMPARE_TEST_STRINGS) {
for (final String string2 : COMPARE_TEST_STRINGS) {
final byte[] utf8Bytes1 = StringUtils.toUtf8(string1);
final byte[] utf8Bytes2 = StringUtils.toUtf8(string2);
final ByteBuffer utf8ByteBuffer1 = ByteBuffer.allocate(utf8Bytes1.length + 2);
final ByteBuffer utf8ByteBuffer2 = ByteBuffer.allocate(utf8Bytes2.length + 2);
utf8ByteBuffer1.position(1);
utf8ByteBuffer1.put(utf8Bytes1, 0, utf8Bytes1.length).position(utf8Bytes1.length);
utf8ByteBuffer1.position(1).limit(1 + utf8Bytes1.length);
utf8ByteBuffer2.position(1);
utf8ByteBuffer2.put(utf8Bytes2, 0, utf8Bytes2.length).position(utf8Bytes2.length);
utf8ByteBuffer2.position(1).limit(1 + utf8Bytes2.length);
MatcherAssert.assertThat(
StringUtils.format("comparison of %s to %s", Byte.toUnsignedInt(i), Byte.toUnsignedInt(j)),
comparator.compare(
ByteBuffer.wrap(new byte[]{i}),
ByteBuffer.wrap(new byte[]{j})
final int compareByteBufferUtilsUtf8 = ByteBufferUtils.utf8Comparator().compare(
utf8ByteBuffer1,
utf8ByteBuffer2
);
Assert.assertEquals(
StringUtils.format(
"compareByteBufferUtilsUtf8(byte[]) (actual) "
+ "matches compareJavaString (expected) for [%s] vs [%s]",
string1,
string2
),
cmp < 0 ? Matchers.lessThan(0) : cmp > 0 ? Matchers.greaterThan(0) : Matchers.equalTo(0)
(int) Math.signum(string1.compareTo(string2)),
(int) Math.signum(compareByteBufferUtilsUtf8)
);
}
}

View File

@ -19,6 +19,7 @@
package org.apache.druid.java.util.common;
import com.google.common.collect.ImmutableList;
import org.apache.druid.collections.ResourceHolder;
import org.junit.Assert;
import org.junit.Rule;
@ -28,12 +29,31 @@ import org.junit.rules.ExpectedException;
import java.io.UnsupportedEncodingException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.util.List;
/**
*
*/
public class StringUtilsTest
{
private static final List<String> COMPARE_TEST_STRINGS = ImmutableList.of(
"(請參見已被刪除版本)",
"請參見已被刪除版本",
"שָׁלוֹם",
"{{[[Template:別名重定向|別名重定向]]}}",
"\uD83D\uDC4D\uD83D\uDC4D\uD83D\uDC4D",
"\uD83D\uDCA9",
"",
"f",
"fo",
"\uD83D\uDE42",
"\uD83E\uDEE5",
"\uD83E\uDD20",
"quick",
"brown",
"fox"
);
@Rule
public ExpectedException expectedException = ExpectedException.none();
@ -290,4 +310,80 @@ public class StringUtilsTest
Assert.assertEquals("smile ", StringUtils.fastLooseChop("smile 🙂 for the camera", 6));
Assert.assertEquals("smile", StringUtils.fastLooseChop("smile 🙂 for the camera", 5));
}
@Test
public void testUnicodeStringCompare()
{
for (final String string1 : COMPARE_TEST_STRINGS) {
for (final String string2 : COMPARE_TEST_STRINGS) {
final int compareUnicode = StringUtils.compareUnicode(string1, string2);
final int compareUtf8 = StringUtils.compareUtf8(
StringUtils.toUtf8(string1),
StringUtils.toUtf8(string2)
);
Assert.assertEquals(
StringUtils.format(
"compareUnicode (actual) matches compareUtf8 (expected) for [%s] vs [%s]",
string1,
string2
),
(int) Math.signum(compareUtf8),
(int) Math.signum(compareUnicode)
);
}
}
}
@Test
public void testJavaStringCompare()
{
for (final String string1 : COMPARE_TEST_STRINGS) {
for (final String string2 : COMPARE_TEST_STRINGS) {
final int compareJavaString = string1.compareTo(string2);
final byte[] utf8Bytes1 = StringUtils.toUtf8(string1);
final byte[] utf8Bytes2 = StringUtils.toUtf8(string2);
final int compareByteArrayUtf8UsingJavaStringOrdering =
StringUtils.compareUtf8UsingJavaStringOrdering(utf8Bytes1, utf8Bytes2);
final ByteBuffer utf8ByteBuffer1 = ByteBuffer.allocate(utf8Bytes1.length + 2);
final ByteBuffer utf8ByteBuffer2 = ByteBuffer.allocate(utf8Bytes2.length + 2);
utf8ByteBuffer1.position(1);
utf8ByteBuffer1.put(utf8Bytes1, 0, utf8Bytes1.length).position(utf8Bytes1.length);
utf8ByteBuffer2.position(1);
utf8ByteBuffer2.put(utf8Bytes2, 0, utf8Bytes2.length).position(utf8Bytes2.length);
final int compareByteBufferUtf8UsingJavaStringOrdering = StringUtils.compareUtf8UsingJavaStringOrdering(
utf8ByteBuffer1,
1,
utf8Bytes1.length,
utf8ByteBuffer2,
1,
utf8Bytes2.length
);
Assert.assertEquals(
StringUtils.format(
"compareUtf8UsingJavaStringOrdering(byte[]) (actual) "
+ "matches compareJavaString (expected) for [%s] vs [%s]",
string1,
string2
),
(int) Math.signum(compareJavaString),
(int) Math.signum(compareByteArrayUtf8UsingJavaStringOrdering)
);
Assert.assertEquals(
StringUtils.format(
"compareByteBufferUtf8UsingJavaStringOrdering(ByteBuffer) (actual) "
+ "matches compareJavaString (expected) for [%s] vs [%s]",
string1,
string2
),
(int) Math.signum(compareJavaString),
(int) Math.signum(compareByteBufferUtf8UsingJavaStringOrdering)
);
}
}
}
}

View File

@ -205,7 +205,7 @@ public class FrameWriterUtils
/**
* Copies "len" bytes from {@code src.position()} to "dstPosition" in "memory". Does not update the position of src.
*
* @throws InvalidNullByteException "allowNullBytes" is true and a null byte is encountered
* @throws InvalidNullByteException if "allowNullBytes" is false and a null byte is encountered
*/
public static void copyByteBufferToMemory(
final ByteBuffer src,

View File

@ -674,7 +674,7 @@ public class InDimFilter extends AbstractOptimizableDimFilter implements Filter
public SortedSet<ByteBuffer> toUtf8()
{
final TreeSet<ByteBuffer> valuesUtf8 = new TreeSet<>(ByteBufferUtils.unsignedComparator());
final TreeSet<ByteBuffer> valuesUtf8 = new TreeSet<>(ByteBufferUtils.utf8Comparator());
for (final String value : values) {
if (value == null) {

View File

@ -47,9 +47,15 @@ public class StringComparators
public static final int STRLEN_CACHE_ID = 0x04;
public static final int VERSION_CACHE_ID = 0x05;
/**
* Comparison using the natural comparator of {@link String}.
*
* Note that this is not equivalent to comparing UTF-8 byte arrays; see javadocs for
* {@link org.apache.druid.java.util.common.StringUtils#compareUnicode(String, String)} and
* {@link org.apache.druid.java.util.common.StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}.
*/
public static class LexicographicComparator extends StringComparator
{
// Equivalent to comparing UTF-8 encoded strings as byte arrays.
private static final Ordering<String> ORDERING = Ordering.from(String::compareTo).nullsFirst();
@Override

View File

@ -379,7 +379,7 @@ public class IndexIO
// Duplicate the first buffer since we are reading the dictionary twice.
dimValueLookups.put(dimension, GenericIndexed.read(dimBuffer.duplicate(), GenericIndexed.STRING_STRATEGY));
dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY));
dimValueUtf8Lookups.put(dimension, GenericIndexed.read(dimBuffer, GenericIndexed.UTF8_STRATEGY));
dimColumns.put(dimension, VSizeColumnarMultiInts.readFromByteBuffer(dimBuffer));
}

View File

@ -47,7 +47,7 @@ public final class IndexedUtf8ValueSetIndex<TDictionary extends Indexed<ByteBuff
// sorted merge instead of binary-search based algorithm.
private static final double SORTED_MERGE_RATIO_THRESHOLD = 0.12D;
private static final int SIZE_WORTH_CHECKING_MIN = 8;
private static final Comparator<ByteBuffer> COMPARATOR = ByteBufferUtils.unsignedComparator();
private static final Comparator<ByteBuffer> COMPARATOR = ByteBufferUtils.utf8Comparator();
private final BitmapFactory bitmapFactory;
private final TDictionary dictionary;

View File

@ -29,7 +29,7 @@ public interface Utf8ValueSetIndex
/**
* Get an {@link Iterable} of {@link ImmutableBitmap} corresponding to the specified set of values (if they are
* contained in the underlying column). The set must be sorted using
* {@link org.apache.druid.java.util.common.ByteBufferUtils#unsignedComparator()}.
* {@link org.apache.druid.java.util.common.ByteBufferUtils#utf8Comparator()}.
*/
BitmapColumnIndex forSortedValuesUtf8(SortedSet<ByteBuffer> valuesUtf8);
}

View File

@ -23,6 +23,7 @@ import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
import javax.annotation.Nullable;
@ -340,10 +341,13 @@ public final class FrontCodedIndexed implements Indexed<ByteBuffer>
/**
* Performs an unsigned byte comparison of the first value in a bucket with the specified value. Note that this method
* Performs byte-by-byte comparison of the first value in a bucket with the specified value. Note that this method
* MUST be prepared before calling, as it expects the length of the first value to have already been read externally,
* and the buffer position to be at the start of the first bucket value. The final buffer position will be the
* 'shared prefix length' of the first value in the bucket and the value to compare
* 'shared prefix length' of the first value in the bucket and the value to compare.
*
* Bytes are compared using {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte, byte)}. Therefore, when the
* values are UTF-8 encoded strings, the ordering is compatible with {@link String#compareTo(String)}.
*/
private static int compareBucketFirstValue(ByteBuffer bucketBuffer, int length, ByteBuffer value)
{
@ -355,7 +359,7 @@ public final class FrontCodedIndexed implements Indexed<ByteBuffer>
int sharedPrefix;
int comparison = 0;
for (sharedPrefix = 0; sharedPrefix < commonLength; sharedPrefix++) {
comparison = unsignedByteCompare(bucketBuffer.get(), value.get(sharedPrefix));
comparison = StringUtils.compareUtf8UsingJavaStringOrdering(bucketBuffer.get(), value.get(sharedPrefix));
if (comparison != 0) {
bucketBuffer.position(startOffset + sharedPrefix);
break;
@ -403,7 +407,10 @@ public final class FrontCodedIndexed implements Indexed<ByteBuffer>
final int common = Math.min(fragmentLength, value.remaining() - prefixLength);
int fragmentComparison = 0;
for (int i = 0; i < common; i++) {
fragmentComparison = unsignedByteCompare(buffer.get(buffer.position() + i), value.get(prefixLength + i));
fragmentComparison = StringUtils.compareUtf8UsingJavaStringOrdering(
buffer.get(buffer.position() + i),
value.get(prefixLength + i)
);
if (fragmentComparison != 0) {
break;
}
@ -502,9 +509,4 @@ public final class FrontCodedIndexed implements Indexed<ByteBuffer>
}
return bucketBuffers;
}
public static int unsignedByteCompare(byte b1, byte b2)
{
return (b1 & 0xFF) - (b2 & 0xFF);
}
}

View File

@ -44,8 +44,8 @@ import java.nio.channels.WritableByteChannel;
* the bucket is written entirely, and remaining values are stored as pairs of an integer which indicates how much
* of the first byte array of the bucket to use as a prefix, followed by the remaining value bytes after the prefix.
*
* This is valid to use for any values which can be compared byte by byte with unsigned comparison. Otherwise, this
* is not the collection for you.
* This writer is designed for use with UTF-8 encoded strings that are written in an order compatible with
* {@link String#compareTo(String)}.
*
* @see FrontCodedIndexed for additional details.
*/
@ -99,7 +99,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter<byte[]>
@Override
public void write(@Nullable byte[] value) throws IOException
{
if (prevObject != null && unsignedCompare(prevObject, value) >= 0) {
if (prevObject != null && compareNullableUtf8UsingJavaStringOrdering(prevObject, value) >= 0) {
throw new ISE(
"Values must be sorted and unique. Element [%s] with value [%s] is before or equivalent to [%s]",
numWritten,
@ -283,7 +283,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter<byte[]>
// all other values must be partitioned into a prefix length and suffix bytes
int prefixLength = 0;
for (; prefixLength < first.length; prefixLength++) {
final int cmp = FrontCodedIndexed.unsignedByteCompare(first[prefixLength], next[prefixLength]);
final int cmp = StringUtils.compareUtf8UsingJavaStringOrdering(first[prefixLength], next[prefixLength]);
if (cmp != 0) {
break;
}
@ -325,7 +325,11 @@ public class FrontCodedIndexedWriter implements DictionaryWriter<byte[]>
return buffer.position() - pos;
}
public static int unsignedCompare(
/**
* Same as {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}, but accepts nulls. Nulls are
* sorted first.
*/
private static int compareNullableUtf8UsingJavaStringOrdering(
@Nullable final byte[] b1,
@Nullable final byte[] b2
)
@ -337,15 +341,7 @@ public class FrontCodedIndexedWriter implements DictionaryWriter<byte[]>
if (b2 == null) {
return 1;
}
final int commonLength = Math.min(b1.length, b2.length);
for (int i = 0; i < commonLength; i++) {
final int cmp = FrontCodedIndexed.unsignedByteCompare(b1[i], b2[i]);
if (cmp != 0) {
return cmp;
}
}
return Integer.compare(b1.length, b2.length);
return StringUtils.compareUtf8UsingJavaStringOrdering(b1, b2);
}
}

View File

@ -100,13 +100,16 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
private static final SerializerUtils SERIALIZER_UTILS = new SerializerUtils();
/**
* An ObjectStrategy that returns a big-endian ByteBuffer pointing to the original data.
* An ObjectStrategy that returns a big-endian ByteBuffer pointing to original data.
*
* The returned ByteBuffer is a fresh read-only instance, so it is OK for callers to modify its position, limit, etc.
* However, it does point to the original data, so callers must take care not to use it if the original data may
* have been freed.
*
* The compare method of this instance uses {@link StringUtils#compareUtf8UsingJavaStringOrdering(byte[], byte[])}
* so that behavior is consistent with {@link #STRING_STRATEGY}.
*/
public static final ObjectStrategy<ByteBuffer> BYTE_BUFFER_STRATEGY = new ObjectStrategy<ByteBuffer>()
public static final ObjectStrategy<ByteBuffer> UTF8_STRATEGY = new ObjectStrategy<ByteBuffer>()
{
@Override
public Class<ByteBuffer> getClazz()
@ -140,7 +143,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
@Override
public int compare(@Nullable ByteBuffer o1, @Nullable ByteBuffer o2)
{
return ByteBufferUtils.unsignedComparator().compare(o1, o2);
return ByteBufferUtils.utf8Comparator().compare(o1, o2);
}
};
@ -541,7 +544,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
}
//noinspection ObjectEquality
final boolean isByteBufferStrategy = strategy == BYTE_BUFFER_STRATEGY;
final boolean isByteBufferStrategy = strategy == UTF8_STRATEGY;
int minIndex = 0;
int maxIndex = size - 1;
@ -553,7 +556,7 @@ public class GenericIndexed<T> implements CloseableIndexed<T>, Serializer
if (isByteBufferStrategy) {
// Specialization avoids ByteBuffer allocation in strategy.fromByteBuffer.
ByteBuffer currValue = getByteBuffer(currIndex);
comparison = ByteBufferUtils.compareByteBuffers(currValue, (ByteBuffer) value);
comparison = ByteBufferUtils.compareUtf8ByteBuffers(currValue, (ByteBuffer) value);
} else {
T currValue = get(currIndex);
comparison = strategy.compare(currValue, value);

View File

@ -107,7 +107,7 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
// this cannot happen naturally right now since generic indexed is written in the 'legacy' format, but
// this provides backwards compatibility should we switch at some point in the future to always
// writing dictionaryVersion
dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY, mapper);
dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.UTF8_STRATEGY, mapper);
frontCodedDictionarySupplier = null;
} else {
throw new ISE("impossible, unknown encoding strategy id: %s", encodingId);
@ -117,7 +117,7 @@ public class NestedDataColumnSupplier implements Supplier<ComplexColumn>
// as dictionaryVersion is actually also the GenericIndexed version, so we reset start position so the
// GenericIndexed version can be correctly read
stringDictionaryBuffer.position(dictionaryStartPosition);
dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY, mapper);
dictionary = GenericIndexed.read(stringDictionaryBuffer, GenericIndexed.UTF8_STRATEGY, mapper);
frontCodedDictionarySupplier = null;
}
final ByteBuffer longDictionaryBuffer = loadInternalFile(

View File

@ -353,7 +353,7 @@ public class DictionaryEncodedColumnPartSerde implements ColumnPartSerde
final GenericIndexed<ByteBuffer> rDictionaryUtf8 = GenericIndexed.read(
buffer,
GenericIndexed.BYTE_BUFFER_STRATEGY,
GenericIndexed.UTF8_STRATEGY,
builder.getFileMapper()
);

View File

@ -241,7 +241,9 @@ public class FrontCodedIndexedTest extends InitializedNullHandlingTest
public void testFrontCodedIndexedUnicodes() throws IOException
{
ByteBuffer buffer = ByteBuffer.allocate(1 << 12).order(order);
List<String> theList = ImmutableList.of("Győ-Moson-Sopron", "Győr");
// "\uD83D\uDCA9" and "(請參見已被刪除版本)" are a regression test for https://github.com/apache/druid/pull/13364
List<String> theList = ImmutableList.of("Győ-Moson-Sopron", "Győr", "\uD83D\uDCA9", "(請參見已被刪除版本)");
fillBuffer(buffer, theList, 4);
buffer.position(0);

View File

@ -121,7 +121,7 @@ public class ExtractionDimFilterTest extends InitializedNullHandlingTest
GenericIndexed.fromIterable(Collections.singletonList("foo1"), GenericIndexed.STRING_STRATEGY),
GenericIndexed.fromIterable(
Collections.singletonList(ByteBuffer.wrap(StringUtils.toUtf8("foo1"))),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
),
GenericIndexed.fromIterable(Collections.singletonList(foo1BitMap), serdeFactory.getObjectStrategy()),
null

View File

@ -75,7 +75,7 @@ public class PredicateValueMatcherFactoryTest extends InitializedNullHandlingTes
ByteBuffer.wrap(StringUtils.toUtf8("v2")),
ByteBuffer.wrap(StringUtils.toUtf8("v3"))
),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
),
null,
() -> VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new int[]{1}))),
@ -98,7 +98,7 @@ public class PredicateValueMatcherFactoryTest extends InitializedNullHandlingTes
ByteBuffer.wrap(StringUtils.toUtf8("v2")),
ByteBuffer.wrap(StringUtils.toUtf8("v3"))
),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
),
null,
() -> VSizeColumnarMultiInts.fromIterable(ImmutableList.of(VSizeColumnarInts.fromArray(new int[]{1}))),

View File

@ -49,7 +49,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest
GenericIndexed.fromIterable(ImmutableList.of("value"), GenericIndexed.STRING_STRATEGY),
GenericIndexed.fromIterable(
ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
),
() -> VSizeColumnarInts.fromArray(new int[]{0}),
null,
@ -62,7 +62,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest
ByteBuffer.wrap(StringUtils.toUtf8("value")),
ByteBuffer.wrap(StringUtils.toUtf8("value2"))
),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
),
() -> VSizeColumnarInts.fromArray(new int[]{0, 0, 1, 0, 1}),
null,
@ -72,7 +72,7 @@ public class ValueMatchersTest extends InitializedNullHandlingTest
GenericIndexed.fromIterable(ImmutableList.of("value"), GenericIndexed.STRING_STRATEGY),
GenericIndexed.fromIterable(
ImmutableList.of(ByteBuffer.wrap(StringUtils.toUtf8("value"))),
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
),
null,
() -> VSizeColumnarMultiInts.fromIterable(

View File

@ -127,7 +127,7 @@ public class NestedFieldLiteralColumnIndexSupplierTest extends InitializedNullHa
doubleWriter.write(9.9);
writeToBuffer(doubleBuffer, doubleWriter);
GenericIndexed<ByteBuffer> strings = GenericIndexed.read(stringBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY);
GenericIndexed<ByteBuffer> strings = GenericIndexed.read(stringBuffer, GenericIndexed.UTF8_STRATEGY);
globalStrings = () -> strings.singleThreaded();
globalLongs = FixedIndexed.read(longBuffer, TypeStrategies.LONG, ByteOrder.nativeOrder(), Long.BYTES);
globalDoubles = FixedIndexed.read(doubleBuffer, TypeStrategies.DOUBLE, ByteOrder.nativeOrder(), Double.BYTES);

View File

@ -115,7 +115,7 @@ public class DictionaryEncodedStringIndexSupplierTest extends InitializedNullHan
GenericIndexedWriter<ByteBuffer> byteBufferWriter = new GenericIndexedWriter<>(
new OnHeapMemorySegmentWriteOutMedium(),
"byteBuffers",
GenericIndexed.BYTE_BUFFER_STRATEGY
GenericIndexed.UTF8_STRATEGY
);
stringWriter.open();
@ -167,7 +167,7 @@ public class DictionaryEncodedStringIndexSupplierTest extends InitializedNullHan
return new DictionaryEncodedStringIndexSupplier(
roaringFactory.getBitmapFactory(),
GenericIndexed.read(stringBuffer, GenericIndexed.STRING_STRATEGY),
GenericIndexed.read(byteBuffer, GenericIndexed.BYTE_BUFFER_STRATEGY),
GenericIndexed.read(byteBuffer, GenericIndexed.UTF8_STRATEGY),
bitmaps,
null
);