From e0cfd9ee19f8e9013592f585c10b5bb660173fce Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Tue, 15 Mar 2016 16:38:37 +0900 Subject: [PATCH] Utility method for length estimation of utf8 --- .../io/druid/common/utils/StringUtils.java | 47 +++++++++++ .../druid/common/utils/StringUtilsTest.java | 80 +++++++++++++++++++ .../druid/query/metadata/SegmentAnalyzer.java | 6 +- 3 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 common/src/main/java/io/druid/common/utils/StringUtils.java create mode 100644 common/src/test/java/io/druid/common/utils/StringUtilsTest.java diff --git a/common/src/main/java/io/druid/common/utils/StringUtils.java b/common/src/main/java/io/druid/common/utils/StringUtils.java new file mode 100644 index 00000000000..11b4482421a --- /dev/null +++ b/common/src/main/java/io/druid/common/utils/StringUtils.java @@ -0,0 +1,47 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.druid.common.utils; + +/** Extends com.metamx.common.StringUtils with a UTF-8 byte-length estimator. + */ +public class StringUtils extends com.metamx.common.StringUtils +{ + // Estimation only: sums a per-char UTF-8 byte count without actually encoding the string. + // For well-formed strings the result equals StringUtils.toUtf8(value).length. + // Surrogates are not validated: any surrogate char counts as 4 bytes and the following char is skipped, so the result can be over-estimated for malformed strings (see UT). + public static int estimatedBinaryLengthAsUTF8(String value) + { + int length = 0; + for (int i = 0; i < value.length(); i++) { + char var10 = value.charAt(i); + if (var10 < 0x80) { // U+0000..U+007F: one byte + length += 1; + } else if (var10 < 0x800) { // U+0080..U+07FF: two bytes + length += 2; + } else if (Character.isSurrogate(var10)) { // treated as one half of a surrogate pair: four bytes for the pair + length += 4; + i++; // skip the char assumed to be the other half of the pair + } else { // remaining BMP chars: three bytes + length += 3; + } + } + return length; + } +} diff --git a/common/src/test/java/io/druid/common/utils/StringUtilsTest.java b/common/src/test/java/io/druid/common/utils/StringUtilsTest.java new file mode 100644 index 00000000000..42891751229 --- /dev/null +++ b/common/src/test/java/io/druid/common/utils/StringUtilsTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to Metamarkets Group Inc. (Metamarkets) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Metamarkets licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package io.druid.common.utils; + +import org.junit.Assert; +import org.junit.Test; + +import java.io.UnsupportedEncodingException; + +/** Tests for StringUtils.estimatedBinaryLengthAsUTF8. + */ +public class StringUtilsTest +{ + // copied from https://github.com/druid-io/druid/pull/2612 + public final static String[] TEST_STRINGS = new String[]{ + "peach", "péché", "pêche", "sin", "", + "☃", "C", "c", "Ç", "ç", "G", "g", "Ğ", "ğ", "I", "ı", "İ", "i", + "O", "o", "Ö", "ö", "S", "s", "Ş", "ş", "U", "u", "Ü", "ü", "ä", + "\uD841\uDF0E", + "\uD841\uDF31", + "\uD844\uDC5C", + "\uD84F\uDCB7", + "\uD860\uDEE2", + "\uD867\uDD98", + "\u006E\u0303", + "\u006E", + "\uFB00", + "\u0066\u0066", + "Å", + "\u00C5", + "\u212B" + }; + + @Test + public void binaryLengthAsUTF8Test() throws UnsupportedEncodingException + { + for (String string : TEST_STRINGS) { // well-formed strings: the estimate must equal the encoded length + Assert.assertEquals(StringUtils.toUtf8(string).length, StringUtils.estimatedBinaryLengthAsUTF8(string)); + } + } + + @Test + public void binaryLengthAsUTF8InvalidTest() throws UnsupportedEncodingException + { + // malformed surrogate input: the estimate is 4 while the encoded length is 1 or 2; this could be fixed but the case looks trivial + String invalid = "\uD841"; // high surrogate only + Assert.assertEquals(1, StringUtils.toUtf8(invalid).length); + Assert.assertEquals(4, StringUtils.estimatedBinaryLengthAsUTF8(invalid)); + + invalid = "\uD841\uD841"; // high surrogate + high surrogate + Assert.assertEquals(2, StringUtils.toUtf8(invalid).length); + Assert.assertEquals(4, StringUtils.estimatedBinaryLengthAsUTF8(invalid)); + + invalid = "\uD841\u0050"; // high surrogate + ordinary char + Assert.assertEquals(2, StringUtils.toUtf8(invalid).length); + Assert.assertEquals(4, StringUtils.estimatedBinaryLengthAsUTF8(invalid)); + + invalid = "\uDEE2\uD841"; // low surrogate + high surrogate (reversed order) + Assert.assertEquals(2, StringUtils.toUtf8(invalid).length); + Assert.assertEquals(4, StringUtils.estimatedBinaryLengthAsUTF8(invalid)); + } + +} diff --git a/processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java b/processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java index 00abd48c703..d67cb5bc6d2 
100644 --- a/processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java +++ b/processing/src/main/java/io/druid/query/metadata/SegmentAnalyzer.java @@ -26,10 +26,10 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import com.google.common.primitives.Longs; -import com.metamx.common.StringUtils; import com.metamx.common.guava.Accumulator; import com.metamx.common.guava.Sequence; import com.metamx.common.logger.Logger; +import io.druid.common.utils.StringUtils; import io.druid.granularity.QueryGranularity; import io.druid.query.dimension.DefaultDimensionSpec; import io.druid.query.metadata.metadata.ColumnAnalysis; @@ -206,7 +206,7 @@ public class SegmentAnalyzer for (int i = 0; i < cardinality; ++i) { String value = bitmapIndex.getValue(i); if (value != null) { - size += StringUtils.toUtf8(value).length * bitmapIndex.getBitmap(bitmapIndex.getIndex(value)).size(); + size += StringUtils.estimatedBinaryLengthAsUTF8(value) * bitmapIndex.getBitmap(bitmapIndex.getIndex(value)).size(); } } } @@ -272,7 +272,7 @@ public class SegmentAnalyzer for (int i = 0; i < vals.size(); ++i) { final String dimVal = selector.lookupName(vals.get(i)); if (dimVal != null && !dimVal.isEmpty()) { - current += StringUtils.toUtf8(dimVal).length; + current += StringUtils.estimatedBinaryLengthAsUTF8(dimVal); } } cursor.advance();