From 193f22b97f96b4f3877af66ad8aa881ef6908d7f Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Mon, 27 Nov 2017 18:59:31 -0500 Subject: [PATCH] SQL: Support larger strings in binary protocol (elastic/x-pack-elasticsearch#3130) While we're fairly sure we're going to remove the binary protocol in the long run, we're also fairly sure we're going to release the first version of SQL with the binary protocol. One big problem with it is that it blows up when it attempts to serialize fairly long strings. These long strings are actually quite common in the CLI. They are also possible in JDBC. I say "fairly long strings" because exactly how long the strings has to be is kind of funky. It is based on the number of bytes that it takes to encode the string, and the strings are encoded in a utf-8-like encoding of utf-16 encoded string documented here: https://docs.oracle.com/javase/7/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String) Anyway, this fixes the protocol for these "fairly long strings" by chunking the strings and adding an extra 4 byte integer before each string to count the number of chunks. After that 4 byte integer the strings are serialized using the "normal" DataInput/DataOutput encoding, the funny utf-8-like encoding of the utf-16 encoded string. relates elastic/x-pack-elasticsearch#3018 Original commit: elastic/x-pack-elasticsearch@11f0d59f203815e97a559266697273002b1723c8 --- .../sql/protocol/shared/SqlDataInput.java | 24 +++++++--- .../sql/protocol/shared/SqlDataOutput.java | 33 +++++++++++-- .../shared/SqlDataInputOutputTests.java | 46 +++++++++++++++++++ 3 files changed, 92 insertions(+), 11 deletions(-) create mode 100644 sql/shared-proto/src/test/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataInputOutputTests.java diff --git a/sql/shared-proto/src/main/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataInput.java b/sql/shared-proto/src/main/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataInput.java index e6d692661ce..14941f066cd 100644 --- a/sql/shared-proto/src/main/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataInput.java +++ b/sql/shared-proto/src/main/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataInput.java @@ -34,6 +34,23 @@ import java.io.IOException; return version; } + /** + * Override the built-in {@link DataInput#readUTF()} + * to support strings that need more than 65535 charcters. + */ + @Override + public String readUTF() throws IOException { + int splits = delegate.readInt(); + if (splits == 0) { + return delegate.readUTF(); + } + StringBuilder b = new StringBuilder(SqlDataOutput.WORST_CASE_SPLIT * splits); + for (int i = 0; i < splits; i++) { + b.append(delegate.readUTF()); + } + return b.toString(); + } + @Override public void readFully(byte[] b) throws IOException { delegate.readFully(b); @@ -103,9 +120,4 @@ import java.io.IOException; public String readLine() throws IOException { return delegate.readLine(); } - - @Override - public String readUTF() throws IOException { - return delegate.readUTF(); - } -} \ No newline at end of file +} diff --git a/sql/shared-proto/src/main/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataOutput.java b/sql/shared-proto/src/main/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataOutput.java index a52afe21562..1171c3c6c16 100644 --- a/sql/shared-proto/src/main/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataOutput.java +++ b/sql/shared-proto/src/main/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataOutput.java @@ -35,6 +35,34 @@ public final class SqlDataOutput implements DataOutput { return version; } + /** + * The maximum size of a string to submit to {@link #delegate}'s + * {@link DataOutput#writeUTF(String)}. The {@code 65535} is the + * number of bytes that the string can be encoded to. The {@code 3} + * is the "worst case" for the number of bytes used to encode each + * char. + */ + static final int WORST_CASE_SPLIT = 65535 / 3; + /** + * Override the built-in {@link DataOutput#writeUTF(String)} + * to support strings that need more than 65535 charcters. + */ + @Override + public void writeUTF(String s) throws IOException { + int splits = s.length() / WORST_CASE_SPLIT + 1; + delegate.writeInt(splits); + + int start = 0; + while (true) { + int end = Math.min(s.length(), start + WORST_CASE_SPLIT); + delegate.writeUTF(s.substring(start, end)); + if (end == s.length()) { + break; + } + start = end; + } + } + @Override public void write(int b) throws IOException { delegate.write(b); @@ -99,9 +127,4 @@ public final class SqlDataOutput implements DataOutput { public void writeChars(String s) throws IOException { delegate.writeChars(s); } - - @Override - public void writeUTF(String s) throws IOException { - delegate.writeUTF(s); - } } diff --git a/sql/shared-proto/src/test/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataInputOutputTests.java b/sql/shared-proto/src/test/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataInputOutputTests.java new file mode 100644 index 00000000000..7e438c0ee7e --- /dev/null +++ b/sql/shared-proto/src/test/java/org/elasticsearch/xpack/sql/protocol/shared/SqlDataInputOutputTests.java @@ -0,0 +1,46 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.sql.protocol.shared; + +import org.apache.http.client.entity.DeflateInputStream; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.test.ESTestCase; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + +public class SqlDataInputOutputTests extends ESTestCase { + public void testSmallString() throws IOException { + assertRoundTripString("t"); + assertRoundTripString("test"); + assertRoundTripString(randomAlphaOfLengthBetween(500, 1000)); + } + + public void testLargeAscii() throws IOException { + assertRoundTripString(randomAlphaOfLengthBetween(65535, 655350)); + } + + public void testUnicode() throws IOException { + assertRoundTripString(randomRealisticUnicodeOfLengthBetween(65535 / 3, 65535)); + assertRoundTripString(randomRealisticUnicodeOfLengthBetween(65535, 655350)); + } + + /** + * Round trip a string using {@link SqlDataOutput#writeUTF(String)} + * and {@link SqlDataInput#readUTF()}. + */ + private void assertRoundTripString(String string) throws IOException { + try (BytesStreamOutput out = new BytesStreamOutput()) { + SqlDataOutput sout = new SqlDataOutput(new DataOutputStream(out), 0); + sout.writeUTF(string); + try (StreamInput in = out.bytes().streamInput()) { + SqlDataInput sin = new SqlDataInput(new DataInputStream(in), 0); + assertEquals(string, sin.readUTF()); + } + } + } +}