SQL: Support larger strings in binary protocol (elastic/x-pack-elasticsearch#3130)

While we're fairly sure we're going to remove the binary protocol in the
long run, we're also fairly sure we're going to release the first
version of SQL with the binary protocol. One big problem with it is that
it blows up when it attempts to serialize fairly long strings. These
long strings are actually quite common in the CLI. They are also
possible in JDBC. I say "fairly long strings" because exactly how long
the strings has to be is kind of funky. It is based on the number of
bytes that it takes to encode the string, and the strings are encoded in
a utf-8-like encoding of utf-16 encoded string documented here:
https://docs.oracle.com/javase/7/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)

Anyway, this fixes the protocol for these "fairly long strings" by
chunking the strings and adding an extra 4 byte integer before each
string to count the number of chunks. After that 4 byte integer the
strings are serialized using the "normal" DataInput/DataOutput encoding,
the funny utf-8-like encoding of the utf-16 encoded string.

relates elastic/x-pack-elasticsearch#3018

Original commit: elastic/x-pack-elasticsearch@11f0d59f20
This commit is contained in:
Nik Everett 2017-11-27 18:59:31 -05:00 committed by GitHub
parent df802b40c8
commit 193f22b97f
3 changed files with 92 additions and 11 deletions

View File

@ -34,6 +34,23 @@ import java.io.IOException;
return version;
}
/**
* Override the built-in {@link DataInput#readUTF()}
* to support strings that need more than 65535 charcters.
*/
@Override
public String readUTF() throws IOException {
int splits = delegate.readInt();
if (splits == 0) {
return delegate.readUTF();
}
StringBuilder b = new StringBuilder(SqlDataOutput.WORST_CASE_SPLIT * splits);
for (int i = 0; i < splits; i++) {
b.append(delegate.readUTF());
}
return b.toString();
}
@Override
public void readFully(byte[] b) throws IOException {
delegate.readFully(b);
@ -103,9 +120,4 @@ import java.io.IOException;
public String readLine() throws IOException {
return delegate.readLine();
}
@Override
public String readUTF() throws IOException {
return delegate.readUTF();
}
}
}

View File

@ -35,6 +35,34 @@ public final class SqlDataOutput implements DataOutput {
return version;
}
/**
* The maximum size of a string to submit to {@link #delegate}'s
* {@link DataOutput#writeUTF(String)}. The {@code 65535} is the
* number of bytes that the string can be encoded to. The {@code 3}
* is the "worst case" for the number of bytes used to encode each
* char.
*/
static final int WORST_CASE_SPLIT = 65535 / 3;
/**
* Override the built-in {@link DataOutput#writeUTF(String)}
* to support strings that need more than 65535 charcters.
*/
@Override
public void writeUTF(String s) throws IOException {
int splits = s.length() / WORST_CASE_SPLIT + 1;
delegate.writeInt(splits);
int start = 0;
while (true) {
int end = Math.min(s.length(), start + WORST_CASE_SPLIT);
delegate.writeUTF(s.substring(start, end));
if (end == s.length()) {
break;
}
start = end;
}
}
@Override
public void write(int b) throws IOException {
delegate.write(b);
@ -99,9 +127,4 @@ public final class SqlDataOutput implements DataOutput {
public void writeChars(String s) throws IOException {
delegate.writeChars(s);
}
@Override
public void writeUTF(String s) throws IOException {
delegate.writeUTF(s);
}
}

View File

@ -0,0 +1,46 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.sql.protocol.shared;
import org.apache.http.client.entity.DeflateInputStream;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.test.ESTestCase;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class SqlDataInputOutputTests extends ESTestCase {
public void testSmallString() throws IOException {
assertRoundTripString("t");
assertRoundTripString("test");
assertRoundTripString(randomAlphaOfLengthBetween(500, 1000));
}
public void testLargeAscii() throws IOException {
assertRoundTripString(randomAlphaOfLengthBetween(65535, 655350));
}
public void testUnicode() throws IOException {
assertRoundTripString(randomRealisticUnicodeOfLengthBetween(65535 / 3, 65535));
assertRoundTripString(randomRealisticUnicodeOfLengthBetween(65535, 655350));
}
/**
* Round trip a string using {@link SqlDataOutput#writeUTF(String)}
* and {@link SqlDataInput#readUTF()}.
*/
private void assertRoundTripString(String string) throws IOException {
try (BytesStreamOutput out = new BytesStreamOutput()) {
SqlDataOutput sout = new SqlDataOutput(new DataOutputStream(out), 0);
sout.writeUTF(string);
try (StreamInput in = out.bytes().streamInput()) {
SqlDataInput sin = new SqlDataInput(new DataInputStream(in), 0);
assertEquals(string, sin.readUTF());
}
}
}
}