Fix content type detection with leading whitespace (#32632)

Today, content type detection on an input stream works by peeking up to
twenty bytes into the stream. If the stream begins with more than twenty
bytes of whitespace, we might fail to detect the content type. We should
be ignoring this whitespace before attempting to detect the content
type. This commit does that by ignoring all leading whitespace in an
input stream before attempting to guess the content type.
This commit is contained in:
Jason Tedor 2018-08-06 18:07:46 -04:00 committed by GitHub
parent b46e13629f
commit 3fb0923182
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 45 additions and 10 deletions

View File

@ -35,7 +35,7 @@ import java.io.OutputStream;
*/
public class XContentFactory {
private static final int GUESS_HEADER_LENGTH = 20;
static final int GUESS_HEADER_LENGTH = 20;
/**
* Returns a content builder using JSON format ({@link org.elasticsearch.common.xcontent.XContentType#JSON}.
@ -153,8 +153,10 @@ public class XContentFactory {
return XContentType.JSON;
}
// Should we throw a failure here? Smile idea is to use it in bytes....
if (length > 2 && first == SmileConstants.HEADER_BYTE_1 && content.charAt(1) == SmileConstants.HEADER_BYTE_2 &&
content.charAt(2) == SmileConstants.HEADER_BYTE_3) {
if (length > 2
&& first == SmileConstants.HEADER_BYTE_1
&& content.charAt(1) == SmileConstants.HEADER_BYTE_2
&& content.charAt(2) == SmileConstants.HEADER_BYTE_3) {
return XContentType.SMILE;
}
if (length > 2 && first == '-' && content.charAt(1) == '-' && content.charAt(2) == '-') {
@ -227,13 +229,29 @@ public class XContentFactory {
*/
@Deprecated
public static XContentType xContentType(InputStream si) throws IOException {
/*
* We need to guess the content type. To do this, we look for the first non-whitespace character and then try to guess the content
* type on the GUESS_HEADER_LENGTH bytes that follow. We do this in a way that does not modify the initial read position in the
* underlying input stream. This is why the input stream must support mark/reset and why we repeatedly mark the read position and
* reset.
*/
if (si.markSupported() == false) {
throw new IllegalArgumentException("Cannot guess the xcontent type without mark/reset support on " + si.getClass());
}
si.mark(GUESS_HEADER_LENGTH);
si.mark(Integer.MAX_VALUE);
try {
// scan until we find the first non-whitespace character or the end of the stream
int current;
do {
current = si.read();
if (current == -1) {
return null;
}
} while (Character.isWhitespace((char) current));
// now guess the content type off the next GUESS_HEADER_LENGTH bytes including the current byte
final byte[] firstBytes = new byte[GUESS_HEADER_LENGTH];
int read = 0;
firstBytes[0] = (byte) current;
int read = 1;
while (read < GUESS_HEADER_LENGTH) {
final int r = si.read(firstBytes, read, GUESS_HEADER_LENGTH - read);
if (r == -1) {
@ -245,6 +263,7 @@ public class XContentFactory {
} finally {
si.reset();
}
}
/**
@ -278,15 +297,17 @@ public class XContentFactory {
if (first == '{') {
return XContentType.JSON;
}
if (length > 2 && first == SmileConstants.HEADER_BYTE_1 && bytes[offset + 1] == SmileConstants.HEADER_BYTE_2 &&
bytes[offset + 2] == SmileConstants.HEADER_BYTE_3) {
if (length > 2
&& first == SmileConstants.HEADER_BYTE_1
&& bytes[offset + 1] == SmileConstants.HEADER_BYTE_2
&& bytes[offset + 2] == SmileConstants.HEADER_BYTE_3) {
return XContentType.SMILE;
}
if (length > 2 && first == '-' && bytes[offset + 1] == '-' && bytes[offset + 2] == '-') {
return XContentType.YAML;
}
// CBOR logic similar to CBORFactory#hasCBORFormat
if (first == CBORConstants.BYTE_OBJECT_INDEFINITE && length > 1){
if (first == CBORConstants.BYTE_OBJECT_INDEFINITE && length > 1) {
return XContentType.CBOR;
}
if (CBORConstants.hasMajorType(CBORConstants.MAJOR_TYPE_TAG, first) && length > 2) {

View File

@ -28,6 +28,7 @@ import org.elasticsearch.test.ESTestCase;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Arrays;
import static org.hamcrest.Matchers.equalTo;
@ -54,8 +55,21 @@ public class XContentFactoryTests extends ESTestCase {
builder.field("field1", "value1");
builder.endObject();
assertThat(XContentHelper.xContentType(BytesReference.bytes(builder)), equalTo(type));
assertThat(XContentFactory.xContentType(BytesReference.bytes(builder).streamInput()), equalTo(type));
final BytesReference bytes;
if (type == XContentType.JSON && randomBoolean()) {
final int length = randomIntBetween(0, 8 * XContentFactory.GUESS_HEADER_LENGTH);
final String content = Strings.toString(builder);
final StringBuilder sb = new StringBuilder(length + content.length());
final char[] chars = new char[length];
Arrays.fill(chars, ' ');
sb.append(new String(chars)).append(content);
bytes = new BytesArray(sb.toString());
} else {
bytes = BytesReference.bytes(builder);
}
assertThat(XContentHelper.xContentType(bytes), equalTo(type));
assertThat(XContentFactory.xContentType(bytes.streamInput()), equalTo(type));
// CBOR is binary, cannot use String
if (type != XContentType.CBOR && type != XContentType.SMILE) {