Fix content type detection with leading whitespace (#32632)

Today content type detection on an input stream works by peeking up to twenty bytes into the stream. If the stream is headed by more whitespace than twenty bytes, we might fail to detect the content type. We should be ignoring this whitespace before attempting to detect the content type. This commit does that by ignoring all leading whitespace in an input stream before attempting to guess the content type.
2018-08-06 18:07:46 -04:00 · 2018-08-06 18:07:46 -04:00 · 3fb0923182
parent b46e13629f
commit 3fb0923182
2 changed files with 45 additions and 10 deletions
--- a/libs/x-content/src/main/java/org/elasticsearch/common/xcontent/XContentFactory.java
+++ b/libs/x-content/src/main/java/org/elasticsearch/common/xcontent/XContentFactory.java
@ -35,7 +35,7 @@ import java.io.OutputStream;
 */
 public class XContentFactory {

-    private static final int GUESS_HEADER_LENGTH = 20;
+    static final int GUESS_HEADER_LENGTH = 20;

    /**
     * Returns a content builder using JSON format ({@link org.elasticsearch.common.xcontent.XContentType#JSON}.
@ -153,8 +153,10 @@ public class XContentFactory {
            return XContentType.JSON;
        }
        // Should we throw a failure here? Smile idea is to use it in bytes....
-        if (length > 2 && first == SmileConstants.HEADER_BYTE_1 && content.charAt(1) == SmileConstants.HEADER_BYTE_2 &&
-            content.charAt(2) == SmileConstants.HEADER_BYTE_3) {
+        if (length > 2
+                && first == SmileConstants.HEADER_BYTE_1
+                && content.charAt(1) == SmileConstants.HEADER_BYTE_2
+                && content.charAt(2) == SmileConstants.HEADER_BYTE_3) {
            return XContentType.SMILE;
        }
        if (length > 2 && first == '-' && content.charAt(1) == '-' && content.charAt(2) == '-') {
@ -227,13 +229,29 @@ public class XContentFactory {
     */
    @Deprecated
    public static XContentType xContentType(InputStream si) throws IOException {
+        /*
+         * We need to guess the content type. To do this, we look for the first non-whitespace character and then try to guess the content
+         * type on the GUESS_HEADER_LENGTH bytes that follow. We do this in a way that does not modify the initial read position in the
+         * underlying input stream. This is why the input stream must support mark/reset and why we repeatedly mark the read position and
+         * reset.
+         */
        if (si.markSupported() == false) {
            throw new IllegalArgumentException("Cannot guess the xcontent type without mark/reset support on " + si.getClass());
        }
-        si.mark(GUESS_HEADER_LENGTH);
+        si.mark(Integer.MAX_VALUE);
        try {
+            // scan until we find the first non-whitespace character or the end of the stream
+            int current;
+            do {
+                current = si.read();
+                if (current == -1) {
+                    return null;
+                }
+            } while (Character.isWhitespace((char) current));
+            // now guess the content type off the next GUESS_HEADER_LENGTH bytes including the current byte
            final byte[] firstBytes = new byte[GUESS_HEADER_LENGTH];
-            int read = 0;
+            firstBytes[0] = (byte) current;
+            int read = 1;
            while (read < GUESS_HEADER_LENGTH) {
                final int r = si.read(firstBytes, read, GUESS_HEADER_LENGTH - read);
                if (r == -1) {
@ -245,6 +263,7 @@ public class XContentFactory {
        } finally {
            si.reset();
        }
+
    }

    /**
@ -278,8 +297,10 @@ public class XContentFactory {
        if (first == '{') {
            return XContentType.JSON;
        }
-        if (length > 2 && first == SmileConstants.HEADER_BYTE_1 && bytes[offset + 1] == SmileConstants.HEADER_BYTE_2 &&
-            bytes[offset + 2] == SmileConstants.HEADER_BYTE_3) {
+        if (length > 2
+                && first == SmileConstants.HEADER_BYTE_1
+                && bytes[offset + 1] == SmileConstants.HEADER_BYTE_2
+                && bytes[offset + 2] == SmileConstants.HEADER_BYTE_3) {
            return XContentType.SMILE;
        }
        if (length > 2 && first == '-' && bytes[offset + 1] == '-' && bytes[offset + 2] == '-') {
--- a/server/src/test/java/org/elasticsearch/common/xcontent/XContentFactoryTests.java
+++ b/server/src/test/java/org/elasticsearch/common/xcontent/XContentFactoryTests.java
@ -28,6 +28,7 @@ import org.elasticsearch.test.ESTestCase;

 import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.util.Arrays;

 import static org.hamcrest.Matchers.equalTo;

@ -54,8 +55,21 @@ public class XContentFactoryTests extends ESTestCase {
        builder.field("field1", "value1");
        builder.endObject();

-        assertThat(XContentHelper.xContentType(BytesReference.bytes(builder)), equalTo(type));
-        assertThat(XContentFactory.xContentType(BytesReference.bytes(builder).streamInput()), equalTo(type));
+        final BytesReference bytes;
+        if (type == XContentType.JSON && randomBoolean()) {
+            final int length = randomIntBetween(0, 8 * XContentFactory.GUESS_HEADER_LENGTH);
+            final String content = Strings.toString(builder);
+            final StringBuilder sb = new StringBuilder(length + content.length());
+            final char[] chars = new char[length];
+            Arrays.fill(chars, ' ');
+            sb.append(new String(chars)).append(content);
+            bytes = new BytesArray(sb.toString());
+        } else {
+            bytes = BytesReference.bytes(builder);
+        }
+
+        assertThat(XContentHelper.xContentType(bytes), equalTo(type));
+        assertThat(XContentFactory.xContentType(bytes.streamInput()), equalTo(type));

        // CBOR is binary, cannot use String
        if (type != XContentType.CBOR && type != XContentType.SMILE) {