SQL: Add option to provide the delimiter for the CSV format (#59907) (#60420)

* SQL: Add option to provide the delimiter for the CSV format (#59907) * Add option to provide the delimiter to the CSV fmt This adds the option to provide the desired character as the separator for the CSV format (the default remains comma). A set of characters are excluded though - like CR, LF, `"` - to avoid slipping onto the CSV-dialects slope. The tab is also forbidden; the user needs to choose the "tsv" format explicitly. Update the doc to make it clear that the textual CSV, TSV and TXT formats pass the cursor back to the user through the Cursor HTTP header. (cherry picked from commit 3a8b00cc7480f7ada57fcea3cbac957facac08fc) * Java8 fixes - replace Set#of(); - URLDecoder#decode() requires a string (vs a charset) as 2nd arg.
2020-07-29 21:40:11 +02:00 · 2020-07-29 21:40:11 +02:00 · 8c22adc447
parent 30610d962a
commit 8c22adc447
5 changed files with 200 additions and 52 deletions
--- a/docs/reference/sql/endpoints/rest.asciidoc
+++ b/docs/reference/sql/endpoints/rest.asciidoc
@ -15,7 +15,7 @@
 === Overview

 The SQL REST API accepts SQL in a JSON document, executes it,
-and returns the results. 
+and returns the results.
 For example:

 [source,console]
@ -106,6 +106,10 @@ s|Description

 |===

+The `CSV` format accepts a formatting URL query attribute, `delimiter`, which indicates which character should be used to separate the CSV
+values. It defaults to comma (`,`) and cannot take any of the following values: double quote (`"`), carriage-return (`\r`) and new-line (`\n`).
+The tab (`\t`) cannot be used either; use the `tsv` format instead.
+
 Here are some examples for the human readable formats:

 ==== CSV
@ -120,7 +124,7 @@ POST /_sql?format=csv
 --------------------------------------------------
 // TEST[setup:library]

-Which returns:
+which returns:

 [source,text]
 --------------------------------------------------
@ -133,6 +137,31 @@ James S.A. Corey,Leviathan Wakes,561,2011-06-02T00:00:00.000Z
 --------------------------------------------------
 // TESTRESPONSE[non_json]

+or:
+
+[source,console]
+--------------------------------------------------
+POST /_sql?format=csv&delimiter=%3b
+{
+    "query": "SELECT * FROM library ORDER BY page_count DESC",
+    "fetch_size": 5
+}
+--------------------------------------------------
+// TEST[setup:library]
+
+which returns:
+
+[source,text]
+--------------------------------------------------
+author;name;page_count;release_date
+Peter F. Hamilton;Pandora's Star;768;2004-03-02T00:00:00.000Z
+Vernor Vinge;A Fire Upon the Deep;613;1992-06-01T00:00:00.000Z
+Frank Herbert;Dune;604;1965-06-01T00:00:00.000Z
+Alastair Reynolds;Revelation Space;585;2000-03-15T00:00:00.000Z
+James S.A. Corey;Leviathan Wakes;561;2011-06-02T00:00:00.000Z
+--------------------------------------------------
+// TESTRESPONSE[non_json]
+
 ==== JSON

 [source,console]
@ -210,7 +239,7 @@ Which returns:

 [source,text]
 --------------------------------------------------
-     author      |        name        |  page_count   |      release_date      
+     author      |        name        |  page_count   |      release_date
 -----------------+--------------------+---------------+------------------------
 Peter F. Hamilton|Pandora's Star      |768            |2004-03-02T00:00:00.000Z
 Vernor Vinge     |A Fire Upon the Deep|613            |1992-06-01T00:00:00.000Z
@ -275,8 +304,8 @@ cursor: "sDXF1ZXJ5QW5kRmV0Y2gBAAAAAAAAAAEWWWdrRlVfSS1TbDYtcW9lc1FJNmlYdw==:BAFmB
 === Paginating through a large response

 Using the example from the <<sql-rest-format,previous section>>, one can
-continue to the next page by sending back the cursor field. In case of text
-format, the cursor is returned as `Cursor` http header.
+continue to the next page by sending back the cursor field. In the case of CSV, TSV and TXT
+formats, the cursor is returned in the `Cursor` HTTP header.

 [source,console]
 --------------------------------------------------
--- a/x-pack/plugin/sql/sql-proto/src/main/java/org/elasticsearch/xpack/sql/proto/Protocol.java
+++ b/x-pack/plugin/sql/sql-proto/src/main/java/org/elasticsearch/xpack/sql/proto/Protocol.java
@ -50,15 +50,21 @@ public final class Protocol {
    public static final TimeValue PAGE_TIMEOUT = TimeValue.timeValueSeconds(45);
    public static final boolean FIELD_MULTI_VALUE_LENIENCY = false;
    public static final boolean INDEX_INCLUDE_FROZEN = false;
-    
+
    /*
-     * Using the Boolean object here so that SqlTranslateRequest to set this to null (since it doesn't need a "columnar" or 
+     * Using the Boolean object here so that SqlTranslateRequest can set this to null (since it doesn't need a "columnar" or
     * binary parameter).
     * See {@code SqlTranslateRequest.toXContent}
     */
    public static final Boolean COLUMNAR = Boolean.FALSE;
    public static final Boolean BINARY_COMMUNICATION = null;

+    /*
+     * URL parameters
+     */
+    public static final String URL_PARAM_FORMAT = "format";
+    public static final String URL_PARAM_DELIMITER = "delimiter";
+
    /**
     * SQL-related endpoints
     */
--- a/x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/plugin/RestSqlQueryAction.java
+++ b/x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/plugin/RestSqlQueryAction.java
@ -24,16 +24,22 @@ import org.elasticsearch.xpack.sql.proto.Protocol;

 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
+import java.util.Collections;
 import java.util.List;
+import java.util.Set;

 import static java.util.Arrays.asList;
 import static java.util.Collections.emptyList;
 import static java.util.Collections.unmodifiableList;
 import static org.elasticsearch.rest.RestRequest.Method.GET;
 import static org.elasticsearch.rest.RestRequest.Method.POST;
+import static org.elasticsearch.xpack.sql.proto.Protocol.URL_PARAM_DELIMITER;
+import static org.elasticsearch.xpack.sql.proto.Protocol.URL_PARAM_FORMAT;

 public class RestSqlQueryAction extends BaseRestHandler {

+    TextFormat textFormat;
+
    @Override
    public List<Route> routes() {
        return emptyList();
@ -77,7 +83,7 @@ public class RestSqlQueryAction extends BaseRestHandler {
            // enforce CBOR response for drivers and CLI (unless instructed differently through the config param)
            accept = XContentType.CBOR.name();
        } else {
-            accept = request.param("format");
+            accept = request.param(URL_PARAM_FORMAT);
        }
        if (accept == null) {
            accept = request.header("Accept");
@ -99,7 +105,7 @@ public class RestSqlQueryAction extends BaseRestHandler {
         * which we turn into a 400 error.
         */
        XContentType xContentType = accept == null ? XContentType.JSON : XContentType.fromMediaTypeOrFormat(accept);
-        TextFormat textFormat = xContentType == null ? TextFormat.fromMediaTypeOrFormat(accept) : null;
+        textFormat = xContentType == null ? TextFormat.fromMediaTypeOrFormat(accept) : null;

        if (xContentType == null && sqlRequest.columnar()) {
            throw new IllegalArgumentException("Invalid use of [columnar] argument: cannot be used in combination with "
@ -136,6 +142,11 @@ public class RestSqlQueryAction extends BaseRestHandler {
        });
    }

+    @Override
+    protected Set<String> responseParams() {
+        return textFormat == TextFormat.CSV ? Collections.singleton(URL_PARAM_DELIMITER) : Collections.emptySet();
+    }
+
    @Override
    public String getName() {
        return "sql_query";
--- a/x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/plugin/TextFormat.java
+++ b/x-pack/plugin/sql/src/main/java/org/elasticsearch/xpack/sql/plugin/TextFormat.java
@ -17,6 +17,9 @@ import org.elasticsearch.xpack.sql.session.Cursor;
 import org.elasticsearch.xpack.sql.session.Cursors;
 import org.elasticsearch.xpack.sql.util.DateUtils;

+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
 import java.time.ZoneId;
 import java.time.ZonedDateTime;
 import java.util.List;
@ -25,6 +28,7 @@ import java.util.Objects;
 import java.util.function.Function;

 import static org.elasticsearch.xpack.sql.action.BasicFormatter.FormatOption.TEXT;
+import static org.elasticsearch.xpack.sql.proto.Protocol.URL_PARAM_DELIMITER;

 /**
 * Templating class for displaying SQL responses in text formats.
@ -80,16 +84,16 @@ enum TextFormat {

        @Override
        String shortName() {
-            return "txt";
+            return FORMAT_TEXT;
        }

        @Override
        String contentType() {
-            return "text/plain";
+            return CONTENT_TYPE_TXT;
        }

        @Override
-        protected String delimiter() {
+        protected Character delimiter() {
            throw new UnsupportedOperationException();
        }

@ -109,40 +113,68 @@ enum TextFormat {
     *
     */
    CSV() {
-
        @Override
-        protected String delimiter() {
-            return ",";
+        protected Character delimiter() {
+            return ',';
        }

        @Override
        protected String eol() {
-            //LFCR
+            //CRLF
            return "\r\n";
        }

        @Override
        String shortName() {
-            return "csv";
+            return FORMAT_CSV;
        }

        @Override
        String contentType() {
-            return "text/csv";
+            return CONTENT_TYPE_CSV;
        }

        @Override
        String contentType(RestRequest request) {
-            return contentType() + "; charset=utf-8; header=" + (hasHeader(request) ? "present" : "absent");
+            return contentType() + "; charset=utf-8; " +
+                URL_PARAM_HEADER + "=" + (hasHeader(request) ? PARAM_HEADER_PRESENT : PARAM_HEADER_ABSENT);
        }

        @Override
-        String maybeEscape(String value) {
+        protected Character delimiter(RestRequest request) {
+            String delimiterParam = request.param(URL_PARAM_DELIMITER);
+            if (delimiterParam == null) {
+                return delimiter();
+            }
+            try {
+                delimiterParam = URLDecoder.decode(delimiterParam, StandardCharsets.UTF_8.toString());
+            } catch (UnsupportedEncodingException uee) {
+                throw new IllegalArgumentException("delimiter [" + delimiterParam + "] cannot be decoded: " + uee.getMessage(), uee);
+            }
+            if (delimiterParam.length() != 1) {
+                throw new IllegalArgumentException("invalid " +
+                    (delimiterParam.length() > 0 ? "multi-character" : "empty") + " delimiter [" + delimiterParam + "]");
+            }
+            Character delimiter = delimiterParam.charAt(0);
+            switch (delimiter) {
+                case '"':
+                case '\n':
+                case '\r':
+                    throw new IllegalArgumentException("illegal reserved character specified as delimiter [" + delimiter + "]");
+                case '\t':
+                    throw new IllegalArgumentException("illegal delimiter [TAB] specified as delimiter for the [csv] format; " +
+                        "choose the [tsv] format instead");
+            }
+            return delimiter;
+        }
+
+        @Override
+        String maybeEscape(String value, Character delimiter) {
            boolean needsEscaping = false;

            for (int i = 0; i < value.length(); i++) {
                char c = value.charAt(i);
-                if (c == '"' || c == ',' || c == '\n' || c == '\r') {
+                if (c == '"' || c == '\n' || c == '\r' || c == delimiter) {
                    needsEscaping = true;
                    break;
                }
@ -162,20 +194,21 @@ enum TextFormat {
                sb.append('"');
                value = sb.toString();
            }
+
            return value;
        }

        @Override
        boolean hasHeader(RestRequest request) {
-            String header = request.param("header");
+            String header = request.param(URL_PARAM_HEADER);
            if (header == null) {
                List<String> values = request.getAllHeaderValues("Accept");
                if (values != null) {
-                    // header is a parameter specified by ; so try breaking it down
+                    // header values are separated by `;` so try breaking it down
                    for (String value : values) {
                        String[] params = Strings.tokenizeToStringArray(value, ";");
                        for (String param : params) {
-                            if (param.toLowerCase(Locale.ROOT).equals("header=absent")) {
+                            if (param.toLowerCase(Locale.ROOT).equals(URL_PARAM_HEADER + "=" + PARAM_HEADER_ABSENT)) {
                                return false;
                            }
                        }
@ -183,31 +216,31 @@ enum TextFormat {
                }
                return true;
            } else {
-                return !header.toLowerCase(Locale.ROOT).equals("absent");
+                return !header.toLowerCase(Locale.ROOT).equals(PARAM_HEADER_ABSENT);
            }
        }
    },

    TSV() {
        @Override
-        protected String delimiter() {
-            return "\t";
+        protected Character delimiter() {
+            return '\t';
        }

        @Override
        protected String eol() {
-            // only CR
+            // only LF
            return "\n";
        }

        @Override
        String shortName() {
-            return "tsv";
+            return FORMAT_TSV;
        }

        @Override
        String contentType() {
-            return "text/tab-separated-values";
+            return CONTENT_TYPE_TSV;
        }

        @Override
@ -216,7 +249,7 @@ enum TextFormat {
        }

        @Override
-        String maybeEscape(String value) {
+        String maybeEscape(String value, Character __) {
            StringBuilder sb = new StringBuilder();

            for (int i = 0; i < value.length(); i++) {
@ -237,17 +270,27 @@ enum TextFormat {
        }
    };

+    private static final String FORMAT_TEXT = "txt";
+    private static final String FORMAT_CSV = "csv";
+    private static final String FORMAT_TSV = "tsv";
+    private static final String CONTENT_TYPE_TXT = "text/plain";
+    private static final String CONTENT_TYPE_CSV = "text/csv";
+    private static final String CONTENT_TYPE_TSV = "text/tab-separated-values";
+    private static final String URL_PARAM_HEADER = "header";
+    private static final String PARAM_HEADER_ABSENT = "absent";
+    private static final String PARAM_HEADER_PRESENT = "present";

    String format(RestRequest request, SqlQueryResponse response) {
        StringBuilder sb = new StringBuilder();

        // if the header is requested (and the column info is present - namely it's the first page) return the info
        if (hasHeader(request) && response.columns() != null) {
-            row(sb, response.columns(), ColumnInfo::name);
+            row(sb, response.columns(), ColumnInfo::name, delimiter(request));
        }

        for (List<Object> row : response.rows()) {
-            row(sb, row, f -> f instanceof ZonedDateTime ? DateUtils.toString((ZonedDateTime) f) : Objects.toString(f, StringUtils.EMPTY));
+            row(sb, row, f -> f instanceof ZonedDateTime ? DateUtils.toString((ZonedDateTime) f) : Objects.toString(f, StringUtils.EMPTY),
+                delimiter(request));
        }

        return sb.toString();
@ -292,11 +335,11 @@ enum TextFormat {
    }

    // utility method for consuming a row.
-    <F> void row(StringBuilder sb, List<F> row, Function<F, String> toString) {
+    <F> void row(StringBuilder sb, List<F> row, Function<F, String> toString, Character delimiter) {
        for (int i = 0; i < row.size(); i++) {
-            sb.append(maybeEscape(toString.apply(row.get(i))));
+            sb.append(maybeEscape(toString.apply(row.get(i)), delimiter));
            if (i < row.size() - 1) {
-                sb.append(delimiter());
+                sb.append(delimiter);
            }
        }
        sb.append(eol());
@ -305,7 +348,11 @@ enum TextFormat {
    /**
     * Delimiter between fields
     */
-    protected abstract String delimiter();
+    protected abstract Character delimiter();
+
+    protected Character delimiter(RestRequest request) {
+        return delimiter();
+    }

    /**
     * String indicating end-of-line or row.
@ -315,7 +362,7 @@ enum TextFormat {
    /**
     * Method used for escaping (if needed) a given value.
     */
-    String maybeEscape(String value) {
+    String maybeEscape(String value, Character delimiter) {
        return value;
    }
 }
--- a/x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/plugin/TextFormatTests.java
+++ b/x-pack/plugin/sql/src/test/java/org/elasticsearch/xpack/sql/plugin/TextFormatTests.java
@ -15,7 +15,9 @@ import org.elasticsearch.xpack.sql.proto.ColumnInfo;
 import org.elasticsearch.xpack.sql.proto.Mode;

 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.stream.Collectors;

 import static java.util.Arrays.asList;
 import static java.util.Collections.emptyList;
@ -52,7 +54,7 @@ public class TextFormatTests extends ESTestCase {
    }

    public void testCsvContentTypeWithoutHeader() {
-        assertEquals("text/csv; charset=utf-8; header=absent", CSV.contentType(reqNoHeader()));
+        assertEquals("text/csv; charset=utf-8; header=absent", CSV.contentType(reqWithParam("header", "absent")));
    }

    public void testTsvContentType() {
@ -60,19 +62,20 @@ public class TextFormatTests extends ESTestCase {
    }

    public void testCsvEscaping() {
-        assertEquals("string", CSV.maybeEscape("string"));
-        assertEquals("", CSV.maybeEscape(""));
-        assertEquals("\"\"\"\"", CSV.maybeEscape("\""));
-        assertEquals("\"\"\",\"\"\"", CSV.maybeEscape("\",\""));
-        assertEquals("\"\"\"quo\"\"ted\"\"\"", CSV.maybeEscape("\"quo\"ted\""));
+        assertEquals("string", CSV.maybeEscape("string", CSV.delimiter()));
+        assertEquals("", CSV.maybeEscape("", CSV.delimiter()));
+        assertEquals("\"\"\"\"", CSV.maybeEscape("\"", CSV.delimiter()));
+        assertEquals("\"\"\",\"\"\"", CSV.maybeEscape("\",\"", CSV.delimiter()));
+        assertEquals("\"\"\"quo\"\"ted\"\"\"", CSV.maybeEscape("\"quo\"ted\"", CSV.delimiter()));
+        assertEquals("\"one;two\"", CSV.maybeEscape("one;two", ';'));
    }

    public void testTsvEscaping() {
-        assertEquals("string", TSV.maybeEscape("string"));
-        assertEquals("", TSV.maybeEscape(""));
-        assertEquals("\"", TSV.maybeEscape("\""));
-        assertEquals("\\t", TSV.maybeEscape("\t"));
-        assertEquals("\\n\"\\t", TSV.maybeEscape("\n\"\t"));
+        assertEquals("string", TSV.maybeEscape("string", null));
+        assertEquals("", TSV.maybeEscape("", null));
+        assertEquals("\"", TSV.maybeEscape("\"", null));
+        assertEquals("\\t", TSV.maybeEscape("\t", null));
+        assertEquals("\\n\"\\t", TSV.maybeEscape("\n\"\t", null));
    }

    public void testCsvFormatWithEmptyData() {
@ -90,7 +93,32 @@ public class TextFormatTests extends ESTestCase {
        assertEquals("string,number\r\n" +
                "Along The River Bank,708\r\n" +
                "Mind Train,280\r\n",
-                text);
+            text);
+    }
+
+    public void testCsvFormatNoHeaderWithRegularData() {
+        String text = CSV.format(reqWithParam("header", "absent"), regularData());
+        assertEquals("Along The River Bank,708\r\n" +
+                "Mind Train,280\r\n",
+            text);
+    }
+
+    public void testCsvFormatWithCustomDelimiterRegularData() {
+        List<Character> forbidden = Arrays.asList('"', '\r', '\n', '\t');
+        Character delim = randomValueOtherThanMany(forbidden::contains, () -> randomAlphaOfLength(1).charAt(0));
+        String text = CSV.format(reqWithParam("delimiter", String.valueOf(delim)), regularData());
+        List<String> terms = Arrays.asList("string", "number", "Along The River Bank", "708", "Mind Train", "280");
+        List<String> expectedTerms = terms.stream()
+            .map(x -> x.contains(String.valueOf(delim)) ? '"' + x + '"' : x)
+            .collect(Collectors.toList());
+        StringBuffer sb = new StringBuffer();
+        do {
+            sb.append(expectedTerms.remove(0));
+            sb.append(delim);
+            sb.append(expectedTerms.remove(0));
+            sb.append("\r\n");
+        } while (expectedTerms.size() > 0);
+        assertEquals(sb.toString(), text);
    }

    public void testTsvFormatWithRegularData() {
@ -106,6 +134,14 @@ public class TextFormatTests extends ESTestCase {
        assertEquals("first,\"\"\"special\"\"\"\r\n" +
                "normal,\"\"\"quo\"\"ted\"\",\n\"\r\n" +
                "commas,\"a,b,c,\n,d,e,\t\n\"\r\n"
+            , text);
+    }
+
+    public void testCsvFormatWithCustomDelimiterEscapedData() {
+        String text = CSV.format(reqWithParam("delimiter", "\\"), escapedData());
+        assertEquals("first\\\"\"\"special\"\"\"\r\n" +
+                "normal\\\"\"\"quo\"\"ted\"\",\n\"\r\n" +
+                "commas\\\"a,b,c,\n,d,e,\t\n\"\r\n"
                , text);
    }

@ -117,6 +153,25 @@ public class TextFormatTests extends ESTestCase {
                , text);
    }

+    public void testInvalidCsvDelims() {
+        List<String> invalid = Arrays.asList("\"", "\r", "\n", "\t", "", "ab");
+
+        for (String c: invalid) {
+            Exception e = expectThrows(IllegalArgumentException.class,
+                () -> CSV.format(reqWithParam("delimiter", c), emptyData()));
+            String msg;
+            if (c.length() == 1) {
+                msg = c.equals("\t")
+                    ? "illegal delimiter [TAB] specified as delimiter for the [csv] format; choose the [tsv] format instead"
+                    : "illegal reserved character specified as delimiter [" + c + "]";
+            } else {
+                msg = "invalid " + (c.length() > 0 ? "multi-character" : "empty") + " delimiter [" + c + "]";
+            }
+            assertEquals(msg, e.getMessage());
+        }
+    }
+
+
    private static SqlQueryResponse emptyData() {
        return new SqlQueryResponse(null, Mode.JDBC, false, singletonList(new ColumnInfo("index", "name", "keyword")), emptyList());
    }
@ -153,7 +208,7 @@ public class TextFormatTests extends ESTestCase {
        return new FakeRestRequest();
    }

-    private static RestRequest reqNoHeader() {
-        return new FakeRestRequest.Builder(NamedXContentRegistry.EMPTY).withParams(singletonMap("header", "absent")).build();
+    private static RestRequest reqWithParam(String paramName, String paramVal) {
+        return new FakeRestRequest.Builder(NamedXContentRegistry.EMPTY).withParams(singletonMap(paramName, paramVal)).build();
    }
 }