Temporarily revert extended table functions for Druid 26 (#14019)

Paul Rogers 2023-04-05 21:09:33 -07:00 committed by GitHub
parent b98eed8fb8
commit 030ed911d4
4 changed files with 5 additions and 372 deletions


@@ -88,372 +88,6 @@ can precede the column list: `EXTEND (timestamp VARCHAR...)`.
For more information, see [Read external data with EXTERN](concepts.md#extern).
### `HTTP`, `INLINE`, `LOCALFILES` and `S3` Functions
While `EXTERN` allows you to specify an external table using JSON, other table functions allow you
to describe the external table using SQL syntax. Each function works for one specific kind of input
source. You provide properties using SQL named arguments, and you give the row signature with the
Druid SQL `EXTEND` keyword, using SQL column names and types.
The set of table functions and formats is preliminary in this release.
Function format:
```sql
SELECT
<column>
FROM TABLE(
http(
userName => 'bob',
password => 'secret',
uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
format => 'csv'
)
) EXTEND (x VARCHAR, y VARCHAR, z BIGINT)
```
For each function:
* The function name indicates the kind of input source: `http`, `inline`, or `localfiles`.
* The function arguments correspond to a subset of the JSON fields for that input source.
* A `format` argument indicates the desired input format.
* Additional arguments provide any properties required for the selected format.
Note that the `EXTEND` keyword is optional. The following is equally valid (and perhaps
more convenient):
```sql
SELECT
<column>
FROM TABLE(
http(
userName => 'bob',
password => 'secret',
uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
format => 'csv'
)
) (x VARCHAR, y VARCHAR, z BIGINT)
```
#### Function Arguments
These table functions are intended for use with the SQL by-name argument syntax
as shown above. Because the functions include all parameters for all formats,
using positional calls is both cumbersome and error-prone.
Function argument names are generally the same as the JSON field names, except
as noted below. Each argument has a SQL type that matches the JSON type. For
arguments that take a string list in JSON, use the SQL `ARRAY[...]` syntax, as
shown in the example above.
Array parameters are good candidates for use in parameterized queries. That is:
```sql
SELECT
<column>
FROM TABLE(
http(
userName => 'bob',
password => 'secret',
uris => ?,
format => 'csv'
)
) (x VARCHAR, y VARCHAR, z BIGINT)
```
Provide the list of URIs (in this case) as a query parameter in each ingest. Doing
so is simpler than writing a script to insert the array into the SQL text.
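For example, a sketch of the JSON payload for the SQL API that binds the `uris` array through the
`parameters` field; the request structure mirrors the example in the Parameters section below, and the
URIs are the hypothetical ones used above:
```json
{
  "query": "INSERT INTO ... SELECT ... FROM TABLE(http(userName => 'bob', password => 'secret', uris => ?, format => 'csv')) (x VARCHAR, y VARCHAR, z BIGINT)",
  "parameters": [
    { "type": "VARCHAR", "value": ["http://example.com/foo.csv", "http://example.com/bar.csv"] }
  ]
}
```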
#### `HTTP` Function
The `HTTP` table function represents the
[HTTP input source](../ingestion/native-batch-input-source.md#http-input-source)
to read from an HTTP server. The function accepts the following arguments:
* `userName` (`VARCHAR`) - Same as the JSON `httpAuthenticationUsername`.
* `password` (`VARCHAR`) - Same as `httpAuthenticationPassword` when used with the default option.
* `passwordEnvVar` (`VARCHAR`) - Same as `httpAuthenticationPassword` when used with
the `"type": "environment"` option.
* `uris` (`ARRAY` of `VARCHAR`)
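For example, a minimal sketch that reads the same hypothetical CSV files as above, but takes the
password from an environment variable (the variable name `HTTP_SOURCE_PW` is made up for illustration):
```sql
SELECT
  x, y, z
FROM TABLE(
  http(
    userName => 'bob',
    passwordEnvVar => 'HTTP_SOURCE_PW',
    uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
    format => 'csv'
  )
) EXTEND (x VARCHAR, y VARCHAR, z BIGINT)
```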
#### `INLINE` Function
The `INLINE` table function represents the
[Inline input source](../ingestion/native-batch-input-source.md#inline-input-source)
which provides data directly in the table function. Parameter:
* `data` (`ARRAY` of `VARCHAR`) - Data lines, without a trailing newline, as an array.
Example:
```sql
SELECT ...
FROM TABLE(
inline(
data => ARRAY[
'a,b',
'c,d'],
format => 'csv'
)
) (x VARCHAR, y VARCHAR)
```
#### `LOCALFILES` Function
The `LOCALFILES` table function represents the
[Local input source](../ingestion/native-batch-input-source.md#local-input-source) which reads
files from the file system of the node running Druid. This is most useful for single-node
installations. The function accepts the following parameters:
* `baseDir`
* `filter`
* `files`
When the local files input source is used directly in an `extern` function or in an ingestion spec, you
can provide either `baseDir` and `filter` or `files`, but not both. This function, however, allows
you to provide any of the following combinations:
* `baseDir` - Matches all files in the given directory. (Assumes the filter is `*`.)
* `baseDir` and `filter` - Match files in the given directory using the filter.
* `baseDir` and `files` - A set of files relative to `baseDir`.
* `files` - The files should be absolute paths; otherwise, they are resolved relative to Druid's
working directory (usually the Druid install directory).
Examples:
To read all files in `/tmp`, which must be CSV files:
```sql
SELECT ...
FROM TABLE(
localfiles(
baseDir => '/tmp',
format => 'csv')
) (x VARCHAR, y VARCHAR)
```
Some additional variations (omitting the common bits):
```sql
-- CSV files in /tmp
localfiles(baseDir => '/tmp',
filter => '*.csv',
format => 'csv')
-- /tmp/a.csv and /tmp/b.csv
localfiles(baseDir => '/tmp',
files => ARRAY['a.csv', 'b.csv'],
format => 'csv')
-- /tmp/a.csv and /tmp/b.csv
localfiles(files => ARRAY['/tmp/a.csv', '/tmp/b.csv'],
format => 'csv')
```
#### `S3` Function
The `S3` table function represents the
[S3 input source](../ingestion/native-batch-input-source.md#s3-input-source) which reads
files from an S3 bucket. The function accepts the following parameters to specify the
objects to read:
* `uris` (`ARRAY` of `VARCHAR`)
* `prefix` (`VARCHAR`) - Corresponds to the JSON `prefixes` property, but allows a single
prefix.
* `bucket` (`VARCHAR`) - Corresponds to the `bucket` field of the `objects` JSON field. SQL
does not have syntax for an array of objects. Instead, this function takes a single bucket
and one or more objects within that bucket.
* `paths` (`ARRAY` of `VARCHAR`) - Corresponds to the `path` fields of the `objects` JSON field.
All paths are within the single `bucket` parameter.
The S3 input source accepts one of the following patterns:
* `uris` - A list of fully-qualified object URIs.
* `prefixes` - A list of fully-qualified "folder" prefixes.
* `bucket` and `paths` - A list of objects relative to the given bucket path.
The `S3` function also accepts the following security parameters:
* `accessKeyId` (`VARCHAR`)
* `secretAccessKey` (`VARCHAR`)
* `assumeRoleArn` (`VARCHAR`)
The `S3` table function does not support either the `clientConfig` or `proxyConfig`
JSON properties.
If you need the full power of the S3 input source, then consider the use of the `extern`
function, which accepts the full S3 input source serialized as JSON. Alternatively,
create a catalog external table that has the full set of properties, leaving just the
`uris` or `paths` to be provided at query time.
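As a rough sketch of the `extern` alternative, you pass the input source, input format, and row
signature each as serialized JSON; the bucket and object name here are hypothetical, and the full
syntax is described in [Read external data with EXTERN](concepts.md#extern):
```sql
SELECT
  x, y
FROM TABLE(
  extern(
    '{"type": "s3", "uris": ["s3://foo/bar/file.json"]}',
    '{"type": "json"}',
    '[{"name": "x", "type": "string"}, {"name": "y", "type": "string"}]'
  )
)
```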
The following examples each correspond to an example on the
[S3 input source](../ingestion/native-batch-input-source.md#s3-input-source) page.
Some of the examples omit the format and schema; remember to provide those
in an actual query.
```sql
SELECT ...
FROM TABLE(S3(
uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
format => 'csv')
) (x VARCHAR, y VARCHAR)
```
Additional variations, omitting the common bits:
```sql
-- Not an exact match for the JSON example: the S3 function allows
-- only one prefix.
S3(prefix => 's3://foo/bar/')
```
```sql
-- Not an exact match for the JSON example: the S3 function allows
-- only one bucket.
S3(bucket => 's3://foo',
paths => ARRAY['bar/file1.json', 'foo/file2.json'])
```
```sql
S3(uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
accessKeyId => 'KLJ78979SDFdS2',
secretAccessKey => 'KLS89s98sKJHKJKJH8721lljkd')
```
```sql
S3(uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
accessKeyId => 'KLJ78979SDFdS2',
secretAccessKey => 'KLS89s98sKJHKJKJH8721lljkd',
assumeRoleArn => 'arn:aws:iam::2981002874992:role/role-s3')
```
#### Input Format
Each of the table functions above requires that you specify a format using the `format`
parameter, which accepts the same format names used for `EXTERN` and described
for [each input source](../ingestion/native-batch-input-source.md).
#### CSV Format
The `csv` format selects the [CSV input format](../ingestion/data-formats.md#csv).
Parameters:
* `listDelimiter` (`VARCHAR`)
* `skipHeaderRows` (`BOOLEAN`)
Example of a CSV format with a list delimiter, where we want to skip the first
input row:
```sql
SELECT ...
FROM TABLE(
inline(
data => ARRAY[
'skip me',
'a;foo,b',
'c;bar,d'],
format => 'csv',
listDelimiter => ';',
skipHeaderRows => 1
)
) (x VARCHAR, y VARCHAR)
```
#### Delimited Text Format
The `tsv` format selects the [TSV (Delimited) input format](../ingestion/data-formats.md#tsv-delimited).
Parameters:
* `delimiter` (`VARCHAR`)
* `listDelimiter` (`VARCHAR`)
* `skipHeaderRows` (`BOOLEAN`)
Example of a pipe-separated format with a list delimiter, where we want to skip the first
input row:
```sql
SELECT ...
FROM TABLE(
inline(
data => ARRAY[
'skip me',
'a;foo|b',
'c;bar|d'],
format => 'tsv',
listDelimiter => ';',
skipHeaderRows => 1,
delimiter => '|'
)
) (x VARCHAR, y VARCHAR)
```
#### JSON Format
The `json` format selects the
[JSON input format](../ingestion/data-formats.md#json).
The JSON format accepts no additional parameters.
Example:
```sql
SELECT ...
FROM TABLE(
inline(
data => ARRAY['{"x": "foo", "y": "bar"}'],
format => 'json')
) (x VARCHAR, y VARCHAR)
```
When you use the `json` format, columns can be of type `TYPE('COMPLEX<json>')`, which indicates that the column contains
some form of complex JSON: a JSON object, a JSON array, or an array of JSON objects or arrays.
Note that the case must exactly match that shown: upper case `COMPLEX`, lower case `json`.
The SQL type simply names a native Druid type. However, the actual
segment column produced may be of some other type if Druid infers that it can use a simpler type
instead.
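For example, a sketch (with a made-up nested column named `attrs`) that reads a nested JSON object
into a `COMPLEX<json>` column:
```sql
SELECT ...
FROM TABLE(
  inline(
    data => ARRAY['{"x": "foo", "attrs": {"color": "red", "size": 7}}'],
    format => 'json')
) (x VARCHAR, attrs TYPE('COMPLEX<json>'))
```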
### Parameters
Starting with the Druid 26.0 release, you can use query parameters with MSQ queries. You may find
that you periodically ingest a new set of files into Druid. Often, the bulk of the query is identical
for each ingestion: only the list of files (or URIs or objects) changes. For example, for the `S3`
input source, you will likely ingest from the same bucket and security setup in
each query; only the specific objects will change. Consider using a query parameter
to pass the object names:
```sql
INSERT INTO ...
SELECT ...
FROM TABLE(S3(bucket => 's3://foo',
accessKeyId => ?,
paths => ?,
format => 'json'))
(a VARCHAR, b BIGINT, ...)
```
This same technique can be used with the `uris` or `prefix` parameters instead.
Function arguments that take an array require a JSON array as the corresponding parameter value in your request.
For example:
```json
{
"query" : "INSERT INTO ...
SELECT ...
FROM TABLE(S3(bucket => 's3://foo',
accessKeyId => ?,
paths => ?,
format => 'json'))
(a VARCHAR, b BIGINT, ...)",
"parameters": [
{ "type": "VARCHAR", "value": "ABCD-EF01"},
{ "type": "VARCHAR", "value": [
"foo.csv", "bar.csv"
] }
]
}
```
The `type` in the above example is the type of each array element. It must be `VARCHAR` for all the array
parameters for the functions described on this page.
### `INSERT`
Use the `INSERT` statement to insert data.


@@ -32,9 +32,6 @@ import org.apache.druid.msq.sql.MSQTaskSqlEngine;
import org.apache.druid.sql.SqlStatementFactory;
import org.apache.druid.sql.SqlToolbox;
import org.apache.druid.sql.calcite.external.ExternalOperatorConversion;
import org.apache.druid.sql.calcite.external.HttpOperatorConversion;
import org.apache.druid.sql.calcite.external.InlineOperatorConversion;
import org.apache.druid.sql.calcite.external.LocalOperatorConversion;
import org.apache.druid.sql.guice.SqlBindings;
import java.util.List;
@@ -62,9 +59,6 @@ public class MSQSqlModule implements DruidModule
// Set up the EXTERN macro.
SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class);
}
@Provides


@@ -28,6 +28,7 @@ import org.apache.druid.testing.utils.MsqTestQueryHelper;
import org.apache.druid.testsEx.categories.MultiStageQuery;
import org.apache.druid.testsEx.config.DruidTestRunner;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
@@ -111,6 +112,7 @@ public class ITMultiStageQuery
}
@Test
@Ignore("localfiles() is disabled")
public void testMsqIngestionAndQueryingWithLocalFn() throws Exception
{
String datasource = "dst";


@@ -154,6 +154,9 @@ public class CalciteIngestionDmlTest extends BaseCalciteQueryTest
// Set up the EXTERN macro.
SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class);
// Enable the extended table functions for testing even though these
// are not enabled in production in Druid 26.
SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class);