From 030ed911d4073843dc76a655e6fc8066b310af46 Mon Sep 17 00:00:00 2001
From: Paul Rogers
Date: Wed, 5 Apr 2023 21:09:33 -0700
Subject: [PATCH] Temporarily revert extended table functions for Druid 26
 (#14019)

---
 docs/multi-stage-query/reference.md           | 366 ------------------
 .../apache/druid/msq/guice/MSQSqlModule.java  |   6 -
 .../druid/testsEx/msq/ITMultiStageQuery.java  |   2 +
 .../sql/calcite/CalciteIngestionDmlTest.java  |   3 +
 4 files changed, 5 insertions(+), 372 deletions(-)

diff --git a/docs/multi-stage-query/reference.md b/docs/multi-stage-query/reference.md
index d34f3e0cf9c..3204b5646ef 100644
--- a/docs/multi-stage-query/reference.md
+++ b/docs/multi-stage-query/reference.md
@@ -88,372 +88,6 @@ can precede the column list: `EXTEND (timestamp VARCHAR...)`.
 
 For more information, see [Read external data with EXTERN](concepts.md#extern).
 
-### `HTTP`, `INLINE`, `LOCALFILES` and `S3` Functions
-
-While `EXTERN` allows you to specify an external table using JSON, other table functions allow you
-to describe the external table using SQL syntax. Each function works for one specific kind of input
-source. You provide properties using SQL named arguments. The row signature is given with the
-Druid SQL `EXTEND` keyword, using SQL syntax and types.
-
-The set of table functions and formats is preliminary in this release.
-
-Function format:
-
-```sql
-SELECT
-
-FROM TABLE(
-  http(
-    userName => 'bob',
-    password => 'secret',
-    uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
-    format => 'csv'
-    )
-  ) EXTEND (x VARCHAR, y VARCHAR, z BIGINT)
-```
-
-For each function, you provide:
-
-* The function name, which indicates the kind of input source: `http`, `inline`, `localfiles` or `s3`.
-* The function arguments, which correspond to a subset of the JSON fields for that input source.
-* A `format` argument to indicate the desired input format.
-* Additional arguments required for the selected format type.
-
-Note that the `EXTEND` keyword is optional. The following is equally valid (and perhaps
-more convenient):
-
-```sql
-SELECT
-
-FROM TABLE(
-  http(
-    userName => 'bob',
-    password => 'secret',
-    uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
-    format => 'csv'
-    )
-  ) (x VARCHAR, y VARCHAR, z BIGINT)
-```
-
-#### Function Arguments
-
-These table functions are intended for use with the SQL by-name argument syntax
-as shown above. Because the functions include all parameters for all formats,
-using positional calls is both cumbersome and error-prone.
-
-Function argument names are generally the same as the JSON field names, except
-as noted below. Each argument has a SQL type which matches the JSON type. For
-arguments that take a string list in JSON, use the SQL `ARRAY[...]` syntax,
-as shown in the above example.
-
-Array parameters are good candidates for use in parameterized queries. That is:
-
-```sql
-SELECT
-
-FROM TABLE(
-  http(
-    userName => 'bob',
-    password => 'secret',
-    uris => ?,
-    format => 'csv'
-    )
-  ) (x VARCHAR, y VARCHAR, z BIGINT)
-```
-
-Provide the list of URIs (in this case) as a query parameter with each ingestion. Doing
-so is simpler than writing a script to insert the array into the SQL text.
-
-#### `HTTP` Function
-
-The `HTTP` table function represents the
-[HTTP input source](../ingestion/native-batch-input-source.md#http-input-source),
-which reads from an HTTP server. The function accepts the following arguments:
-
-* `userName` (`VARCHAR`) - Same as JSON `httpAuthenticationUsername`.
-* `password` (`VARCHAR`) - Same as `httpAuthenticationPassword` when used with the default option.
-* `passwordEnvVar` (`VARCHAR`) - Same as `httpAuthenticationPassword` when used with
-  the `"type": "environment"` option.
-* `uris` (`ARRAY` of `VARCHAR`)
-
-#### `INLINE` Function
-
-The `INLINE` table function represents the
-[Inline input source](../ingestion/native-batch-input-source.md#inline-input-source)
-which provides data directly in the table function. Parameter:
-
-* `data` (`ARRAY` of `VARCHAR`) - Data lines, without a trailing newline, as an array.
-
-Example:
-
-```sql
-SELECT ...
-FROM TABLE(
-  inline(
-    data => ARRAY[
-        'a,b',
-        'c,d'],
-    format => 'csv'
-    )
-  ) (x VARCHAR, y VARCHAR)
-```
-
-#### `LOCALFILES` Function
-
-The `LOCALFILES` table function represents the
-[Local input source](../ingestion/native-batch-input-source.md#local-input-source) which reads
-files from the file system of the node running Druid. This is most useful for single-node
-installations. The function accepts the following parameters:
-
-* `baseDir`
-* `filter`
-* `files`
-
-When the local input source is used directly in an `extern` function or an ingestion spec, you
-can provide either `baseDir` and `filter`, or `files`, but not both. This function, however, allows
-you to provide any of the following combinations:
-
-* `baseDir` - Matches all files in the given directory. (Assumes the filter is `*`.)
-* `baseDir` and `filter` - Match files in the given directory using the filter.
-* `baseDir` and `files` - A set of files relative to `baseDir`.
-* `files` - The files should be absolute paths, else they will be computed relative to Druid's
-  working directory (usually the Druid install directory).
-
-Examples:
-
-To read all the files in `/tmp`, which must be CSV files:
-
-```sql
-SELECT ...
-FROM TABLE(
-  localfiles(
-    baseDir => '/tmp',
-    format => 'csv')
-  ) (x VARCHAR, y VARCHAR)
-```
-
-Some additional variations (omitting the common bits):
-
-```sql
-  -- CSV files in /tmp
-  localfiles(baseDir => '/tmp',
-             filter => '*.csv',
-             format => 'csv')
-
-  -- /tmp/a.csv and /tmp/b.csv
-  localfiles(baseDir => '/tmp',
-             files => ARRAY['a.csv', 'b.csv'],
-             format => 'csv')
-
-  -- /tmp/a.csv and /tmp/b.csv
-  localfiles(files => ARRAY['/tmp/a.csv', '/tmp/b.csv'],
-             format => 'csv')
-```
-
-#### `S3` Function
-
-The `S3` table function represents the
-[S3 input source](../ingestion/native-batch-input-source.md#s3-input-source) which reads
-files from an S3 bucket. The function accepts the following parameters to specify the
-objects to read:
-
-* `uris` (`ARRAY` of `VARCHAR`)
-* `prefix` (`VARCHAR`) - Corresponds to the JSON `prefixes` property, but allows a single
-  prefix.
-* `bucket` (`VARCHAR`) - Corresponds to the `bucket` field of the `objects` JSON field. SQL
-  does not have syntax for an array of objects. Instead, this function takes a single bucket,
-  and one or more objects within that bucket.
-* `paths` (`ARRAY` of `VARCHAR`) - Corresponds to the `path` fields of the `objects` JSON field.
-  All paths are within the single `bucket` parameter.
-
-The S3 input source accepts one of the following patterns:
-
-* `uris` - A list of fully-qualified object URIs.
-* `prefixes` - A list of fully-qualified "folder" prefixes.
-* `bucket` and `paths` - A list of objects relative to the given bucket path.
-
-The `S3` function also accepts the following security parameters:
-
-* `accessKeyId` (`VARCHAR`)
-* `secretAccessKey` (`VARCHAR`)
-* `assumeRoleArn` (`VARCHAR`)
-
-The `S3` table function does not support either the `clientConfig` or `proxyConfig`
-JSON properties.
-
-If you need the full power of the S3 input source, consider using the `extern`
-function, which accepts the full S3 input source serialized as JSON. Alternatively,
-create a catalog external table that has the full set of properties, leaving just the
-`uris` or `paths` to be provided at query time.
-
-Examples, each of which corresponds to an example on the
-[S3 input source](../ingestion/native-batch-input-source.md#s3-input-source) page.
-The examples omit the format and schema; however, you must provide those
-in an actual query.
-
-```sql
-SELECT ...
-FROM TABLE(S3(
-  uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
-  format => 'csv')
-  ) (x VARCHAR, y VARCHAR)
-```
-
-Additional variations, omitting the common bits:
-
-```sql
-  S3(prefixes => ARRAY['s3://foo/bar/', 's3://bar/foo/'])
-```
-
-```sql
-  -- Not an exact match for the JSON example: the S3 function allows
-  -- only one bucket.
-  S3(bucket => 's3://foo',
-     paths => ARRAY['bar/file1.json', 'foo/file2.json'])
-```
-
-```sql
-  S3(uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
-     accessKeyId => 'KLJ78979SDFdS2',
-     secretAccessKey => 'KLS89s98sKJHKJKJH8721lljkd')
-```
-
-```sql
-  S3(uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
-     accessKeyId => 'KLJ78979SDFdS2',
-     secretAccessKey => 'KLS89s98sKJHKJKJH8721lljkd',
-     assumeRoleArn => 'arn:aws:iam::2981002874992:role/role-s3')
-```
-
-#### Input Format
-
-Each of the table functions above requires that you specify a format using the `format`
-parameter, which accepts the same format names used for `EXTERN`, as described
-for [each input source](../ingestion/native-batch-input-source.md).
-
-#### CSV Format
-
-The `csv` format selects the [CSV input format](../ingestion/data-formats.md#csv).
-Parameters:
-
-* `listDelimiter` (`VARCHAR`)
-* `skipHeaderRows` (`BOOLEAN`)
-
-Example of a CSV format with a list delimiter, where we want to skip the first
-input row:
-
-```sql
-SELECT ...
-FROM TABLE(
-  inline(
-    data => ARRAY[
-        'skip me',
-        'a;foo,b',
-        'c;bar,d'],
-    format => 'csv',
-    listDelimiter => ';',
-    skipHeaderRows => 1
-    )
-  ) (x VARCHAR, y VARCHAR)
-```
-
-#### Delimited Text Format
-
-The `tsv` format selects the [TSV (Delimited) input format](../ingestion/data-formats.md#tsv-delimited).
-Parameters:
-
-* `delimiter` (`VARCHAR`)
-* `listDelimiter` (`VARCHAR`)
-* `skipHeaderRows` (`BOOLEAN`)
-
-Example of a pipe-separated format with a list delimiter, where we want to skip the first
-input row:
-
-```sql
-SELECT ...
-FROM TABLE(
-  inline(
-    data => ARRAY[
-        'skip me',
-        'a;foo|b',
-        'c;bar|d'],
-    format => 'tsv',
-    listDelimiter => ';',
-    skipHeaderRows => 1,
-    delimiter => '|'
-    )
-  ) (x VARCHAR, y VARCHAR)
-```
-
-#### JSON Format
-
-The `json` format selects the
-[JSON input format](../ingestion/data-formats.md#json).
-The JSON format accepts no additional parameters.
-
-Example:
-
-```sql
-SELECT ...
-FROM TABLE(
-  inline(
-    data => ARRAY['{"x": "foo", "y": "bar"}'],
-    format => 'json')
-  ) (x VARCHAR, y VARCHAR)
-```
-
-The JSON function allows columns to be of type `TYPE('COMPLEX<json>')`, which indicates that the column contains
-some form of complex JSON: a JSON object, a JSON array, or an array of JSON objects or arrays.
-Note that the case must exactly match that given: upper case `COMPLEX`, lower case `json`.
-The SQL type simply names a native Druid type. However, the actual
-segment column produced may be of some other type if Druid infers that it can use a simpler type
-instead.
-
-### Parameters
-
-Starting with the Druid 26.0 release, you can use query parameters with MSQ queries. You may find
-that you periodically ingest a new set of files into Druid. Often, the bulk of the query is identical
-for each ingestion: only the list of files (or URIs or objects) changes. For example, for the `S3`
-input source, you will likely ingest from the same bucket with the same security setup in
-each query; only the specific objects will change. Consider using a query parameter
-to pass the object names:
-
-```sql
-INSERT INTO ...
-SELECT ...
-FROM TABLE(S3(bucket => 's3://foo',
-              accessKeyId => ?,
-              paths => ?,
-              format => 'json'))
-     (a VARCHAR, b BIGINT, ...)
-```
-
-This same technique can be used with the `uris` or `prefixes` parameters instead.
-
-Function arguments that take an array parameter require an array value in your JSON request.
-For example:
-
-```json
-{
-  "query" : "INSERT INTO ...
-SELECT ...
-FROM TABLE(S3(bucket => 's3://foo',
-              accessKeyId => ?,
-              paths => ?,
-              format => 'json'))
-     (a VARCHAR, b BIGINT, ...)",
-  "parameters": [
-    { "type": "VARCHAR", "value": "ABCD-EF01"},
-    { "type": "VARCHAR", "value": [
-      "foo.csv", "bar.csv"
-    ] }
-  ]
-}
-```
-
-The `type` in the above example is the type of each array element. It must be `VARCHAR` for all the array
-parameters for functions described on this page.
-
 ### `INSERT`
 
 Use the `INSERT` statement to insert data.
diff --git a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQSqlModule.java b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQSqlModule.java
index 5335e40b0e9..8e381e50bd0 100644
--- a/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQSqlModule.java
+++ b/extensions-core/multi-stage-query/src/main/java/org/apache/druid/msq/guice/MSQSqlModule.java
@@ -32,9 +32,6 @@ import org.apache.druid.msq.sql.MSQTaskSqlEngine;
 import org.apache.druid.sql.SqlStatementFactory;
 import org.apache.druid.sql.SqlToolbox;
 import org.apache.druid.sql.calcite.external.ExternalOperatorConversion;
-import org.apache.druid.sql.calcite.external.HttpOperatorConversion;
-import org.apache.druid.sql.calcite.external.InlineOperatorConversion;
-import org.apache.druid.sql.calcite.external.LocalOperatorConversion;
 import org.apache.druid.sql.guice.SqlBindings;
 
 import java.util.List;
@@ -62,9 +59,6 @@ public class MSQSqlModule implements DruidModule
     // Set up the EXTERN macro.
SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class); - SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class); - SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class); - SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class); } @Provides diff --git a/integration-tests-ex/cases/src/test/java/org/apache/druid/testsEx/msq/ITMultiStageQuery.java b/integration-tests-ex/cases/src/test/java/org/apache/druid/testsEx/msq/ITMultiStageQuery.java index 8084b0e9bf6..b70329bd568 100644 --- a/integration-tests-ex/cases/src/test/java/org/apache/druid/testsEx/msq/ITMultiStageQuery.java +++ b/integration-tests-ex/cases/src/test/java/org/apache/druid/testsEx/msq/ITMultiStageQuery.java @@ -28,6 +28,7 @@ import org.apache.druid.testing.utils.MsqTestQueryHelper; import org.apache.druid.testsEx.categories.MultiStageQuery; import org.apache.druid.testsEx.config.DruidTestRunner; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.runner.RunWith; @@ -111,6 +112,7 @@ public class ITMultiStageQuery } @Test + @Ignore("localfiles() is disabled") public void testMsqIngestionAndQueryingWithLocalFn() throws Exception { String datasource = "dst"; diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteIngestionDmlTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteIngestionDmlTest.java index 9ce46185a41..50d0bc445ba 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteIngestionDmlTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteIngestionDmlTest.java @@ -154,6 +154,9 @@ public class CalciteIngestionDmlTest extends BaseCalciteQueryTest // Set up the EXTERN macro. SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class); + + // Enable the extended table functions for testing even though these + // are not enabled in production in Druid 26. SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class); SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class); SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class);