Temporarily revert extended table functions for Druid 26 (#14019)

Paul Rogers 2023-04-05 21:09:33 -07:00 committed by GitHub
parent b98eed8fb8
commit 030ed911d4
4 changed files with 5 additions and 372 deletions


@@ -88,372 +88,6 @@ can precede the column list: `EXTEND (timestamp VARCHAR...)`.
For more information, see [Read external data with EXTERN](concepts.md#extern).
### `HTTP`, `INLINE`, `LOCALFILES` and `S3` Functions
While `EXTERN` allows you to specify an external table using JSON, other table functions allow you
to describe the external table using SQL syntax. Each function works for one specific kind of input
source. You provide properties using SQL named arguments, and you give the row signature with the
Druid SQL `EXTEND` keyword, using SQL column names and types.
The set of table functions and formats is preliminary in this release.
Function format:
```sql
SELECT
<column>
FROM TABLE(
http(
userName => 'bob',
password => 'secret',
uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
format => 'csv'
)
) EXTEND (x VARCHAR, y VARCHAR, z BIGINT)
```
For each function:
* The function name indicates the kind of input source: `http`, `inline`, or `localfiles`.
* The function arguments correspond to a subset of the JSON fields for that input source.
* A `format` argument indicates the desired input format.
* Additional arguments provide any properties required for the selected format.
Note that the `EXTEND` keyword is optional. The following is equally valid (and perhaps
more convenient):
```sql
SELECT
<column>
FROM TABLE(
http(
userName => 'bob',
password => 'secret',
uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
format => 'csv'
)
) (x VARCHAR, y VARCHAR, z BIGINT)
```
#### Function Arguments
These table functions are intended for use with the SQL by-name argument syntax
as shown above. Because the functions include all parameters for all formats,
using positional calls is both cumbersome and error-prone.
Function argument names are generally the same as the JSON field names, except
as noted below. Each argument has a SQL type that matches the JSON type. For
arguments that take a string list in JSON, use the SQL `ARRAY[...]` syntax, as
shown in the example above.
Array parameters are good candidates for use in parameterized queries. That is:
```sql
SELECT
<column>
FROM TABLE(
http(
userName => 'bob',
password => 'secret',
uris => ?,
format => 'csv'
)
) (x VARCHAR, y VARCHAR, z BIGINT)
```
Provide the list of URIs (in this case) as a query parameter in each ingest. Doing
so is simpler than writing a script to insert the array into the SQL text.
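For example, a sketch of the JSON payload for the SQL API that binds the `uris` array through the
`parameters` field; the request structure mirrors the example in the Parameters section below, and the
URIs are the hypothetical ones used above:
```json
{
  "query": "INSERT INTO ... SELECT ... FROM TABLE(http(userName => 'bob', password => 'secret', uris => ?, format => 'csv')) (x VARCHAR, y VARCHAR, z BIGINT)",
  "parameters": [
    { "type": "VARCHAR", "value": ["http://example.com/foo.csv", "http://example.com/bar.csv"] }
  ]
}
```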
#### `HTTP` Function
The `HTTP` table function represents the
[HTTP input source](../ingestion/native-batch-input-source.md#http-input-source)
to read from an HTTP server. The function accepts the following arguments:
* `userName` (`VARCHAR`) - Same as the JSON `httpAuthenticationUsername`.
* `password` (`VARCHAR`) - Same as `httpAuthenticationPassword` when used with the default option.
* `passwordEnvVar` (`VARCHAR`) - Same as `httpAuthenticationPassword` when used with
the `"type": "environment"` option.
* `uris` (`ARRAY` of `VARCHAR`)
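For example, a minimal sketch that reads the same hypothetical CSV files as above, but takes the
password from an environment variable (the variable name `HTTP_SOURCE_PW` is made up for illustration):
```sql
SELECT
  x, y, z
FROM TABLE(
  http(
    userName => 'bob',
    passwordEnvVar => 'HTTP_SOURCE_PW',
    uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
    format => 'csv'
  )
) EXTEND (x VARCHAR, y VARCHAR, z BIGINT)
```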
#### `INLINE` Function
The `INLINE` table function represents the
[Inline input source](../ingestion/native-batch-input-source.md#inline-input-source)
which provides data directly in the table function. Parameter:
* `data` (`ARRAY` of `VARCHAR`) - Data lines, without a trailing newline, as an array.
Example:
```sql
SELECT ...
FROM TABLE(
inline(
data => ARRAY[
'a,b',
'c,d'],
format => 'csv'
)
) (x VARCHAR, y VARCHAR)
```
#### `LOCALFILES` Function
The `LOCALFILES` table function represents the
[Local input source](../ingestion/native-batch-input-source.md#local-input-source) which reads
files from the file system of the node running Druid. This is most useful for single-node
installations. The function accepts the following parameters:
* `baseDir`
* `filter`
* `files`
When the local files input source is used directly in an `extern` function or in an ingestion spec, you
can provide either `baseDir` and `filter` or `files`, but not both. This function, however, allows
you to provide any of the following combinations:
* `baseDir` - Matches all files in the given directory. (Assumes the filter is `*`.)
* `baseDir` and `filter` - Match files in the given directory using the filter.
* `baseDir` and `files` - A set of files relative to `baseDir`.
* `files` - The files should be absolute paths; otherwise, they are resolved relative to Druid's
working directory (usually the Druid install directory).
Examples:
To read all files in `/tmp`, which must be CSV files:
```sql
SELECT ...
FROM TABLE(
localfiles(
baseDir => '/tmp',
format => 'csv')
) (x VARCHAR, y VARCHAR)
```
Some additional variations (omitting the common bits):
```sql
-- CSV files in /tmp
localfiles(baseDir => '/tmp',
filter => '*.csv',
format => 'csv')
-- /tmp/a.csv and /tmp/b.csv
localfiles(baseDir => '/tmp',
files => ARRAY['a.csv', 'b.csv'],
format => 'csv')
-- /tmp/a.csv and /tmp/b.csv
localfiles(files => ARRAY['/tmp/a.csv', '/tmp/b.csv'],
format => 'csv')
```
#### `S3` Function
The `S3` table function represents the
[S3 input source](../ingestion/native-batch-input-source.md#s3-input-source) which reads
files from an S3 bucket. The function accepts the following parameters to specify the
objects to read:
* `uris` (`ARRAY` of `VARCHAR`)
* `prefix` (`VARCHAR`) - Corresponds to the JSON `prefixes` property, but allows a single
prefix.
* `bucket` (`VARCHAR`) - Corresponds to the `bucket` field of the `objects` JSON field. SQL
does not have syntax for an array of objects. Instead, this function takes a single bucket
and one or more objects within that bucket.
* `paths` (`ARRAY` of `VARCHAR`) - Corresponds to the `path` fields of the `objects` JSON field.
All paths are within the single `bucket` parameter.
The S3 input source accepts one of the following patterns:
* `uris` - A list of fully-qualified object URIs.
* `prefixes` - A list of fully-qualified "folder" prefixes.
* `bucket` and `paths` - A list of objects relative to the given bucket path.
The `S3` function also accepts the following security parameters:
* `accessKeyId` (`VARCHAR`)
* `secretAccessKey` (`VARCHAR`)
* `assumeRoleArn` (`VARCHAR`)
The `S3` table function does not support either the `clientConfig` or `proxyConfig`
JSON properties.
If you need the full power of the S3 input source, then consider the use of the `extern`
function, which accepts the full S3 input source serialized as JSON. Alternatively,
create a catalog external table that has the full set of properties, leaving just the
`uris` or `paths` to be provided at query time.
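As a rough sketch of the `extern` alternative, you pass the input source, input format, and row
signature each as serialized JSON; the bucket and object name here are hypothetical, and the full
syntax is described in [Read external data with EXTERN](concepts.md#extern):
```sql
SELECT
  x, y
FROM TABLE(
  extern(
    '{"type": "s3", "uris": ["s3://foo/bar/file.json"]}',
    '{"type": "json"}',
    '[{"name": "x", "type": "string"}, {"name": "y", "type": "string"}]'
  )
)
```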
The following examples each correspond to an example on the
[S3 input source](../ingestion/native-batch-input-source.md#s3-input-source) page.
Some of the examples omit the format and schema; remember to provide those
in an actual query.
```sql
SELECT ...
FROM TABLE(S3(
uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
format => 'csv')
) (x VARCHAR, y VARCHAR)
```
Additional variations, omitting the common bits:
```sql
-- Not an exact match for the JSON example: the S3 function allows
-- only one prefix.
S3(prefix => 's3://foo/bar/')
```
```sql
-- Not an exact match for the JSON example: the S3 function allows
-- only one bucket.
S3(bucket => 's3://foo',
paths => ARRAY['bar/file1.json', 'foo/file2.json'])
```
```sql
S3(uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
accessKeyId => 'KLJ78979SDFdS2',
secretAccessKey => 'KLS89s98sKJHKJKJH8721lljkd')
```
```sql
S3(uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
accessKeyId => 'KLJ78979SDFdS2',
secretAccessKey => 'KLS89s98sKJHKJKJH8721lljkd',
assumeRoleArn => 'arn:aws:iam::2981002874992:role/role-s3')
```
#### Input Format
Each of the table functions above requires that you specify a format using the `format`
parameter, which accepts the same format names used for `EXTERN` and described
for [each input source](../ingestion/native-batch-input-source.md).
#### CSV Format
The `csv` format selects the [CSV input format](../ingestion/data-formats.md#csv).
Parameters:
* `listDelimiter` (`VARCHAR`)
* `skipHeaderRows` (`BOOLEAN`)
Example of a CSV format with a list delimiter, where we want to skip the first
input row:
```sql
SELECT ...
FROM TABLE(
inline(
data => ARRAY[
'skip me',
'a;foo,b',
'c;bar,d'],
format => 'csv',
listDelimiter => ';',
skipHeaderRows => 1
)
) (x VARCHAR, y VARCHAR)
```
#### Delimited Text Format
The `tsv` format selects the [TSV (Delimited) input format](../ingestion/data-formats.md#tsv-delimited).
Parameters:
* `delimiter` (`VARCHAR`)
* `listDelimiter` (`VARCHAR`)
* `skipHeaderRows` (`BOOLEAN`)
Example of a pipe-separated format with a list delimiter, where we want to skip the first
input row:
```sql
SELECT ...
FROM TABLE(
inline(
data => ARRAY[
'skip me',
'a;foo|b',
'c;bar|d'],
format => 'tsv',
listDelimiter => ';',
skipHeaderRows => 1,
delimiter => '|'
)
) (x VARCHAR, y VARCHAR)
```
#### JSON Format
The `json` format selects the
[JSON input format](../ingestion/data-formats.md#json).
The JSON format accepts no additional parameters.
Example:
```sql
SELECT ...
FROM TABLE(
inline(
data => ARRAY['{"x": "foo", "y": "bar"}'],
format => 'json')
) (x VARCHAR, y VARCHAR)
```
When you use the `json` format, columns can be of type `TYPE('COMPLEX<json>')`, which indicates that the column contains
some form of complex JSON: a JSON object, a JSON array, or an array of JSON objects or arrays.
Note that the case must exactly match that shown: upper case `COMPLEX`, lower case `json`.
The SQL type simply names a native Druid type. However, the actual
segment column produced may be of some other type if Druid infers that it can use a simpler type
instead.
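For example, a sketch (with a made-up nested column named `attrs`) that reads a nested JSON object
into a `COMPLEX<json>` column:
```sql
SELECT ...
FROM TABLE(
  inline(
    data => ARRAY['{"x": "foo", "attrs": {"color": "red", "size": 7}}'],
    format => 'json')
) (x VARCHAR, attrs TYPE('COMPLEX<json>'))
```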
### Parameters
Starting with the Druid 26.0 release, you can use query parameters with MSQ queries. You may find
that you periodically ingest a new set of files into Druid. Often, the bulk of the query is identical
for each ingestion: only the list of files (or URIs or objects) changes. For example, for the `S3`
input source, you will likely ingest from the same bucket and security setup in
each query; only the specific objects will change. Consider using a query parameter
to pass the object names:
```sql
INSERT INTO ...
SELECT ...
FROM TABLE(S3(bucket => 's3://foo',
accessKeyId => ?,
paths => ?,
format => 'json'))
(a VARCHAR, b BIGINT, ...)
```
This same technique can be used with the `uris` or `prefix` parameters instead.
Function arguments that take an array require a JSON array as the corresponding parameter value in your request.
For example:
```json
{
"query" : "INSERT INTO ...
SELECT ...
FROM TABLE(S3(bucket => 's3://foo',
accessKeyId => ?,
paths => ?,
format => 'json'))
(a VARCHAR, b BIGINT, ...)",
"parameters": [
{ "type": "VARCHAR", "value": "ABCD-EF01"},
{ "type": "VARCHAR", "value": [
"foo.csv", "bar.csv"
] }
]
}
```
The `type` in the above example is the type of each array element. It must be `VARCHAR` for all the array
parameters for the functions described on this page.
### `INSERT`
Use the `INSERT` statement to insert data.


@@ -32,9 +32,6 @@ import org.apache.druid.msq.sql.MSQTaskSqlEngine;
import org.apache.druid.sql.SqlStatementFactory;
import org.apache.druid.sql.SqlToolbox;
import org.apache.druid.sql.calcite.external.ExternalOperatorConversion;
import org.apache.druid.sql.calcite.external.HttpOperatorConversion;
import org.apache.druid.sql.calcite.external.InlineOperatorConversion;
import org.apache.druid.sql.calcite.external.LocalOperatorConversion;
import org.apache.druid.sql.guice.SqlBindings;
import java.util.List;
@@ -62,9 +59,6 @@ public class MSQSqlModule implements DruidModule
// Set up the EXTERN macro.
SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class);
}
@Provides


@@ -28,6 +28,7 @@ import org.apache.druid.testing.utils.MsqTestQueryHelper;
import org.apache.druid.testsEx.categories.MultiStageQuery;
import org.apache.druid.testsEx.config.DruidTestRunner;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
@@ -111,6 +112,7 @@ public class ITMultiStageQuery
}
@Test
@Ignore("localfiles() is disabled")
public void testMsqIngestionAndQueryingWithLocalFn() throws Exception
{
String datasource = "dst";


@@ -154,6 +154,9 @@ public class CalciteIngestionDmlTest extends BaseCalciteQueryTest
// Set up the EXTERN macro.
SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class);
// Enable the extended table functions for testing even though these
// are not enabled in production in Druid 26.
SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class);
SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class);