Temporarily revert extended table functions for Druid 26 (#14019)

parent b98eed8fb8
commit 030ed911d4

@@ -88,372 +88,6 @@ can precede the column list: `EXTEND (timestamp VARCHAR...)`.

For more information, see [Read external data with EXTERN](concepts.md#extern).

### `HTTP`, `INLINE`, `LOCALFILES` and `S3` Functions

While `EXTERN` allows you to specify an external table using JSON, other table functions allow you to
describe the external table using SQL syntax. Each function works for one specific kind of input
source. You provide properties using SQL named arguments, and you give the row signature with the
Druid SQL `EXTEND` keyword, using SQL syntax and types.

The set of table functions and formats is preliminary in this release.

Function format:

```sql
SELECT
 <column>
FROM TABLE(
  http(
    userName => 'bob',
    password => 'secret',
    uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
    format => 'csv'
    )
  ) EXTEND (x VARCHAR, y VARCHAR, z BIGINT)
```

For each function, you provide:

* The function name, which indicates the kind of input source: `http`, `inline`, `localfiles` or `s3`.
* The function arguments, which correspond to a subset of the JSON fields for that input source.
* A `format` argument to indicate the desired input format.
* Additional arguments required for the selected format type.

Note that the `EXTEND` keyword is optional. The following is equally valid (and perhaps
more convenient):

```sql
SELECT
 <column>
FROM TABLE(
  http(
    userName => 'bob',
    password => 'secret',
    uris => ARRAY['http://example.com/foo.csv', 'http://example.com/bar.csv'],
    format => 'csv'
    )
  ) (x VARCHAR, y VARCHAR, z BIGINT)
```

#### Function Arguments

These table functions are intended for use with the SQL by-name argument syntax
shown above. Because the functions include all parameters for all formats,
positional calls are both cumbersome and error-prone.

Function argument names are generally the same as the JSON field names, except
as noted below. Each argument has a SQL type which matches the JSON type. For
arguments that take a string list in JSON, use the SQL `ARRAY[...]` syntax, as
shown in the example above.

Array parameters are good candidates for use in parameterized queries. That is:

```sql
SELECT
 <column>
FROM TABLE(
  http(
    userName => 'bob',
    password => 'secret',
    uris => ?,
    format => 'csv'
    )
  ) (x VARCHAR, y VARCHAR, z BIGINT)
```

Provide the list of URIs (in this case) as a query parameter in each ingest. Doing
so is simpler than writing a script to insert the array into the SQL text.

#### `HTTP` Function

The `HTTP` table function represents the
[HTTP input source](../ingestion/native-batch-input-source.md#http-input-source),
which reads from an HTTP server. The function accepts the following arguments:

* `userName` (`VARCHAR`) - Same as the JSON `httpAuthenticationUsername`.
* `password` (`VARCHAR`) - Same as `httpAuthenticationPassword` when used with the default option.
* `passwordEnvVar` (`VARCHAR`) - Same as `httpAuthenticationPassword` when used with
  the `"type": "environment"` option.
* `uris` (`ARRAY` of `VARCHAR`)
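
As an illustration, here is a minimal sketch of the same read where the password comes from an
environment variable rather than being written into the query. The variable name `HTTP_SECRET`
is just an assumption for this example; it must exist on the Druid services.

```sql
SELECT
 <column>
FROM TABLE(
  http(
    userName => 'bob',
    -- Hypothetical environment variable holding the HTTP password.
    passwordEnvVar => 'HTTP_SECRET',
    uris => ARRAY['http://example.com/foo.csv'],
    format => 'csv'
    )
  ) (x VARCHAR, y VARCHAR, z BIGINT)
```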

#### `INLINE` Function

The `INLINE` table function represents the
[Inline input source](../ingestion/native-batch-input-source.md#inline-input-source)
which provides data directly in the table function. Parameter:

* `data` (`ARRAY` of `VARCHAR`) - Data lines, without a trailing newline, as an array.

Example:

```sql
SELECT ...
FROM TABLE(
  inline(
    data => ARRAY[
        'a,b',
        'c,d'],
    format => 'csv'
    )
  ) (x VARCHAR, y VARCHAR)
```

#### `LOCALFILES` Function

The `LOCALFILES` table function represents the
[Local input source](../ingestion/native-batch-input-source.md#local-input-source) which reads
files from the file system of the node running Druid. This is most useful for single-node
installations. The function accepts the following parameters:

* `baseDir`
* `filter`
* `files`

When the local files input source is used directly in an `extern` function or an ingestion spec, you
can provide either `baseDir` and `filter`, or `files`, but not both. This function, however, allows
you to provide any of the following combinations:

* `baseDir` - Matches all files in the given directory. (Assumes the filter is `*`.)
* `baseDir` and `filter` - Matches files in the given directory using the filter.
* `baseDir` and `files` - A set of files relative to `baseDir`.
* `files` - The files should be absolute paths; otherwise they are resolved relative to Druid's
  working directory (usually the Druid install directory).

Examples:

To read all files in `/tmp`, which must be CSV files:

```sql
SELECT ...
FROM TABLE(
  localfiles(
    baseDir => '/tmp',
    format => 'csv')
  ) (x VARCHAR, y VARCHAR)
```

Some additional variations (omitting the common bits):

```sql
-- CSV files in /tmp
localfiles(baseDir => '/tmp',
           filter => '*.csv',
           format => 'csv')

-- /tmp/a.csv and /tmp/b.csv
localfiles(baseDir => '/tmp',
           files => ARRAY['a.csv', 'b.csv'],
           format => 'csv')

-- /tmp/a.csv and /tmp/b.csv
localfiles(files => ARRAY['/tmp/a.csv', '/tmp/b.csv'],
           format => 'csv')
```

#### `S3` Function

The `S3` table function represents the
[S3 input source](../ingestion/native-batch-input-source.md#s3-input-source) which reads
files from an S3 bucket. The function accepts the following parameters to specify the
objects to read:

* `uris` (`ARRAY` of `VARCHAR`)
* `prefix` (`VARCHAR`) - Corresponds to the JSON `prefixes` property, but allows a single
  prefix.
* `bucket` (`VARCHAR`) - Corresponds to the `bucket` field of the `objects` JSON field. SQL
  does not have syntax for an array of objects. Instead, this function takes a single bucket,
  and one or more objects within that bucket.
* `paths` (`ARRAY` of `VARCHAR`) - Corresponds to the `path` fields of the `objects` JSON field.
  All paths are within the single `bucket` parameter.

The S3 input source accepts one of the following patterns:

* `uris` - A list of fully-qualified object URIs.
* `prefixes` - A list of fully-qualified "folder" prefixes.
* `bucket` and `paths` - A list of objects relative to the given bucket path.

The `S3` function also accepts the following security parameters:

* `accessKeyId` (`VARCHAR`)
* `secretAccessKey` (`VARCHAR`)
* `assumeRoleArn` (`VARCHAR`)

The `S3` table function does not support either the `clientConfig` or `proxyConfig`
JSON properties.

If you need the full power of the S3 input source, then consider using the `extern`
function, which accepts the full S3 input source serialized as JSON. Alternatively,
create a catalog external table that has the full set of properties, leaving just the
`uris` or `paths` to be provided at query time.
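
For illustration, a minimal sketch of the `extern` alternative mentioned above, passing the
input source, input format, and row signature as JSON strings. The bucket, object and column
names are assumptions carried over from the earlier examples, and only a bare `uris` property
is shown; any other S3 input source property could appear in the same JSON.

```sql
SELECT ...
FROM TABLE(
  extern(
    -- S3 input source serialized as JSON.
    '{"type": "s3", "uris": ["s3://foo/bar/file.json"]}',
    -- Input format as JSON.
    '{"type": "json"}',
    -- Row signature as JSON.
    '[{"name": "x", "type": "string"}, {"name": "y", "type": "string"}]'
  )
)
```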

Examples, each of which corresponds to an example on the
[S3 input source](../ingestion/native-batch-input-source.md#s3-input-source) page.
The examples omit the format and schema; however, you must provide them
in an actual query.

```sql
SELECT ...
FROM TABLE(S3(
    uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
    format => 'csv'
  )) (x VARCHAR, y VARCHAR)
```

Additional variations, omitting the common bits:

```sql
S3(prefixes => ARRAY['s3://foo/bar/', 's3://bar/foo/'])
```

```sql
-- Not an exact match for the JSON example: the S3 function allows
-- only one bucket.
S3(bucket => 's3://foo',
   paths => ARRAY['bar/file1.json', 'foo/file2.json'])
```

```sql
S3(uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
   accessKeyId => 'KLJ78979SDFdS2',
   secretAccessKey => 'KLS89s98sKJHKJKJH8721lljkd')
```

```sql
S3(uris => ARRAY['s3://foo/bar/file.json', 's3://bar/foo/file2.json'],
   accessKeyId => 'KLJ78979SDFdS2',
   secretAccessKey => 'KLS89s98sKJHKJKJH8721lljkd',
   assumeRoleArn => 'arn:aws:iam::2981002874992:role/role-s3')
```

#### Input Format

Each of the table functions above requires that you specify a format using the `format`
parameter, which accepts the same format names used for `EXTERN` and described
for [each input source](../ingestion/native-batch-input-source.md).

#### CSV Format

The `csv` format selects the [CSV input format](../ingestion/data-formats.md#csv).
Parameters:

* `listDelimiter` (`VARCHAR`)
* `skipHeaderRows` (`BOOLEAN`)

Example for a CSV format with a list delimiter, where we want to skip the first
input row:

```sql
SELECT ...
FROM TABLE(
  inline(
    data => ARRAY[
        'skip me',
        'a;foo,b',
        'c;bar,d'],
    format => 'csv',
    listDelimiter => ';',
    skipHeaderRows => 1
    )
  ) (x VARCHAR, y VARCHAR)
```

#### Delimited Text Format

The `tsv` format selects the [TSV (Delimited) input format](../ingestion/data-formats.md#tsv-delimited).
Parameters:

* `delimiter` (`VARCHAR`)
* `listDelimiter` (`VARCHAR`)
* `skipHeaderRows` (`BOOLEAN`)

Example for a pipe-separated format with a list delimiter, where we want to skip the first
input row:

```sql
SELECT ...
FROM TABLE(
  inline(
    data => ARRAY[
        'skip me',
        'a;foo|b',
        'c;bar|d'],
    format => 'tsv',
    listDelimiter => ';',
    skipHeaderRows => 1,
    delimiter => '|'
    )
  ) (x VARCHAR, y VARCHAR)
```

#### JSON Format

The `json` format selects the
[JSON input format](../ingestion/data-formats.md#json).
The JSON format accepts no additional parameters.

Example:

```sql
SELECT ...
FROM TABLE(
  inline(
    data => ARRAY['{"x": "foo", "y": "bar"}'],
    format => 'json')
  ) (x VARCHAR, y VARCHAR)
```

The JSON function allows columns to be of type `TYPE('COMPLEX<json>')`, which indicates that the column contains
some form of complex JSON: a JSON object, a JSON array, or an array of JSON objects or arrays.
Note that the case must exactly match that given: upper case `COMPLEX`, lower case `json`.
The SQL type simply names a native Druid type. However, the actual
segment column produced may be of some other type if Druid infers that it can use a simpler type
instead.
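
As a small sketch of that declaration (the nested value under `y` is an assumed payload, used
only to give the complex column something to hold):

```sql
SELECT ...
FROM TABLE(
  inline(
    -- The 'y' field holds a nested JSON object, so its column is declared COMPLEX<json>.
    data => ARRAY['{"x": "foo", "y": {"a": 1, "b": "bar"}}'],
    format => 'json')
  ) (x VARCHAR, y TYPE('COMPLEX<json>'))
```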

### Parameters

Starting with the Druid 26.0 release, you can use query parameters with MSQ queries. You may find
that you periodically ingest a new set of files into Druid. Often, the bulk of the query is identical
for each ingestion: only the list of files (or URIs or objects) changes. For example, for the `S3`
input source, you will likely ingest from the same bucket and security setup in
each query; only the specific objects will change. Consider using a query parameter
to pass the object names:

```sql
INSERT INTO ...
SELECT ...
FROM TABLE(S3(bucket => 's3://foo',
              accessKeyId => ?,
              paths => ?,
              format => 'json'))
     (a VARCHAR, b BIGINT, ...)
```

This same technique can be used with the `uris` or `prefixes` parameters instead.
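
For example, a sketch of the same ingest parameterized on `uris` rather than `bucket` and
`paths` (the column list is carried over from the example above):

```sql
INSERT INTO ...
SELECT ...
FROM TABLE(S3(uris => ?,
              accessKeyId => ?,
              format => 'json'))
     (a VARCHAR, b BIGINT, ...)
```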

Function arguments that take an array parameter require an array value in your JSON request.
For example:

```json
{
  "query" : "INSERT INTO ...
             SELECT ...
             FROM TABLE(S3(bucket => 's3://foo',
                           accessKeyId => ?,
                           paths => ?,
                           format => 'json'))
                  (a VARCHAR, b BIGINT, ...)",
  "parameters": [
    { "type": "VARCHAR", "value": "ABCD-EF01"},
    { "type": "VARCHAR", "value": [
      "foo.csv", "bar.csv"
    ] }
  ]
}
```

The type in the above example is the type of each element. It must be `VARCHAR` for all the array
parameters for functions described on this page.

### `INSERT`

Use the `INSERT` statement to insert data.

@@ -32,9 +32,6 @@ import org.apache.druid.msq.sql.MSQTaskSqlEngine;
import org.apache.druid.sql.SqlStatementFactory;
import org.apache.druid.sql.SqlToolbox;
import org.apache.druid.sql.calcite.external.ExternalOperatorConversion;
import org.apache.druid.sql.calcite.external.HttpOperatorConversion;
import org.apache.druid.sql.calcite.external.InlineOperatorConversion;
import org.apache.druid.sql.calcite.external.LocalOperatorConversion;
import org.apache.druid.sql.guice.SqlBindings;

import java.util.List;

@@ -62,9 +59,6 @@ public class MSQSqlModule implements DruidModule

    // Set up the EXTERN macro.
    SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class);
    SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class);
    SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class);
    SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class);
  }

  @Provides

@@ -28,6 +28,7 @@ import org.apache.druid.testing.utils.MsqTestQueryHelper;
import org.apache.druid.testsEx.categories.MultiStageQuery;
import org.apache.druid.testsEx.config.DruidTestRunner;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;

@@ -111,6 +112,7 @@ public class ITMultiStageQuery
  }

  @Test
  @Ignore("localfiles() is disabled")
  public void testMsqIngestionAndQueryingWithLocalFn() throws Exception
  {
    String datasource = "dst";

@@ -154,6 +154,9 @@ public class CalciteIngestionDmlTest extends BaseCalciteQueryTest

    // Set up the EXTERN macro.
    SqlBindings.addOperatorConversion(binder, ExternalOperatorConversion.class);

    // Enable the extended table functions for testing even though these
    // are not enabled in production in Druid 26.
    SqlBindings.addOperatorConversion(binder, HttpOperatorConversion.class);
    SqlBindings.addOperatorConversion(binder, InlineOperatorConversion.class);
    SqlBindings.addOperatorConversion(binder, LocalOperatorConversion.class);