use objectGlob (#16452)

Catching up to a change introduced in #13027
This commit is contained in:
Vadim Ogievetsky 2024-05-15 02:41:11 -07:00 committed by GitHub
parent ddfd62d9a9
commit c419ae5f73
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 64 additions and 36 deletions

View File

@ -56,7 +56,11 @@ import { summarizeIndexSpec } from '../index-spec/index-spec';
import type { InputFormat } from '../input-format/input-format'; import type { InputFormat } from '../input-format/input-format';
import { issueWithInputFormat } from '../input-format/input-format'; import { issueWithInputFormat } from '../input-format/input-format';
import type { InputSource } from '../input-source/input-source'; import type { InputSource } from '../input-source/input-source';
import { FILTER_SUGGESTIONS, issueWithInputSource } from '../input-source/input-source'; import {
FILTER_SUGGESTIONS,
issueWithInputSource,
OBJECT_GLOB_SUGGESTIONS,
} from '../input-source/input-source';
import type { MetricSpec } from '../metric-spec/metric-spec'; import type { MetricSpec } from '../metric-spec/metric-spec';
import { import {
getMetricSpecOutputType, getMetricSpecOutputType,
@ -584,21 +588,29 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
), ),
}; };
const inputSourceFilter: Field<IoConfig> = { const inputSourceObjectGlob: Field<IoConfig> = {
name: 'inputSource.filter', name: 'inputSource.objectGlob',
label: 'File filter', label: 'Object glob',
type: 'string', type: 'string',
suggestions: FILTER_SUGGESTIONS, suggestions: OBJECT_GLOB_SUGGESTIONS,
placeholder: '*', placeholder: '(all files)',
info: ( info: (
<p> <>
A wildcard filter for files. See{' '} <p>A glob for the object part of the URI.</p>
<ExternalLink href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/filefilter/WildcardFileFilter.html"> <p>
here The glob must match the entire object part, not just the filename. For example, the glob
</ExternalLink>{' '} <Code>*.json</Code> does not match <Code>/bar/file.json</Code>, because the{' '}
for format information. Files matching the filter criteria are considered for ingestion. <Code>*</Code> does not match the slash. To match all objects ending in <Code>.json</Code>
Files not matching the filter criteria are ignored. , use <Code>**.json</Code> instead.
</p> </p>
<p>
For more information, refer to the documentation for{' '}
<ExternalLink href="https://docs.oracle.com/javase/8/docs/api/java/nio/file/FileSystem.html#getPathMatcher-java.lang.String-">
FileSystem#getPathMatcher
</ExternalLink>
.
</p>
</>
), ),
}; };
@ -781,7 +793,7 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
</> </>
), ),
}, },
inputSourceFilter, inputSourceObjectGlob,
{ {
name: 'inputSource.properties.accessKeyId.type', name: 'inputSource.properties.accessKeyId.type',
label: 'Access key ID type', label: 'Access key ID type',
@ -944,7 +956,7 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
</> </>
), ),
}, },
inputSourceFilter, inputSourceObjectGlob,
{ {
name: 'inputSource.properties.sharedAccessStorageToken', name: 'inputSource.properties.sharedAccessStorageToken',
label: 'Shared Access Storage Token', label: 'Shared Access Storage Token',
@ -1018,7 +1030,7 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
</> </>
), ),
}, },
inputSourceFilter, inputSourceObjectGlob,
]; ];
case 'index_parallel:delta': case 'index_parallel:delta':

View File

@ -16,6 +16,7 @@
* limitations under the License. * limitations under the License.
*/ */
import { Code } from '@blueprintjs/core';
import React from 'react'; import React from 'react';
import type { Field } from '../../components'; import type { Field } from '../../components';
@ -36,6 +37,18 @@ export const FILTER_SUGGESTIONS: string[] = [
'*.avro', '*.avro',
]; ];
/**
 * Glob patterns offered as suggestions for the `objectGlob` field of cloud
 * input sources. Each pattern uses `**` rather than `*` because, per the
 * field's help text, `*` does not match the slash in an object path.
 */
export const OBJECT_GLOB_SUGGESTIONS: string[] = [
'**.jsonl',
'**.jsonl.gz',
'**.json',
'**.json.gz',
'**.csv',
'**.tsv',
'**.parquet',
'**.orc',
'**.avro',
];
export interface InputSource { export interface InputSource {
type: string; type: string;
baseDir?: string; baseDir?: string;
@ -43,6 +56,7 @@ export interface InputSource {
uris?: string[]; uris?: string[];
prefixes?: string[]; prefixes?: string[];
objects?: { bucket: string; path: string }[]; objects?: { bucket: string; path: string }[];
objectGlob?: string;
fetchTimeout?: number; fetchTimeout?: number;
systemFields?: string[]; systemFields?: string[];
@ -94,10 +108,11 @@ export type InputSourceDesc =
httpAuthenticationPassword?: any; httpAuthenticationPassword?: any;
} }
| { | {
type: 's3'; type: 's3' | 'google' | 'azureStorage';
uris?: string[]; uris?: string[];
prefixes?: string[]; prefixes?: string[];
objects?: { bucket: string; path: string }[]; objects?: { bucket: string; path: string }[];
objectGlob?: string;
properties?: { properties?: {
accessKeyId?: any; accessKeyId?: any;
secretAccessKey?: any; secretAccessKey?: any;
@ -105,12 +120,6 @@ export type InputSourceDesc =
assumeRoleExternalId?: any; assumeRoleExternalId?: any;
}; };
} }
| {
type: 'google' | 'azureStorage';
uris?: string[];
prefixes?: string[];
objects?: { bucket: string; path: string }[];
}
| { | {
type: 'hdfs'; type: 'hdfs';
paths?: string | string[]; paths?: string | string[];
@ -483,21 +492,28 @@ export const INPUT_SOURCE_FIELDS: Field<InputSource>[] = [
// Cloud common // Cloud common
{ {
name: 'filter', name: 'objectGlob',
label: 'File filter',
type: 'string', type: 'string',
suggestions: FILTER_SUGGESTIONS, suggestions: OBJECT_GLOB_SUGGESTIONS,
placeholder: '*', placeholder: '(all files)',
defined: typeIsKnown(KNOWN_TYPES, 's3', 'azureStorage', 'google'), defined: typeIsKnown(KNOWN_TYPES, 's3', 'azureStorage', 'google'),
info: ( info: (
<p> <>
A wildcard filter for files. See{' '} <p>A glob for the object part of the URI.</p>
<ExternalLink href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/filefilter/WildcardFileFilter.html"> <p>
here The glob must match the entire object part, not just the filename. For example, the glob
</ExternalLink>{' '} <Code>*.json</Code> does not match <Code>/bar/file.json</Code>, because the{' '}
for format information. Files matching the filter criteria are considered for ingestion. <Code>*</Code> does not match the slash. To match all objects ending in <Code>.json</Code>
Files not matching the filter criteria are ignored. , use <Code>**.json</Code> instead.
</p> </p>
<p>
For more information, refer to the documentation for{' '}
<ExternalLink href="https://docs.oracle.com/javase/8/docs/api/java/nio/file/FileSystem.html#getPathMatcher-java.lang.String-">
FileSystem#getPathMatcher
</ExternalLink>
.
</p>
</>
), ),
}, },