use objectGlob (#16452)

Catching up to a change introduced in #13027
This commit is contained in:
Vadim Ogievetsky 2024-05-15 02:41:11 -07:00 committed by GitHub
parent ddfd62d9a9
commit c419ae5f73
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 64 additions and 36 deletions

View File

@ -56,7 +56,11 @@ import { summarizeIndexSpec } from '../index-spec/index-spec';
import type { InputFormat } from '../input-format/input-format';
import { issueWithInputFormat } from '../input-format/input-format';
import type { InputSource } from '../input-source/input-source';
import { FILTER_SUGGESTIONS, issueWithInputSource } from '../input-source/input-source';
import {
FILTER_SUGGESTIONS,
issueWithInputSource,
OBJECT_GLOB_SUGGESTIONS,
} from '../input-source/input-source';
import type { MetricSpec } from '../metric-spec/metric-spec';
import {
getMetricSpecOutputType,
@ -584,21 +588,29 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
),
};
const inputSourceFilter: Field<IoConfig> = {
name: 'inputSource.filter',
label: 'File filter',
const inputSourceObjectGlob: Field<IoConfig> = {
name: 'inputSource.objectGlob',
label: 'Object glob',
type: 'string',
suggestions: FILTER_SUGGESTIONS,
placeholder: '*',
suggestions: OBJECT_GLOB_SUGGESTIONS,
placeholder: '(all files)',
info: (
<>
<p>A glob for the object part of the URI.</p>
<p>
A wildcard filter for files. See{' '}
<ExternalLink href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/filefilter/WildcardFileFilter.html">
here
</ExternalLink>{' '}
for format information. Files matching the filter criteria are considered for ingestion.
Files not matching the filter criteria are ignored.
The glob must match the entire object part, not just the filename. For example, the glob
<Code>*.json</Code> does not match <Code>/bar/file.json</Code>, because the{' '}
<Code>*</Code> does not match the slash. To match all objects ending in <Code>.json</Code>
, use <Code>**.json</Code> instead.
</p>
<p>
For more information, refer to the documentation for{' '}
<ExternalLink href="https://docs.oracle.com/javase/8/docs/api/java/nio/file/FileSystem.html#getPathMatcher-java.lang.String-">
FileSystem#getPathMatcher
</ExternalLink>
.
</p>
</>
),
};
@ -781,7 +793,7 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
</>
),
},
inputSourceFilter,
inputSourceObjectGlob,
{
name: 'inputSource.properties.accessKeyId.type',
label: 'Access key ID type',
@ -944,7 +956,7 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
</>
),
},
inputSourceFilter,
inputSourceObjectGlob,
{
name: 'inputSource.properties.sharedAccessStorageToken',
label: 'Shared Access Storage Token',
@ -1018,7 +1030,7 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
</>
),
},
inputSourceFilter,
inputSourceObjectGlob,
];
case 'index_parallel:delta':

View File

@ -16,6 +16,7 @@
* limitations under the License.
*/
import { Code } from '@blueprintjs/core';
import React from 'react';
import type { Field } from '../../components';
@ -36,6 +37,18 @@ export const FILTER_SUGGESTIONS: string[] = [
'*.avro',
];
/**
 * Suggested values for the `objectGlob` field of cloud input sources.
 *
 * One suggestion is produced per common file extension. Each is prefixed with
 * `**` rather than `*`, because the glob has to match the whole object part
 * and a single `*` does not match across slashes.
 */
export const OBJECT_GLOB_SUGGESTIONS: string[] = [
  'jsonl',
  'jsonl.gz',
  'json',
  'json.gz',
  'csv',
  'tsv',
  'parquet',
  'orc',
  'avro',
].map(extension => `**.${extension}`);
export interface InputSource {
type: string;
baseDir?: string;
@ -43,6 +56,7 @@ export interface InputSource {
uris?: string[];
prefixes?: string[];
objects?: { bucket: string; path: string }[];
objectGlob?: string;
fetchTimeout?: number;
systemFields?: string[];
@ -94,10 +108,11 @@ export type InputSourceDesc =
httpAuthenticationPassword?: any;
}
| {
type: 's3';
type: 's3' | 'google' | 'azureStorage';
uris?: string[];
prefixes?: string[];
objects?: { bucket: string; path: string }[];
objectGlob?: string;
properties?: {
accessKeyId?: any;
secretAccessKey?: any;
@ -105,12 +120,6 @@ export type InputSourceDesc =
assumeRoleExternalId?: any;
};
}
| {
type: 'google' | 'azureStorage';
uris?: string[];
prefixes?: string[];
objects?: { bucket: string; path: string }[];
}
| {
type: 'hdfs';
paths?: string | string[];
@ -483,21 +492,28 @@ export const INPUT_SOURCE_FIELDS: Field<InputSource>[] = [
// Cloud common
{
name: 'filter',
label: 'File filter',
name: 'objectGlob',
type: 'string',
suggestions: FILTER_SUGGESTIONS,
placeholder: '*',
suggestions: OBJECT_GLOB_SUGGESTIONS,
placeholder: '(all files)',
defined: typeIsKnown(KNOWN_TYPES, 's3', 'azureStorage', 'google'),
info: (
<>
<p>A glob for the object part of the URI.</p>
<p>
A wildcard filter for files. See{' '}
<ExternalLink href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/filefilter/WildcardFileFilter.html">
here
</ExternalLink>{' '}
for format information. Files matching the filter criteria are considered for ingestion.
Files not matching the filter criteria are ignored.
The glob must match the entire object part, not just the filename. For example, the glob
<Code>*.json</Code> does not match <Code>/bar/file.json</Code>, because the{' '}
<Code>*</Code> does not match the slash. To match all objects ending in <Code>.json</Code>
, use <Code>**.json</Code> instead.
</p>
<p>
For more information, refer to the documentation for{' '}
<ExternalLink href="https://docs.oracle.com/javase/8/docs/api/java/nio/file/FileSystem.html#getPathMatcher-java.lang.String-">
FileSystem#getPathMatcher
</ExternalLink>
.
</p>
</>
),
},