Web console: Fixed sampling for delta source in classic data loader and MSQ (#17160) (#17199)

Abhishek Radhakrishnan 2024-09-30 10:25:38 -07:00 committed by GitHub
parent 7638d29c40
commit c6f41dcd22
10 changed files with 152 additions and 137 deletions


@@ -56,7 +56,7 @@ function _build_distribution() {
  (
    # Add HEAD as an allowed HTTP method since this is how we check when the Druid service is ready.
    cd "$(_get_code_root)" \
-      && mvn -Pdist,skip-static-checks,skip-tests -Dmaven.javadoc.skip=true -q -T1C install \
+      && mvn -Pdist,bundle-contrib-exts,skip-static-checks,skip-tests -Dforbiddenapis.skip=true -Dcheckstyle.skip=true -Dpmd.skip=true -Dmaven.javadoc.skip=true -Danimal.sniffer.skip=true -Denforcer.skip=true -Dcyclonedx.skip=true -q -T1C install \
      && cd distribution/target \
      && tar xzf "apache-druid-$(_get_druid_version)-bin.tar.gz" \
      && cd apache-druid-$(_get_druid_version) \
@@ -64,7 +64,7 @@ function _build_distribution() {
      && cp "$(_get_code_root)/extensions-core/testing-tools/target/druid-testing-tools-$(_get_druid_version).jar" extensions/druid-testing-tools/ \
      && mkdir -p extensions/druid-compressed-bigdecimal \
      && cp "$(_get_code_root)/extensions-contrib/compressed-bigdecimal/target/druid-compressed-bigdecimal-$(_get_druid_version).jar" extensions/druid-compressed-bigdecimal/ \
-      && echo -e "\n\ndruid.extensions.loadList=[\"druid-hdfs-storage\", \"druid-kafka-indexing-service\", \"druid-multi-stage-query\", \"druid-testing-tools\", \"druid-bloom-filter\", \"druid-datasketches\", \"druid-histogram\", \"druid-stats\", \"druid-compressed-bigdecimal\"]" >> conf/druid/auto/_common/common.runtime.properties \
+      && echo -e "\n\ndruid.extensions.loadList=[\"druid-hdfs-storage\", \"druid-kafka-indexing-service\", \"druid-multi-stage-query\", \"druid-testing-tools\", \"druid-bloom-filter\", \"druid-datasketches\", \"druid-histogram\", \"druid-stats\", \"druid-compressed-bigdecimal\", \"druid-parquet-extensions\", \"druid-deltalake-extensions\"]" >> conf/druid/auto/_common/common.runtime.properties \
      && echo -e "\n\ndruid.server.http.allowedHttpMethods=[\"HEAD\"]" >> conf/druid/auto/_common/common.runtime.properties \
      && echo -e "\n\ndruid.export.storage.baseDir=/" >> conf/druid/auto/_common/common.runtime.properties \
  )


@@ -25,9 +25,8 @@ import AceEditor from 'react-ace';

import './json-input.scss';

-function parseHjson(str: string) {
-  // Throwing on empty input is more consistent with how JSON.parse works
-  if (str.trim() === '') throw new Error('empty hjson');
+function parseHjson(str: string): any {
+  if (str.trim() === '') return;
  return Hjson.parse(str);
}
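For context, a minimal sketch of the reworked helper above, assuming the Hjson dependency the component already imports: empty input now yields undefined instead of throwing, so an optional JSON field (such as the Delta filter) can be left blank without failing validation.

import * as Hjson from 'hjson';

// Sketch of the new behavior: empty input short-circuits to undefined.
function parseHjson(str: string): any {
  if (str.trim() === '') return;
  return Hjson.parse(str);
}

parseHjson('');                                          // undefined (previously threw)
parseHjson('{type: "=", column: "name", value: "foo"}'); // { type: '=', column: 'name', value: 'foo' }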


@@ -395,6 +395,10 @@ export function isDruidSource(spec: Partial<IngestionSpec>): boolean {
  return deepGet(spec, 'spec.ioConfig.inputSource.type') === 'druid';
}

+export function isFixedFormatSource(spec: Partial<IngestionSpec>): boolean {
+  return oneOf(deepGet(spec, 'spec.ioConfig.inputSource.type'), 'druid', 'delta');
+}
+
export function getPossibleSystemFieldsForSpec(spec: Partial<IngestionSpec>): string[] {
  const inputSource = deepGet(spec, 'spec.ioConfig.inputSource');
  if (!inputSource) return [];
@@ -1061,7 +1065,6 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
      label: 'Delta filter',
      type: 'json',
      placeholder: '{"type": "=", "column": "name", "value": "foo"}',
-      defaultValue: {},
      info: (
        <>
          <ExternalLink
@@ -1078,7 +1081,7 @@ export function getIoConfigFormFields(ingestionComboType: IngestionComboType): F
      label: 'Delta snapshot version',
      type: 'number',
      placeholder: '(latest)',
-      defaultValue: {},
+      zeroMeansUndefined: true,
      info: (
        <>
          The snapshot version to read from the Delta table. By default, the latest snapshot is
@@ -1613,6 +1616,9 @@ export function guessDataSourceNameFromInputSource(inputSource: InputSource): st
      return actualPath ? actualPath.path : uriPath ? filenameFromPath(uriPath) : undefined;
    }

+    case 'delta':
+      return inputSource.tablePath ? filenameFromPath(inputSource.tablePath) : undefined;
+
    case 'http':
      return Array.isArray(inputSource.uris) ? filenameFromPath(inputSource.uris[0]) : undefined;
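A hedged usage sketch of the helpers touched above; the spec literal and table path are made up, and oneOf, deepGet, and filenameFromPath are the console's existing utilities.

// Hypothetical partial spec pointing at a Delta table.
const deltaSpec: any = {
  spec: { ioConfig: { inputSource: { type: 'delta', tablePath: '/data/people' } } },
};

isDruidSource(deltaSpec);       // false
isFixedFormatSource(deltaSpec); // true: delta, like druid, has a format the user cannot configure

// Data source name guessing now understands delta table paths
// (filenameFromPath keeps the last path segment, so this suggests 'people').
guessDataSourceNameFromInputSource(deltaSpec.spec.ioConfig.inputSource);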


@@ -653,7 +653,6 @@ export const INPUT_SOURCE_FIELDS: Field<InputSource>[] = [
    type: 'json',
    placeholder: '{"type": "=", "column": "name", "value": "foo"}',
    defined: typeIsKnown(KNOWN_TYPES, 'delta'),
-    required: false,
    info: (
      <>
        <ExternalLink href={`${getLink('DOCS')}/ingestion/input-sources/#delta-filter-object`}>
@@ -668,8 +667,8 @@ export const INPUT_SOURCE_FIELDS: Field<InputSource>[] = [
    label: 'Delta snapshot version',
    type: 'number',
    placeholder: '(latest)',
+    zeroMeansUndefined: true,
    defined: typeIsKnown(KNOWN_TYPES, 'delta'),
-    required: false,
    info: (
      <>
        The snapshot version to read from the Delta table. By default, the latest snapshot is read.


@@ -17,6 +17,7 @@
 */

import { dedupe, F, SqlExpression, SqlFunction } from '@druid-toolkit/query';
+import type { CancelToken } from 'axios';
import * as JSONBig from 'json-bigint-native';

import type {
@@ -40,6 +41,7 @@ import {
  getSpecType,
  getTimestampSchema,
  isDruidSource,
+  isFixedFormatSource,
  PLACEHOLDER_TIMESTAMP_SPEC,
  REINDEX_TIMESTAMP_SPEC,
  TIME_COLUMN,
@@ -187,12 +189,15 @@ export async function getProxyOverlordModules(): Promise<string[]> {
export async function postToSampler(
  sampleSpec: SampleSpec,
  forStr: string,
+  cancelToken?: CancelToken,
): Promise<SampleResponse> {
  sampleSpec = fixSamplerLookups(fixSamplerTypes(sampleSpec));

  let sampleResp: any;
  try {
-    sampleResp = await Api.instance.post(`/druid/indexer/v1/sampler?for=${forStr}`, sampleSpec);
+    sampleResp = await Api.instance.post(`/druid/indexer/v1/sampler?for=${forStr}`, sampleSpec, {
+      cancelToken,
+    });
  } catch (e) {
    throw new Error(getDruidErrorMessage(e));
  }
@@ -269,8 +274,7 @@ export async function sampleForConnect(
    sampleStrategy,
  );

-  const reingestMode = isDruidSource(spec);
-  if (!reingestMode) {
+  if (!isFixedFormatSource(spec)) {
    ioConfig = deepSet(
      ioConfig,
      'inputFormat',
@@ -282,6 +286,7 @@ export async function sampleForConnect(
    );
  }

+  const reingestMode = isDruidSource(spec);
  const sampleSpec: SampleSpec = {
    type: samplerType,
    spec: {
@@ -290,7 +295,7 @@ export async function sampleForConnect(
      dataSchema: {
        dataSource: 'sample',
        timestampSpec: reingestMode ? REINDEX_TIMESTAMP_SPEC : PLACEHOLDER_TIMESTAMP_SPEC,
-        dimensionsSpec: {},
+        dimensionsSpec: { useSchemaDiscovery: true },
        granularitySpec: {
          rollup: false,
        },
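A hedged sketch of how a caller can use the new optional cancelToken parameter on postToSampler; the import and the declared spec are illustrative only.

import axios from 'axios';

declare const sampleSpec: SampleSpec; // built by the data loader wizard

const source = axios.CancelToken.source();
const sampling = postToSampler(sampleSpec, 'connect', source.token);

// If the user edits the input source before the sampler responds, the
// in-flight request can now be aborted instead of racing the newer one.
source.cancel('superseded by a newer sample request');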


@@ -115,6 +115,7 @@ import {
  invalidPartitionConfig,
  isDruidSource,
  isEmptyIngestionSpec,
+  isFixedFormatSource,
  isKafkaOrKinesis,
  isStreamingSpec,
  issueWithIoConfig,
@@ -265,26 +266,27 @@ function showBlankLine(line: SampleEntry): string {

function formatSampleEntries(
  sampleEntries: SampleEntry[],
-  specialSource: undefined | 'druid' | 'kafka' | 'kinesis',
-): string {
-  if (!sampleEntries.length) return 'No data returned from sampler';
+  specialSource: undefined | 'fixedFormat' | 'druid' | 'kafka' | 'kinesis',
+): string[] {
+  if (!sampleEntries.length) return ['No data returned from sampler'];

  switch (specialSource) {
+    case 'fixedFormat':
+      return sampleEntries.map(l => JSONBig.stringify(l.parsed));
+
    case 'druid':
-      return sampleEntries.map(showDruidLine).join('\n');
+      return sampleEntries.map(showDruidLine);

    case 'kafka':
-      return sampleEntries.map(showKafkaLine).join('\n');
+      return sampleEntries.map(showKafkaLine);

    case 'kinesis':
-      return sampleEntries.map(showKinesisLine).join('\n');
+      return sampleEntries.map(showKinesisLine);

    default:
-      return (
-        sampleEntries.every(l => !l.parsed)
+      return sampleEntries.every(l => !l.parsed)
        ? sampleEntries.map(showBlankLine)
-        : sampleEntries.map(showRawLine)
-      ).join('\n');
+        : sampleEntries.map(showRawLine);
  }
}
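A hedged sketch of the reworked contract: formatSampleEntries now returns one formatted string per entry and the caller joins them (the preview TextArea further down does .join('\n')). The sample entry below is made up.

const entries = [{ input: { raw: '{"x": 1}' }, parsed: { x: 1 } }] as SampleEntry[];

formatSampleEntries(entries, 'fixedFormat'); // ['{"x":1}']  -- parsed rows stringified via JSONBig
formatSampleEntries(entries, undefined);     // raw lines via showRawLine, one string per entry
formatSampleEntries(entries, undefined).join('\n'); // what the <TextArea> now receives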
@@ -551,7 +553,6 @@ export class LoadDataView extends React.PureComponent<LoadDataViewProps, LoadDat
  isStepEnabled(step: Step): boolean {
    const { spec, cacheRows } = this.state;
-    const druidSource = isDruidSource(spec);
    const ioConfig: IoConfig = deepGet(spec, 'spec.ioConfig') || EMPTY_OBJECT;

    switch (step) {
@@ -559,10 +560,12 @@ export class LoadDataView extends React.PureComponent<LoadDataViewProps, LoadDat
        return Boolean(spec.type);

      case 'parser':
-        return Boolean(!druidSource && spec.type && !issueWithIoConfig(ioConfig));
+        return Boolean(!isFixedFormatSource(spec) && spec.type && !issueWithIoConfig(ioConfig));

      case 'timestamp':
-        return Boolean(!druidSource && cacheRows && deepGet(spec, 'spec.dataSchema.timestampSpec'));
+        return Boolean(
+          !isDruidSource(spec) && cacheRows && deepGet(spec, 'spec.dataSchema.timestampSpec'),
+        );

      case 'transform':
      case 'filter':
@@ -1256,7 +1259,7 @@ export class LoadDataView extends React.PureComponent<LoadDataViewProps, LoadDat
        const deltaState: Partial<LoadDataViewState> = {
          inputQueryState: new QueryState({ data: sampleResponse }),
        };
-        if (isDruidSource(spec)) {
+        if (isFixedFormatSource(spec)) {
          deltaState.cacheRows = getCacheRowsFromSampleResponse(sampleResponse);
        }
        this.setState(deltaState as LoadDataViewState);
@@ -1268,8 +1271,15 @@ export class LoadDataView extends React.PureComponent<LoadDataViewProps, LoadDat
    const specType = getSpecType(spec);
    const ioConfig: IoConfig = deepGet(spec, 'spec.ioConfig') || EMPTY_OBJECT;
    const inlineMode = deepGet(spec, 'spec.ioConfig.inputSource.type') === 'inline';
+    const fixedFormatSource = isFixedFormatSource(spec);
    const druidSource = isDruidSource(spec);
-    const specialSource = druidSource ? 'druid' : isKafkaOrKinesis(specType) ? specType : undefined;
+    const specialSource = druidSource
+      ? 'druid'
+      : fixedFormatSource
+      ? 'fixedFormat'
+      : isKafkaOrKinesis(specType)
+      ? specType
+      : undefined;

    let mainFill: JSX.Element | string;
    if (inlineMode) {
@@ -1301,7 +1311,7 @@ export class LoadDataView extends React.PureComponent<LoadDataViewProps, LoadDat
          <TextArea
            className="raw-lines"
            readOnly
-            value={formatSampleEntries(inputData, specialSource)}
+            value={formatSampleEntries(inputData, specialSource).join('\n')}
          />
        )}
        {inputQueryState.isLoading() && <Loader />}
@@ -1373,7 +1383,7 @@ export class LoadDataView extends React.PureComponent<LoadDataViewProps, LoadDat
      </div>
      {this.renderNextBar({
        disabled: !inputQueryState.data,
-        nextStep: druidSource ? 'transform' : 'parser',
+        nextStep: druidSource ? 'transform' : fixedFormatSource ? 'timestamp' : 'parser',
        onNextStep: () => {
          if (!inputQueryState.data) return false;
          const inputData = inputQueryState.data;
@@ -1421,6 +1431,15 @@ export class LoadDataView extends React.PureComponent<LoadDataViewProps, LoadDat
            }
          }

+          this.updateSpec(fillDataSourceNameIfNeeded(newSpec));
+        }
+
+        if (fixedFormatSource) {
+          const newSpec = deepSet(
+            spec,
+            'spec.dataSchema.timestampSpec',
+            getTimestampSpec(inputQueryState.data),
+          );
          this.updateSpec(fillDataSourceNameIfNeeded(newSpec));
        } else {
          const issue = issueWithSampleData(
@@ -1673,21 +1692,15 @@ export class LoadDataView extends React.PureComponent<LoadDataViewProps, LoadDat
        disabled: !parserQueryState.data,
        onNextStep: () => {
          if (!parserQueryState.data) return false;
-          let possibleTimestampSpec: TimestampSpec;
-          if (isDruidSource(spec)) {
-            possibleTimestampSpec = {
+          const possibleTimestampSpec = isDruidSource(spec)
+            ? {
                column: TIME_COLUMN,
                format: 'auto',
-            };
-          } else {
-            possibleTimestampSpec = getTimestampSpec(parserQueryState.data);
-          }
+              }
+            : getTimestampSpec(parserQueryState.data);

-          if (possibleTimestampSpec) {
            const newSpec = deepSet(spec, 'spec.dataSchema.timestampSpec', possibleTimestampSpec);
            this.updateSpec(newSpec);
-          }

          return true;
        },
      })}
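A hedged sketch of the changed step routing after the connect step; druidSource and fixedFormatSource mirror the booleans computed above, and the step names are the wizard's own.

function nextStepAfterConnect(druidSource: boolean, fixedFormatSource: boolean): string {
  return druidSource ? 'transform' : fixedFormatSource ? 'timestamp' : 'parser';
}

nextStepAfterConnect(false, true);  // 'timestamp' -- delta skips the parser step entirely
nextStepAfterConnect(false, false); // 'parser'    -- e.g. s3 + JSON still configures a format
// For fixed-format sources the timestampSpec is seeded from the connect sample
// via getTimestampSpec(inputQueryState.data), as in the hunk above.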


@@ -256,7 +256,6 @@ export const SqlDataLoaderView = React.memo(function SqlDataLoaderView(
        <TitleFrame title="Load data" subtitle="Select input type">
          <InputSourceStep
            initInputSource={inputSource}
-            mode="sampler"
            onSet={(inputSource, inputFormat) => {
              setExternalConfigStep({ inputSource, inputFormat });
            }}


@@ -83,7 +83,6 @@ export const ConnectExternalDataDialog = React.memo(function ConnectExternalData
      ) : (
        <InputSourceStep
          initInputSource={inputSource}
-          mode="sampler"
          onSet={(inputSource, inputFormat, partitionedByHint) => {
            setExternalConfigStep({ inputSource, inputFormat, partitionedByHint });
          }}


@@ -98,7 +98,9 @@ export const InputFormatStep = React.memo(function InputFormatStep(props: InputF
  const [previewState] = useQueryManager<InputSourceAndFormat, SampleResponse>({
    query: inputSourceAndFormatToSample,
    processQuery: async ({ inputSource, inputFormat }) => {
-      if (!isValidInputFormat(inputFormat)) throw new Error('invalid input format');
+      const fixedFormatSource = inputSource.type === 'delta';
+      if (!fixedFormatSource && !isValidInputFormat(inputFormat))
+        throw new Error('invalid input format');

      const sampleSpec: SampleSpec = {
        type: 'index_parallel',
@@ -106,7 +108,9 @@ export const InputFormatStep = React.memo(function InputFormatStep(props: InputF
          ioConfig: {
            type: 'index_parallel',
            inputSource,
-            inputFormat: deepSet(inputFormat, 'keepNullColumns', true),
+            inputFormat: fixedFormatSource
+              ? undefined
+              : (deepSet(inputFormat, 'keepNullColumns', true) as InputFormat),
          },
          dataSchema: {
            dataSource: 'sample',
@@ -196,6 +200,8 @@ export const InputFormatStep = React.memo(function InputFormatStep(props: InputF
  const needsResample = inputSourceAndFormatToSample !== inputSourceAndFormat;
  const nextDisabled = !inputSourceFormatAndMore || needsResample;

+  const fixedFormatSource = inputSourceFormatAndMore?.inputSource.type === 'delta';
+
  return (
    <div className="input-format-step">
      <div className="preview">
@@ -227,6 +233,15 @@ export const InputFormatStep = React.memo(function InputFormatStep(props: InputF
            <LearnMore href={`${getLink('DOCS')}/ingestion/data-formats`} />
          </Callout>
        </FormGroup>
+        {fixedFormatSource ? (
+          <FormGroup>
+            <Callout>
+              The <Tag minimal>{inputSourceFormatAndMore?.inputSource.type}</Tag> input source has
+              a fixed format that can not be configured.
+            </Callout>
+          </FormGroup>
+        ) : (
+          <>
            <AutoForm
              fields={BATCH_INPUT_FORMAT_FIELDS}
              model={inputSourceAndFormat.inputFormat}
@@ -249,6 +264,8 @@ export const InputFormatStep = React.memo(function InputFormatStep(props: InputF
                onChange={setInputSourceAndFormat as any}
              />
            )}
+          </>
+        )}
        {needsResample && (
          <FormGroup className="control-buttons">
            <Button
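A hedged sketch of the ioConfig this step now builds: for a delta source the inputFormat is omitted entirely, while every other source still gets keepNullColumns set. The declared values are placeholders.

declare const inputSource: InputSource; // e.g. { type: 'delta', tablePath: '/data/people' }
declare const inputFormat: InputFormat;

const fixedFormatSource = inputSource.type === 'delta';

const ioConfig = {
  type: 'index_parallel',
  inputSource,
  inputFormat: fixedFormatSource
    ? undefined // the Delta connector reads the table's own (Parquet-based) layout
    : (deepSet(inputFormat, 'keepNullColumns', true) as InputFormat),
};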


@@ -28,7 +28,6 @@ import {
} from '@blueprintjs/core';
import { IconNames } from '@blueprintjs/icons';
import type { QueryResult } from '@druid-toolkit/query';
-import { SqlColumnDeclaration } from '@druid-toolkit/query';
import classNames from 'classnames';
import type { JSX } from 'react';
import React, { useEffect, useState } from 'react';
@@ -37,7 +36,6 @@ import { AutoForm, ExternalLink } from '../../../components';
import { ShowValueDialog } from '../../../dialogs/show-value-dialog/show-value-dialog';
import type { Execution, ExecutionError, InputFormat, InputSource } from '../../../druid-models';
import {
-  externalConfigToTableExpression,
  getIngestionImage,
  getIngestionTitle,
  guessSimpleInputFormat,
@@ -45,11 +43,7 @@ import {
  issueWithSampleData,
  PLACEHOLDER_TIMESTAMP_SPEC,
} from '../../../druid-models';
-import {
-  executionBackgroundResultStatusCheck,
-  extractResult,
-  submitTaskQuery,
-} from '../../../helpers';
+import { executionBackgroundResultStatusCheck } from '../../../helpers';
import { useQueryManager } from '../../../hooks';
import { AppToaster, UrlBaser } from '../../../singletons';
import { filterMap, IntermediateQueryState } from '../../../utils';
@@ -61,17 +55,20 @@ import { InputSourceInfo } from './input-source-info';

import './input-source-step.scss';

+const BOGUS_LIST_DELIMITER = '56616469-6de2-9da4-efb8-8f416e6e6965'; // Just a UUID to disable the list delimiter, let's hope we do not see this UUID in the data
+const ROWS_TO_SAMPLE = 50;
+
+const FIXED_FORMAT_FOR_SOURCE: Record<string, InputFormat> = {
+  delta: { type: 'parquet' },
+};
+
function resultToInputFormat(result: QueryResult): InputFormat {
  if (!result.rows.length) throw new Error('No data returned from sample query');
  return guessSimpleInputFormat(result.rows.map((r: any) => r[0]));
}

-const BOGUS_LIST_DELIMITER = '56616469-6de2-9da4-efb8-8f416e6e6965'; // Just a UUID to disable the list delimiter, let's hope we do not see this UUID in the data
-const ROWS_TO_SAMPLE = 50;
-
export interface InputSourceStepProps {
  initInputSource: Partial<InputSource> | undefined;
-  mode: 'sampler' | 'msq';
  onSet(
    inputSource: InputSource,
    inputFormat: InputFormat,
@@ -80,7 +77,7 @@ export interface InputSourceStepProps {
}

export const InputSourceStep = React.memo(function InputSourceStep(props: InputSourceStepProps) {
-  const { initInputSource, mode, onSet } = props;
+  const { initInputSource, onSet } = props;

  const [stackToShow, setStackToShow] = useState<string | undefined>();
  const [inputSource, setInputSource] = useState<Partial<InputSource> | string | undefined>(
@@ -94,15 +91,17 @@ export const InputSourceStep = React.memo(function InputSourceStep(props: InputS
      Execution
    >({
      processQuery: async ({ inputSource, suggestedInputFormat }, cancelToken) => {
-        let sampleLines: string[];
-        if (mode === 'sampler') {
+        const fixedFormat = FIXED_FORMAT_FOR_SOURCE[inputSource.type];
+
        const sampleSpec: SampleSpec = {
          type: 'index_parallel',
          spec: {
            ioConfig: {
              type: 'index_parallel',
              inputSource,
-              inputFormat: {
+              inputFormat: fixedFormat
+                ? undefined
+                : {
                    type: 'regex',
                    pattern: '([\\s\\S]*)', // Match the entire line, every single character
                    listDelimiter: BOGUS_LIST_DELIMITER,
@@ -124,36 +123,14 @@ export const InputSourceStep = React.memo(function InputSourceStep(props: InputS
          },
        };

-        const sampleResponse = await postToSampler(sampleSpec, 'input-source-step');
-
-        sampleLines = filterMap(sampleResponse.data, l => (l.input ? l.input.raw : undefined));
-      } else {
-        const tableExpression = externalConfigToTableExpression({
-          inputSource,
-          inputFormat: {
-            type: 'regex',
-            pattern: '([\\s\\S]*)',
-            listDelimiter: BOGUS_LIST_DELIMITER,
-            columns: ['raw'],
-          },
-          signature: [SqlColumnDeclaration.create('raw', 'VARCHAR')],
-        });
-
-        const result = extractResult(
-          await submitTaskQuery({
-            query: `SELECT REPLACE(raw, U&'\\0000', '') AS "raw" FROM ${tableExpression}`, // Make sure to remove possible \u0000 chars as they are not allowed and will produce an InvalidNullByte error message
-            context: {
-              sqlOuterLimit: ROWS_TO_SAMPLE,
-            },
-            cancelToken,
-          }),
-        );
-        if (result instanceof IntermediateQueryState) return result;
-
-        sampleLines = result.rows.map((r: string[]) => r[0]);
-      }
+        const sampleResponse = await postToSampler(sampleSpec, 'input-source-step', cancelToken);
+
+        const sampleLines = filterMap(
+          sampleResponse.data,
+          fixedFormat ? l => l.input : l => (l.input ? l.input.raw : undefined),
+        );

        if (!sampleLines.length) throw new Error('No data returned from sampler');

+        if (fixedFormat) return fixedFormat;
+
        const issue = issueWithSampleData(sampleLines, false);
        if (issue) {
@@ -226,6 +203,7 @@ export const InputSourceStep = React.memo(function InputSourceStep(props: InputS
            {renderIngestionCard('s3')}
            {renderIngestionCard('azureStorage')}
            {renderIngestionCard('google')}
+            {renderIngestionCard('delta')}
            {renderIngestionCard('hdfs')}
            {renderIngestionCard('http')}
            {renderIngestionCard('local')}
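A hedged sketch of how the fixed-format lookup above plays out per source type; only 'delta' is mapped today, so every other source keeps the raw regex sampling path.

const formatFor = (type: string) => FIXED_FORMAT_FOR_SOURCE[type];

formatFor('delta'); // { type: 'parquet' } -- raw sampling is skipped and this format is returned directly
formatFor('s3');    // undefined -- raw lines are sampled and guessSimpleInputFormat(...) runs as before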