mirror of
https://github.com/apache/druid.git
synced 2025-02-22 18:30:13 +00:00
Web console: add arrayOfDoublesSketch and other small fixes (#13486)
* add padding and keywords * add arrayOfDoubles * Update docs/development/extensions-core/datasketches-tuple.md Co-authored-by: Charles Smith <techdocsmith@gmail.com> * Update docs/development/extensions-core/datasketches-tuple.md Co-authored-by: Charles Smith <techdocsmith@gmail.com> * Update docs/development/extensions-core/datasketches-tuple.md Co-authored-by: Charles Smith <techdocsmith@gmail.com> * Update docs/development/extensions-core/datasketches-tuple.md Co-authored-by: Charles Smith <techdocsmith@gmail.com> * Update docs/development/extensions-core/datasketches-tuple.md Co-authored-by: Charles Smith <techdocsmith@gmail.com> * partiton int * fix docs Co-authored-by: Charles Smith <techdocsmith@gmail.com>
This commit is contained in:
parent
c7229fc787
commit
9679f6a9b5
@ -39,19 +39,52 @@ druid.extensions.loadList=["druid-datasketches"]
|
||||
"name" : <output_name>,
|
||||
"fieldName" : <metric_name>,
|
||||
"nominalEntries": <number>,
|
||||
"numberOfValues" : <number>,
|
||||
"metricColumns" : <array of strings>
|
||||
"metricColumns" : <array of strings>,
|
||||
"numberOfValues" : <number>
|
||||
}
|
||||
```
|
||||
|
||||
|property|description|required?|
|
||||
|--------|-----------|---------|
|
||||
|type|This String should always be "arrayOfDoublesSketch"|yes|
|
||||
|name|A String for the output (result) name of the calculation.|yes|
|
||||
|name|String representing the output column to store sketch values.|yes|
|
||||
|fieldName|A String for the name of the input field.|yes|
|
||||
|nominalEntries|Parameter that determines the accuracy and size of the sketch. Higher k means higher accuracy but more space to store sketches. Must be a power of 2. See the [Theta sketch accuracy](https://datasketches.apache.org/docs/Theta/ThetaErrorTable) for details. |no, defaults to 16384|
|
||||
|numberOfValues|Number of values associated with each distinct key. |no, defaults to 1|
|
||||
|metricColumns|If building sketches from raw data, an array of names of the input columns containing numeric values to be associated with each distinct key.|no, defaults to empty array|
|
||||
|metricColumns|When building sketches from raw data, an array of input columns that contain numeric values to associate with each distinct key. If not provided, assumes `fieldName` is an `arrayOfDoublesSketch`.|no, if not provided `fieldName` is assumed to be an `arrayOfDoublesSketch`|
|
||||
|numberOfValues|Number of values associated with each distinct key. |no, defaults to the length of `metricColumns` if provided and 1 otherwise|
|
||||
|
||||
You can use the `arrayOfDoublesSketch` aggregator to:
|
||||
|
||||
- Build a sketch from raw data. In this case, set `metricColumns` to an array.
|
||||
- Build a sketch from an existing `ArrayOfDoubles` sketch. In this case, leave `metricColumns` unset and set the `fieldName` to an `ArrayOfDoubles` sketch with `numberOfValues` doubles. At ingestion time, you must base64 encode `ArrayOfDoubles` sketches.
|
||||
|
||||
#### Example on top of raw data
|
||||
|
||||
Compute a theta of unique users. For each user store the `added` and `deleted` scores. The new sketch column will be called `users_theta`.
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "arrayOfDoublesSketch",
|
||||
"name": "users_theta",
|
||||
"fieldName": "user",
|
||||
"nominalEntries": 16384,
|
||||
"metricColumns": ["added", "deleted"]
|
||||
}
|
||||
```
|
||||
|
||||
#### Example ingesting a precomputed sketch column
|
||||
|
||||
Ingest a sketch column called `user_sketches` that has a base64 encoded value of two doubles in its array and store it in a column called `users_theta`.
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "arrayOfDoublesSketch",
|
||||
"name": "users_theta",
|
||||
"fieldName": "user_sketches",
|
||||
"nominalEntries": 16384,
|
||||
"numberOfValues": 2
|
||||
}
|
||||
```
|
||||
|
||||
### Post Aggregators
|
||||
|
||||
|
@ -61,6 +61,9 @@ exports.SQL_KEYWORDS = [
|
||||
'REPLACE INTO',
|
||||
'OVERWRITE',
|
||||
'RETURNING',
|
||||
'OVER',
|
||||
'PARTITION BY',
|
||||
'WINDOW',
|
||||
];
|
||||
|
||||
exports.SQL_EXPRESSION_PARTS = [
|
||||
|
@ -52,9 +52,7 @@ function convertMarkdownToHtml(markdown) {
|
||||
// Convert markdown to HTML
|
||||
markdown = snarkdown(markdown);
|
||||
|
||||
return markdown
|
||||
.replace(/<br \/>/g, '<br /><br />') // Double up the <br>s
|
||||
.replace(/<a[^>]*>(.*?)<\/a>/g, '$1'); // Remove links
|
||||
return markdown.replace(/<a[^>]*>(.*?)<\/a>/g, '$1'); // Remove links
|
||||
}
|
||||
|
||||
const readDoc = async () => {
|
||||
|
@ -53,12 +53,12 @@ export function bootstrapReactTable() {
|
||||
.map((row: any) => row[column.id]);
|
||||
const previewCount = countBy(previewValues);
|
||||
return (
|
||||
<span>
|
||||
<div className="default-aggregated">
|
||||
{Object.keys(previewCount)
|
||||
.sort()
|
||||
.map(v => `${v} (${previewCount[v]})`)
|
||||
.join(', ')}
|
||||
</span>
|
||||
</div>
|
||||
);
|
||||
},
|
||||
defaultPageSize: 20,
|
||||
|
@ -278,7 +278,7 @@ ORDER BY "start" DESC`;
|
||||
intervals = await queryDruidSql({
|
||||
query: SegmentTimeline.getSqlQuery(startDate, endDate),
|
||||
});
|
||||
datasources = uniq(intervals.map(r => r.datasource));
|
||||
datasources = uniq(intervals.map(r => r.datasource).sort());
|
||||
} else if (capabilities.hasCoordinatorAccess()) {
|
||||
const startIso = startDate.toISOString();
|
||||
|
||||
|
@ -63,6 +63,7 @@ export function externalConfigToIngestQueryPattern(
|
||||
config: ExternalConfig,
|
||||
isArrays: boolean[],
|
||||
timeExpression: SqlExpression | undefined,
|
||||
partitionedByHint: string | undefined,
|
||||
): IngestQueryPattern {
|
||||
return {
|
||||
destinationTableName: guessDataSourceNameFromInputSource(config.inputSource) || 'data',
|
||||
@ -71,7 +72,7 @@ export function externalConfigToIngestQueryPattern(
|
||||
mainExternalConfig: config,
|
||||
filters: [],
|
||||
dimensions: externalConfigToInitDimensions(config, isArrays, timeExpression),
|
||||
partitionedBy: timeExpression ? 'day' : 'all',
|
||||
partitionedBy: partitionedByHint || (timeExpression ? 'day' : 'all'),
|
||||
clusteredBy: [],
|
||||
};
|
||||
}
|
||||
|
@ -78,6 +78,7 @@ export const METRIC_SPEC_FIELDS: Field<MetricSpec>[] = [
|
||||
// Should the first / last aggregators become usable at ingestion time, reverse the changes made in:
|
||||
// https://github.com/apache/druid/pull/10794
|
||||
'thetaSketch',
|
||||
'arrayOfDoublesSketch',
|
||||
{
|
||||
group: 'HLLSketch',
|
||||
suggestions: ['HLLSketchBuild', 'HLLSketchMerge'],
|
||||
@ -104,6 +105,7 @@ export const METRIC_SPEC_FIELDS: Field<MetricSpec>[] = [
|
||||
'doubleMax',
|
||||
'floatMax',
|
||||
'thetaSketch',
|
||||
'arrayOfDoublesSketch',
|
||||
'HLLSketchBuild',
|
||||
'HLLSketchMerge',
|
||||
'quantilesDoublesSketch',
|
||||
@ -178,6 +180,47 @@ export const METRIC_SPEC_FIELDS: Field<MetricSpec>[] = [
|
||||
</>
|
||||
),
|
||||
},
|
||||
// arrayOfDoublesSketch
|
||||
{
|
||||
name: 'nominalEntries',
|
||||
type: 'number',
|
||||
defined: typeIs('arrayOfDoublesSketch'),
|
||||
defaultValue: 16384,
|
||||
info: (
|
||||
<>
|
||||
<p>
|
||||
Parameter that determines the accuracy and size of the sketch. Higher k means higher
|
||||
accuracy but more space to store sketches.
|
||||
</p>
|
||||
<p>Must be a power of 2.</p>
|
||||
<p>
|
||||
See the{' '}
|
||||
<ExternalLink href="https://datasketches.apache.org/docs/Theta/ThetaErrorTable">
|
||||
Theta sketch accuracy
|
||||
</ExternalLink>{' '}
|
||||
for details.
|
||||
</p>
|
||||
</>
|
||||
),
|
||||
},
|
||||
{
|
||||
name: 'metricColumns',
|
||||
type: 'string-array',
|
||||
defined: typeIs('arrayOfDoublesSketch'),
|
||||
info: (
|
||||
<>
|
||||
If building sketches from raw data, an array of names of the input columns containing
|
||||
numeric values to be associated with each distinct key.
|
||||
</>
|
||||
),
|
||||
},
|
||||
{
|
||||
name: 'numberOfValues',
|
||||
type: 'number',
|
||||
defined: typeIs('arrayOfDoublesSketch'),
|
||||
placeholder: 'metricColumns length or 1',
|
||||
info: <>Number of values associated with each distinct key.</>,
|
||||
},
|
||||
// HLLSketchBuild & HLLSketchMerge
|
||||
{
|
||||
name: 'lgK',
|
||||
|
@ -82,13 +82,19 @@ export class WorkbenchQuery {
|
||||
externalConfig: ExternalConfig,
|
||||
isArrays: boolean[],
|
||||
timeExpression: SqlExpression | undefined,
|
||||
partitionedByHint: string | undefined,
|
||||
): WorkbenchQuery {
|
||||
return new WorkbenchQuery({
|
||||
queryContext: {},
|
||||
queryParts: [
|
||||
WorkbenchQueryPart.fromQueryString(
|
||||
ingestQueryPatternToQuery(
|
||||
externalConfigToIngestQueryPattern(externalConfig, isArrays, timeExpression),
|
||||
externalConfigToIngestQueryPattern(
|
||||
externalConfig,
|
||||
isArrays,
|
||||
timeExpression,
|
||||
partitionedByHint,
|
||||
),
|
||||
).toString(),
|
||||
),
|
||||
],
|
||||
|
@ -124,9 +124,14 @@ export async function reattachTaskExecution(
|
||||
option: ReattachTaskQueryOptions,
|
||||
): Promise<Execution | IntermediateQueryState<Execution>> {
|
||||
const { id, cancelToken, preserveOnTermination } = option;
|
||||
let execution = await getTaskExecution(id, undefined, cancelToken);
|
||||
let execution: Execution;
|
||||
|
||||
execution = await updateExecutionWithDatasourceExistsIfNeeded(execution, cancelToken);
|
||||
try {
|
||||
execution = await getTaskExecution(id, undefined, cancelToken);
|
||||
execution = await updateExecutionWithDatasourceExistsIfNeeded(execution, cancelToken);
|
||||
} catch (e) {
|
||||
throw new Error(`Reattaching to query failed due to: ${e.message}`);
|
||||
}
|
||||
|
||||
if (execution.isFullyComplete()) return execution;
|
||||
|
||||
|
@ -45,4 +45,8 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
.default-aggregated {
|
||||
padding: 10px 5px;
|
||||
}
|
||||
}
|
||||
|
@ -151,6 +151,7 @@ export const SqlDataLoaderView = React.memo(function SqlDataLoaderView(
|
||||
{ inputSource, inputFormat, signature },
|
||||
isArrays,
|
||||
timeExpression,
|
||||
undefined,
|
||||
),
|
||||
).toString(),
|
||||
queryContext: {
|
||||
@ -167,6 +168,7 @@ export const SqlDataLoaderView = React.memo(function SqlDataLoaderView(
|
||||
{ inputSource, inputFormat, signature },
|
||||
isArrays,
|
||||
timeExpression,
|
||||
undefined,
|
||||
),
|
||||
).toString(),
|
||||
});
|
||||
|
@ -20,7 +20,7 @@ import { Classes, Dialog } from '@blueprintjs/core';
|
||||
import { SqlExpression } from 'druid-query-toolkit';
|
||||
import React, { useState } from 'react';
|
||||
|
||||
import { ExternalConfig } from '../../../druid-models';
|
||||
import { ExternalConfig, InputFormat, InputSource } from '../../../druid-models';
|
||||
import { InputFormatStep } from '../input-format-step/input-format-step';
|
||||
import { InputSourceStep } from '../input-source-step/input-source-step';
|
||||
|
||||
@ -32,20 +32,27 @@ export interface ConnectExternalDataDialogProps {
|
||||
config: ExternalConfig,
|
||||
isArrays: boolean[],
|
||||
timeExpression: SqlExpression | undefined,
|
||||
partitionedByHint: string | undefined,
|
||||
): void;
|
||||
onClose(): void;
|
||||
}
|
||||
|
||||
interface ExternalConfigStep {
|
||||
inputSource?: InputSource;
|
||||
inputFormat?: InputFormat;
|
||||
partitionedByHint?: string;
|
||||
}
|
||||
|
||||
export const ConnectExternalDataDialog = React.memo(function ConnectExternalDataDialog(
|
||||
props: ConnectExternalDataDialogProps,
|
||||
) {
|
||||
const { initExternalConfig, onClose, onSetExternalConfig } = props;
|
||||
|
||||
const [externalConfigStep, setExternalConfigStep] = useState<Partial<ExternalConfig>>(
|
||||
const [externalConfigStep, setExternalConfigStep] = useState<ExternalConfigStep>(
|
||||
initExternalConfig || {},
|
||||
);
|
||||
|
||||
const { inputSource, inputFormat } = externalConfigStep;
|
||||
const { inputSource, inputFormat, partitionedByHint } = externalConfigStep;
|
||||
|
||||
return (
|
||||
<Dialog
|
||||
@ -65,6 +72,7 @@ export const ConnectExternalDataDialog = React.memo(function ConnectExternalData
|
||||
{ inputSource, inputFormat, signature },
|
||||
isArrays,
|
||||
timeExpression,
|
||||
partitionedByHint,
|
||||
);
|
||||
onClose();
|
||||
}}
|
||||
@ -76,8 +84,8 @@ export const ConnectExternalDataDialog = React.memo(function ConnectExternalData
|
||||
<InputSourceStep
|
||||
initInputSource={inputSource}
|
||||
mode="sampler"
|
||||
onSet={(inputSource, inputFormat) => {
|
||||
setExternalConfigStep({ inputSource, inputFormat });
|
||||
onSet={(inputSource, inputFormat, partitionedByHint) => {
|
||||
setExternalConfigStep({ inputSource, inputFormat, partitionedByHint });
|
||||
}}
|
||||
/>
|
||||
)}
|
||||
|
@ -23,6 +23,7 @@ export interface ExampleInput {
|
||||
description: string;
|
||||
inputSource: InputSource;
|
||||
inputFormat?: InputFormat;
|
||||
partitionedByHint?: string;
|
||||
}
|
||||
|
||||
const TRIPS_INPUT_FORMAT: InputFormat = {
|
||||
@ -122,6 +123,7 @@ export const EXAMPLE_INPUTS: ExampleInput[] = [
|
||||
],
|
||||
},
|
||||
inputFormat: TRIPS_INPUT_FORMAT,
|
||||
partitionedByHint: 'month',
|
||||
},
|
||||
{
|
||||
name: 'NYC Taxi cabs (all files)',
|
||||
@ -206,6 +208,7 @@ export const EXAMPLE_INPUTS: ExampleInput[] = [
|
||||
],
|
||||
},
|
||||
inputFormat: TRIPS_INPUT_FORMAT,
|
||||
partitionedByHint: 'month',
|
||||
},
|
||||
{
|
||||
name: 'FlightCarrierOnTime (1 month)',
|
||||
|
@ -71,7 +71,11 @@ const ROWS_TO_SAMPLE = 50;
|
||||
export interface InputSourceStepProps {
|
||||
initInputSource: Partial<InputSource> | undefined;
|
||||
mode: 'sampler' | 'msq';
|
||||
onSet(inputSource: InputSource, inputFormat: InputFormat): void;
|
||||
onSet(
|
||||
inputSource: InputSource,
|
||||
inputFormat: InputFormat,
|
||||
partitionedByHint: string | undefined,
|
||||
): void;
|
||||
}
|
||||
|
||||
export const InputSourceStep = React.memo(function InputSourceStep(props: InputSourceStepProps) {
|
||||
@ -169,7 +173,11 @@ export const InputSourceStep = React.memo(function InputSourceStep(props: InputS
|
||||
useEffect(() => {
|
||||
const guessedInputFormat = guessedInputFormatState.data;
|
||||
if (!guessedInputFormat) return;
|
||||
onSet(exampleInput?.inputSource || (inputSource as any), guessedInputFormat);
|
||||
onSet(
|
||||
exampleInput?.inputSource || (inputSource as any),
|
||||
guessedInputFormat,
|
||||
exampleInput?.partitionedByHint,
|
||||
);
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [guessedInputFormatState]);
|
||||
|
||||
|
@ -324,9 +324,14 @@ export class WorkbenchView extends React.PureComponent<WorkbenchViewProps, Workb
|
||||
|
||||
return (
|
||||
<ConnectExternalDataDialog
|
||||
onSetExternalConfig={(externalConfig, isArrays, timeExpression) => {
|
||||
onSetExternalConfig={(externalConfig, isArrays, timeExpression, partitionedByHint) => {
|
||||
this.handleNewTab(
|
||||
WorkbenchQuery.fromInitExternalConfig(externalConfig, isArrays, timeExpression),
|
||||
WorkbenchQuery.fromInitExternalConfig(
|
||||
externalConfig,
|
||||
isArrays,
|
||||
timeExpression,
|
||||
partitionedByHint,
|
||||
),
|
||||
'Ext ' + guessDataSourceNameFromInputSource(externalConfig.inputSource),
|
||||
);
|
||||
}}
|
||||
|
Loading…
x
Reference in New Issue
Block a user