/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import path from 'path';

import type * as playwright from 'playwright-chromium';

import { DatasourcesOverview } from './component/datasources/overview';
import { TasksOverview } from './component/ingestion/overview';
import { ConfigureSchemaConfig } from './component/load-data/config/configure-schema';
import {
  PartitionConfig,
  RangePartitionsSpec,
  SegmentGranularity,
} from './component/load-data/config/partition';
import { PublishConfig } from './component/load-data/config/publish';
import { ReindexDataConnector } from './component/load-data/data-connector/reindex';
import { DataLoader } from './component/load-data/data-loader';
import { saveScreenshotIfError } from './util/debug';
import {
  DRUID_EXAMPLES_QUICKSTART_TUTORIAL_DIR,
  runIndexTask,
  UNIFIED_CONSOLE_URL,
} from './util/druid';
import { createBrowser, createPage } from './util/playwright';
import { retryIfJestAssertionError } from './util/retry';
import { waitTillWebConsoleReady } from './util/setup';

jest.setTimeout(5 * 60 * 1000);

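// One browser is shared across the suite; each test runs in a fresh page.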
describe('Reindexing from Druid', () => {
  let browser: playwright.Browser;
  let page: playwright.Page;

  beforeAll(async () => {
    await waitTillWebConsoleReady();
    browser = await createBrowser();
  });

  beforeEach(async () => {
    page = await createPage(browser);
  });

  afterAll(async () => {
    await browser.close();
  });

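  // Loads the wikipedia quickstart data with dynamic partitioning, reindexes it
  // through the console data loader into range partitions on "channel", and
  // checks the segment count before and after.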
  it('Reindex datasource from dynamic to range partitions', async () => {
    const testName = 'reindex-dynamic-to-range-';
    const datasourceName = testName + new Date().toISOString();
    const interval = '2015-09-12/2015-09-13';
    const dataConnector = new ReindexDataConnector(page, {
      datasourceName,
      interval,
    });
    const configureSchemaConfig = new ConfigureSchemaConfig({ rollup: false });
    const partitionConfig = new PartitionConfig({
      segmentGranularity: SegmentGranularity.DAY,
      timeIntervals: null,
      partitionsSpec: new RangePartitionsSpec({
        partitionDimensions: ['channel'],
        targetRowsPerSegment: 10_000,
        maxRowsPerSegment: null,
      }),
    });
    const publishConfig = new PublishConfig({ datasourceName: datasourceName });

    const dataLoader = new DataLoader({
      page: page,
      unifiedConsoleUrl: UNIFIED_CONSOLE_URL,
      connector: dataConnector,
      connectValidator: validateConnectLocalData,
      configureSchemaConfig: configureSchemaConfig,
      partitionConfig: partitionConfig,
      publishConfig: publishConfig,
    });

    loadInitialData(datasourceName);

    await saveScreenshotIfError(testName, page, async () => {
      const numInitialSegment = 1;
      await validateDatasourceStatus(page, datasourceName, numInitialSegment);

      await dataLoader.load();
      await validateTaskStatus(page, datasourceName);

      const numReindexedSegment = 4; // 39k rows into segments of ~10k rows
      await validateDatasourceStatus(page, datasourceName, numReindexedSegment);
    });
  });
});

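// Ingests the wikipedia quickstart data under the given datasource name by
// running the tutorial ingestion spec with the name rewritten via sed.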
function loadInitialData(datasourceName: string) {
  const ingestionSpec = path.join(DRUID_EXAMPLES_QUICKSTART_TUTORIAL_DIR, 'wikipedia-index.json');
  const setDatasourceName = `s/wikipedia/${datasourceName}/`;
  const sedCommands = [setDatasourceName];
  runIndexTask(ingestionSpec, sedCommands);
}

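// Validates the data preview shown on the data loader's connect step: the
// sample should contain 500 rows with known first and last rows.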
function validateConnectLocalData(preview: string) {
  const lines = preview.split('\n');
  expect(lines.length).toBe(500);
  const firstLine = lines[0];
  expect(firstLine).toBe(
    '[Druid row: {' +
      '"__time":1442018818771' +
      ',"channel":"#en.wikipedia"' +
      ',"comment":"added project"' +
      ',"isAnonymous":"false"' +
      ',"isMinor":"false"' +
      ',"isNew":"false"' +
      ',"isRobot":"false"' +
      ',"isUnpatrolled":"false"' +
      ',"namespace":"Talk"' +
      ',"page":"Talk:Oswald Tilghman"' +
      ',"user":"GELongstreet"' +
      ',"added":36' +
      ',"deleted":0' +
      ',"delta":36' +
      '}]',
  );
  const lastLine = lines[lines.length - 1];
  expect(lastLine).toBe(
    '[Druid row: {' +
      '"__time":1442020314823' +
      ',"channel":"#en.wikipedia"' +
      ',"comment":"/* History */[[WP:AWB/T|Typo fixing]], [[WP:AWB/T|typo(s) fixed]]: nothern → northern using [[Project:AWB|AWB]]"' +
      ',"isAnonymous":"false"' +
      ',"isMinor":"true"' +
      ',"isNew":"false"' +
      ',"isRobot":"false"' +
      ',"isUnpatrolled":"false"' +
      ',"namespace":"Main"' +
      ',"page":"Hapoel Katamon Jerusalem F.C."' +
      ',"user":"The Quixotic Potato"' +
      ',"added":1' +
      ',"deleted":0' +
      ',"delta":1' +
      '}]',
  );
}

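// Polls the ingestion overview until the task for the datasource reports SUCCESS.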
async function validateTaskStatus(page: playwright.Page, datasourceName: string) {
  const tasksOverview = new TasksOverview(page, UNIFIED_CONSOLE_URL);

  await retryIfJestAssertionError(async () => {
    const tasks = await tasksOverview.getTasks();
    const task = tasks.find(t => t.datasource === datasourceName);
    expect(task).toBeDefined();
    expect(task!.status).toMatch('SUCCESS');
  });
}

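// Polls the datasources overview until the datasource is fully available with
// the expected segment count and the full 39,244-row wikipedia dataset.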
async function validateDatasourceStatus(
  page: playwright.Page,
  datasourceName: string,
  expectedNumSegment: number,
) {
  const datasourcesOverview = new DatasourcesOverview(page, UNIFIED_CONSOLE_URL);
  const numSegmentString = `${expectedNumSegment} segment` + (expectedNumSegment !== 1 ? 's' : '');

  await retryIfJestAssertionError(async () => {
    const datasources = await datasourcesOverview.getDatasources();
    const datasource = datasources.find(t => t.name === datasourceName);
    expect(datasource).toBeDefined();
    expect(datasource!.availability).toMatch(`Fully available (${numSegmentString})`);
    expect(datasource!.totalRows).toBe(39244);
  });
}