druid/web-console/e2e-tests/reindexing.spec.ts

184 lines
6.3 KiB
TypeScript
Raw Normal View History

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import path from 'path';
import * as playwright from 'playwright-chromium';
import { DatasourcesOverview } from './component/datasources/overview';
import { IngestionOverview } from './component/ingestion/overview';
import { ConfigureSchemaConfig } from './component/load-data/config/configure-schema';
import {
PartitionConfig,
RangePartitionsSpec,
SegmentGranularity,
} from './component/load-data/config/partition';
import { PublishConfig } from './component/load-data/config/publish';
import { ReindexDataConnector } from './component/load-data/data-connector/reindex';
import { DataLoader } from './component/load-data/data-loader';
import { saveScreenshotIfError } from './util/debug';
import {
DRUID_EXAMPLES_QUICKSTART_TUTORIAL_DIR,
runIndexTask,
UNIFIED_CONSOLE_URL,
} from './util/druid';
import { createBrowser, createPage } from './util/playwright';
import { retryIfJestAssertionError } from './util/retry';
import { waitTillWebConsoleReady } from './util/setup';
// Raise the Jest timeout to five minutes (the value is in milliseconds).
jest.setTimeout(5 * 60 * 1000);

/**
 * End-to-end suite that loads a datasource, reindexes it through the web
 * console data loader using range partitioning, and checks the resulting
 * task and datasource status.
 */
describe('Reindexing from Druid', () => {
  let browser: playwright.Browser;
  let page: playwright.Page;

  // One browser for the whole suite, created once the console reports ready.
  beforeAll(async () => {
    await waitTillWebConsoleReady();
    browser = await createBrowser();
  });

  // A fresh page for every test.
  beforeEach(async () => {
    page = await createPage(browser);
  });

  afterAll(async () => {
    await browser.close();
  });

  it('Reindex datasource from dynamic to range partitions', async () => {
    const testName = 'reindex-dynamic-to-range-';
    // The timestamp suffix gives each run its own datasource name.
    const datasourceName = `${testName}${new Date().toISOString()}`;

    const connector = new ReindexDataConnector(page, {
      datasourceName,
      interval: '2015-09-12/2015-09-13',
    });
    const configureSchemaConfig = new ConfigureSchemaConfig({ rollup: false });
    const rangePartitionsSpec = new RangePartitionsSpec({
      partitionDimensions: ['channel'],
      targetRowsPerSegment: 10_000,
      maxRowsPerSegment: null,
    });
    const partitionConfig = new PartitionConfig({
      segmentGranularity: SegmentGranularity.DAY,
      timeIntervals: null,
      partitionsSpec: rangePartitionsSpec,
    });
    const publishConfig = new PublishConfig({ datasourceName });

    const loader = new DataLoader({
      page,
      unifiedConsoleUrl: UNIFIED_CONSOLE_URL,
      connector,
      connectValidator: validateConnectLocalData,
      configureSchemaConfig,
      partitionConfig,
      publishConfig,
    });

    // Seed the datasource that the reindex step will read from.
    loadInitialData(datasourceName);

    await saveScreenshotIfError(testName, page, async () => {
      // The initial load is expected to produce a single segment.
      const segmentsBeforeReindex = 1;
      await validateDatasourceStatus(page, datasourceName, segmentsBeforeReindex);

      await loader.load();
      await validateTaskStatus(page, datasourceName);

      // 39k rows into segments of ~10k rows
      const segmentsAfterReindex = 4;
      await validateDatasourceStatus(page, datasourceName, segmentsAfterReindex);
    });
  });
});
/**
 * Runs an index task from the quickstart tutorial's `wikipedia-index.json`
 * spec, passing a sed substitution that replaces `wikipedia` with the given
 * datasource name.
 *
 * @param datasourceName name substituted for `wikipedia` in the spec
 */
function loadInitialData(datasourceName: string) {
  const specPath = path.join(DRUID_EXAMPLES_QUICKSTART_TUTORIAL_DIR, 'wikipedia-index.json');
  runIndexTask(specPath, [`s/wikipedia/${datasourceName}/`]);
}
/**
 * Validates the data preview shown on the data loader's connect step.
 *
 * The preview is split on newlines; it must contain exactly 500 lines, and
 * its first and last lines must equal the expected rows spelled out below.
 *
 * @param preview raw preview text, one row per line
 */
function validateConnectLocalData(preview: string) {
  const lines = preview.split('\n');
  expect(lines.length).toBe(500);

  // Each expected row is written one field per fragment so that a mismatch
  // is easy to locate in a diff.
  const firstLine = lines[0];
  expect(firstLine).toBe(
    'Druid row: {' +
      '"__time":1442018818771' +
      ',"channel":"#en.wikipedia"' +
      ',"comment":"added project"' +
      ',"isAnonymous":"false"' +
      ',"isMinor":"false"' +
      ',"isNew":"false"' +
      ',"isRobot":"false"' +
      ',"isUnpatrolled":"false"' +
      ',"namespace":"Talk"' +
      ',"page":"Talk:Oswald Tilghman"' +
      ',"user":"GELongstreet"' +
      ',"added":36' +
      ',"deleted":0' +
      ',"delta":36' +
      '}',
  );

  const lastLine = lines[lines.length - 1];
  expect(lastLine).toBe(
    'Druid row: {' +
      '"__time":1442020314823' +
      ',"channel":"#en.wikipedia"' +
      ',"comment":"/* History */[[WP:AWB/T|Typo fixing]], [[WP:AWB/T|typo(s) fixed]]: nothern → northern using [[Project:AWB|AWB]]"' +
      ',"isAnonymous":"false"' +
      ',"isMinor":"true"' +
      ',"isNew":"false"' +
      ',"isRobot":"false"' +
      ',"isUnpatrolled":"false"' +
      ',"namespace":"Main"' +
      ',"page":"Hapoel Katamon Jerusalem F.C."' +
      ',"user":"The Quixotic Potato"' +
      ',"added":1' +
      ',"deleted":0' +
      ',"delta":1' +
      '}',
  );
}
/**
 * Asserts that the ingestion view lists a task for the given datasource whose
 * status matches `SUCCESS`.
 *
 * The assertions are passed to `retryIfJestAssertionError` (presumably so
 * they are re-run while the task is still in progress; confirm against that
 * helper).
 *
 * @param page page used to drive the console
 * @param datasourceName datasource whose task is looked up
 */
async function validateTaskStatus(page: playwright.Page, datasourceName: string) {
  const overview = new IngestionOverview(page, UNIFIED_CONSOLE_URL);

  const assertTaskSucceeded = async () => {
    const allTasks = await overview.getTasks();
    const matchingTask = allTasks.find(({ datasource }) => datasource === datasourceName);
    expect(matchingTask).toBeDefined();
    expect(matchingTask!.status).toMatch('SUCCESS');
  };

  await retryIfJestAssertionError(assertTaskSucceeded);
}
/**
 * Asserts that the datasources view lists the given datasource as fully
 * available with the expected segment count and a total of 39244 rows.
 *
 * The assertions are passed to `retryIfJestAssertionError` (presumably so
 * they are re-run until the datasource reaches the expected state; confirm
 * against that helper).
 *
 * @param page page used to drive the console
 * @param datasourceName datasource to look up by name
 * @param expectedNumSegment segment count expected in the availability text
 */
async function validateDatasourceStatus(
  page: playwright.Page,
  datasourceName: string,
  expectedNumSegment: number,
) {
  const overview = new DatasourcesOverview(page, UNIFIED_CONSOLE_URL);

  // "1 segment" is singular; every other count is pluralized.
  const pluralSuffix = expectedNumSegment === 1 ? '' : 's';
  const expectedAvailability = `Fully available (${expectedNumSegment} segment${pluralSuffix})`;

  const assertDatasourceReady = async () => {
    const allDatasources = await overview.getDatasources();
    const match = allDatasources.find(({ name }) => name === datasourceName);
    expect(match).toBeDefined();
    expect(match!.availability).toMatch(expectedAvailability);
    expect(match!.totalRows).toBe(39244);
  };

  await retryIfJestAssertionError(assertDatasourceReady);
}