Merge remote-tracking branch 'upstream/master' into vectorize_earliest_num

Soumyava Das 2023-08-07 09:51:42 -07:00
commit ccfd600c7a
3789 changed files with 1416481 additions and 18993 deletions

.github/scripts/setup_test_profiling_env.sh (vendored executable file, 44 lines added)
View File

@ -0,0 +1,44 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
set -e
JAR_INPUT_FILE="jfr-profiler-1.0.0.jar"
JAR_OUTPUT_FILE="jfr-profiler.jar"
ENV_VAR="JFR_PROFILER_ARG_LINE"
if [ "$#" -ne 5 ]; then
echo "usage: $0 <jdk_version> <run_id> <run_number> <run_attempt> <module>"
fi
if [[ "$1" == "17" ]];
then
curl https://static.imply.io/cp/$JAR_INPUT_FILE -s -o $JAR_OUTPUT_FILE
echo $ENV_VAR=-javaagent:"$PWD"/$JAR_OUTPUT_FILE \
-Djfr.profiler.http.username=druid-ci \
-Djfr.profiler.http.password=w3Fb6PW8LIo849mViEkbgA== \
-Djfr.profiler.tags.project=druid \
-Djfr.profiler.tags.run_id=$2 \
-Djfr.profiler.tags.run_number=$3 \
-Djfr.profiler.tags.run_attempt=$4 \
-Djfr.profiler.tags.module=$5
else
echo $ENV_VAR=\"\"
fi
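
In CI, this script's stdout (a single `JFR_PROFILER_ARG_LINE=...` assignment) is appended to `$GITHUB_ENV`, as the workflow change further below shows. A minimal sketch of that invocation, using the standard GitHub Actions run variables and a placeholder module name:

```shell
# Hedged sketch of how the script is consumed: its output is one
# JFR_PROFILER_ARG_LINE=... line appended to the GitHub Actions environment file.
# The run metadata variables come from GitHub Actions; "server" is a placeholder module name.
./.github/scripts/setup_test_profiling_env.sh 17 "$GITHUB_RUN_ID" "$GITHUB_RUN_NUMBER" "$GITHUB_RUN_ATTEMPT" server >> "$GITHUB_ENV"
```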

View File

@ -21,7 +21,8 @@ unset _JAVA_OPTIONS
# Set MAVEN_OPTS for Surefire launcher.
MAVEN_OPTS='-Xmx2500m' ${MVN} test -pl ${MAVEN_PROJECTS} \
${MAVEN_SKIP} -Ddruid.generic.useDefaultValueForNull=${DRUID_USE_DEFAULT_VALUE_FOR_NULL}
${MAVEN_SKIP} -Ddruid.generic.useDefaultValueForNull=${DRUID_USE_DEFAULT_VALUE_FOR_NULL} \
-DjfrProfilerArgLine="${JFR_PROFILER_ARG_LINE}"
sh -c "dmesg | egrep -i '(oom|out of memory|kill process|killed).*' -C 1 || exit 0"
free -m
${MVN} -pl ${MAVEN_PROJECTS} jacoco:report || { echo "coverage_failure=false" >> "$GITHUB_ENV" && false; }

View File

@ -27,7 +27,7 @@ on:
jobs:
build:
if: github.event_name == 'schedule'
if: (github.event_name == 'schedule' && github.repository == 'apache/druid')
name: build (jdk8)
runs-on: ubuntu-latest
steps:
@ -107,10 +107,12 @@ jobs:
group: other
security_vulnerabilities:
if: github.repository == 'apache/druid'
name: security vulnerabilities
strategy:
fail-fast: false
matrix:
HADOOP_PROFILE: [ '', '-Phadoop3' ]
HADOOP_PROFILE: [ '', '-Phadoop2' ]
runs-on: ubuntu-latest
steps:
- name: Checkout branch
@ -123,9 +125,11 @@ jobs:
distribution: 'zulu'
cache: maven
- name: maven build # needed to rebuild in case maven snapshot resolution fails
run: mvn clean install dependency:go-offline -P dist -P skip-static-checks,skip-tests -Dmaven.javadoc.skip=true -Dcyclonedx.skip=true -Dweb.console.skip=true
- name: security vulnerabilities check
env:
MVN: mvn --no-snapshot-updates
HADOOP_PROFILE: ${{ matrix.HADOOP_PROFILE }}
run: |
mvn dependency-check:purge dependency-check:check ${HADOOP_PROFILE} || { echo "

View File

@ -92,6 +92,9 @@ jobs:
if: ${{ failure() && steps.run-it.conclusion == 'failure' }}
run: |
for v in broker router ${{ inputs.use_indexer }} historical coordinator overlord; do
echo "------------------------druid-"$v"-------------------------";
sudo docker exec druid-"$v" tail -1000 /shared/logs/"$v".log;
echo "=======================druid-"$v"========================";
echo "-----------------------docker logs-----------------------";
sudo docker logs druid-"$v" 2>&1 | tail -1000 ||:;
echo "-----------------------service logs----------------------";
sudo docker exec druid-"$v" tail -1000 /shared/logs/"$v".log 2>&1 ||:;
done

View File

@ -86,6 +86,11 @@ jobs:
echo "DRUID_USE_DEFAULT_VALUE_FOR_NULL=true" >> $GITHUB_ENV
fi
- name: test profiling
run: |
./.github/scripts/setup_test_profiling_env.sh ${{ inputs.jdk }} ${{ github.run_id }} \
${{ github.run_number }} ${{ github.run_attempt }} ${{ inputs.module }} >> $GITHUB_ENV
- name: fetch base branch for test coverage
if: ${{ github.base_ref != '' }}
run: |

View File

@ -51,47 +51,8 @@ jobs:
days-before-close: 28
stale-issue-label: stale
stale-pr-label: stale
exempt-issue-labels: |
Security
Bug
Proposal
Design Review
Improvement
Performance
Refactoring
Apache
Area - Automation/Static Analysis
Area - Batch Indexing
Area - Cache
Area - Deep Storage
Area - Dependencies
Area - Dependency Injection
Area - Dev
Area - Documentation
Area - Extension
Area - Kafka/Kinesis Indexing
Area - Lookups
Area - Metadata
Area - Metrics/Event Emitting
Area - Null Handling
Area - Operations
Area - Query UI
Area - Querying
Area - Router
Area - Segment Balancing/Coordination
Area - Segment Format and Ser/De
Area - SQL
Area - Testing
Area - Web Console
Area - Zookeeper/Curator
Compatibility
Contributions Welcome
Development Blocker
Ease of Use
Error handling
HTTP
Incompatible
Stable API
exempt-issue-labels: 'Evergreen,Security,Bug,Proposal,Design Review,Improvement,Performance,Refactoring,Apache,Area - Automation/Static Analysis,Area - Batch Indexing,Area - Cache,Area - Deep Storage,Area - Dependencies,Area - Dependency Injection,Area - Dev,Area - Documentation,Area - Extension,Area - Kafka/Kinesis Indexing,Area - Lookups,Area - Metadata,Area - Metrics/Event Emitting,Area - Null Handling,Area - Operations,Area - Query UI,Area - Querying,Area - Router,Area - Segment Balancing/Coordination,Area - Segment Format and Ser/De,Area - SQL,Area - Testing,Area - Web Console,Area - Zookeeper/Curator,Compatibility,Contributions Welcome,Development Blocker,Ease of Use,Error handling,HTTP,Incompatible,Stable API'
exempt-pr-labels: 'Evergreen'
exempt-milestones: true
exempt-assignees: true
ascending: true

View File

@ -93,11 +93,11 @@ jobs:
strategy:
fail-fast: false
matrix:
jdk: [8, 11]
jdk: [8, 17]
uses: ./.github/workflows/reusable-standard-its.yml
if: ${{ needs.changes.outputs.core == 'true' || needs.changes.outputs.common-extensions == 'true' }}
with:
build_jdk: 8
build_jdk: ${{ matrix.jdk }}
runtime_jdk: ${{ matrix.jdk }}
testing_groups: -Dgroups=query
use_indexer: middleManager
@ -177,7 +177,7 @@ jobs:
run: |
for v in broker middlemanager router coordinator historical ; do
echo "------------------------druid-tiny-cluster-"$v"s-0-------------------------";
sudo /usr/local/bin/kubectl logs --tail 1000 druid-tiny-cluster-"$v"s-0;
sudo /usr/local/bin/kubectl logs --tail 1000 druid-tiny-cluster-"$v"s-0 ||:;
done
integration-other-tests:

View File

@ -69,9 +69,9 @@ jobs:
if: ${{ matrix.java == 'jdk8' }}
run: ./check_test_suite_test.py
- name: (openjdk11) strict compilation
if: ${{ matrix.java == 'jdk11' }}
# errorprone requires JDK 11
- name: (openjdk17) strict compilation
if: ${{ matrix.java == 'jdk17' }}
# errorprone requires JDK 11+
# Strict compilation requires more than 2 GB
run: ${MVN} clean -DstrictCompile compile test-compile --fail-at-end ${MAVEN_SKIP} ${MAVEN_SKIP_TESTS}
@ -160,9 +160,9 @@ jobs:
- name: checkout branch
uses: actions/checkout@v3
- name: setup JDK11
- name: setup JDK17
run: |
echo "JAVA_HOME=$JAVA_HOME_11_X64" >> $GITHUB_ENV
echo "JAVA_HOME=$JAVA_HOME_17_X64" >> $GITHUB_ENV
- name: setup node
uses: actions/setup-node@v3

View File

@ -285,6 +285,10 @@ SOURCE/JAVA-CORE
This product contains test cases adapted from Test Framework for Apache Drill (https://github.com/apache/drill-test-framework).
* sql/src/test/resources/drill/window
This product contains the class copied from https://github.com/FasterXML/jackson-databind
* extensions-core/s3-extensions/src/main/java/com/fasterxml/jackson/databind/PropertyNamingStrategies.java
MIT License
================================

View File

@ -17,8 +17,8 @@
~ under the License.
-->
[![Coverage Status](https://img.shields.io/codecov/c/gh/apache/druid)](https://codecov.io/gh/apache/druid)
[![Docker](https://img.shields.io/badge/container-docker-blue.svg)](https://hub.docker.com/r/apache/druid)
[![Coverage Status](https://img.shields.io/codecov/c/gh/apache/druid?logo=codecov)](https://codecov.io/gh/apache/druid)
[![Docker](https://img.shields.io/badge/container-docker-blue.svg?logo=docker)](https://hub.docker.com/r/apache/druid)
[![Helm](https://img.shields.io/badge/helm-druid-5F90AB?logo=helm)](https://github.com/apache/druid/blob/master/helm/druid/README.md)
<!--- Following badges are disabled until they can be fixed: -->
<!--- [![Inspections Status](https://img.shields.io/teamcity/http/teamcity.jetbrains.com/s/OpenSourceProjects_Druid_Inspections.svg?label=TeamCity%20inspections)](https://teamcity.jetbrains.com/viewType.html?buildTypeId=OpenSourceProjects_Druid_Inspections) -->

View File

@ -27,7 +27,7 @@
<parent>
<groupId>org.apache.druid</groupId>
<artifactId>druid</artifactId>
<version>27.0.0-SNAPSHOT</version>
<version>28.0.0-SNAPSHOT</version>
</parent>
<dependencies>

View File

@ -26,12 +26,12 @@ import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.collections.bitmap.RoaringBitmapFactory;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.segment.column.BitmapColumnIndex;
import org.apache.druid.segment.column.IndexedUtf8ValueSetIndex;
import org.apache.druid.segment.column.StringValueSetIndex;
import org.apache.druid.segment.data.BitmapSerdeFactory;
import org.apache.druid.segment.data.GenericIndexed;
import org.apache.druid.segment.data.RoaringBitmapSerdeFactory;
import org.apache.druid.segment.index.BitmapColumnIndex;
import org.apache.druid.segment.index.IndexedUtf8ValueIndexes;
import org.apache.druid.segment.index.semantic.StringValueSetIndexes;
import org.apache.druid.segment.serde.StringUtf8ColumnIndexSupplier;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
@ -72,7 +72,7 @@ public class DictionaryEncodedStringIndexSupplierBenchmark
public static class BenchmarkState
{
@Nullable
private IndexedUtf8ValueSetIndex<?> stringValueSetIndex;
private IndexedUtf8ValueIndexes<?> stringValueSetIndex;
private final TreeSet<ByteBuffer> values = new TreeSet<>();
private static final int START_INT = 10_000_000;
@ -112,7 +112,7 @@ public class DictionaryEncodedStringIndexSupplierBenchmark
);
StringUtf8ColumnIndexSupplier<?> indexSupplier =
new StringUtf8ColumnIndexSupplier<>(bitmapFactory, dictionaryUtf8::singleThreaded, bitmaps, null);
stringValueSetIndex = (IndexedUtf8ValueSetIndex<?>) indexSupplier.as(StringValueSetIndex.class);
stringValueSetIndex = (IndexedUtf8ValueIndexes<?>) indexSupplier.as(StringValueSetIndexes.class);
List<Integer> filterValues = new ArrayList<>();
List<Integer> nonFilterValues = new ArrayList<>();
for (int i = 0; i < dictionarySize; i++) {

View File

@ -88,10 +88,10 @@ public class FrontCodedIndexedBenchmark
@Param({
"generic",
"front-coded-4",
"front-coded-16",
"front-coded-incremental-buckets-4",
"front-coded-incremental-buckets-16"
"front-coded-v0-4",
"front-coded-v0-16",
"front-coded-v1-4",
"front-coded-v1-16"
})
public String indexType;
@ -138,7 +138,7 @@ public class FrontCodedIndexedBenchmark
FrontCodedIndexedWriter frontCodedIndexedWriter = new FrontCodedIndexedWriter(
new OnHeapMemorySegmentWriteOutMedium(),
ByteOrder.nativeOrder(),
"front-coded-4".equals(indexType) ? 4 : 16,
"front-coded-v0-4".equals(indexType) ? 4 : 16,
FrontCodedIndexed.V0
);
frontCodedIndexedWriter.open();
@ -146,7 +146,7 @@ public class FrontCodedIndexedBenchmark
FrontCodedIndexedWriter frontCodedIndexedWriterIncrementalBuckets = new FrontCodedIndexedWriter(
new OnHeapMemorySegmentWriteOutMedium(),
ByteOrder.nativeOrder(),
"front-coded-incremental-buckets-4".equals(indexType) ? 4 : 16,
"front-coded-v1-4".equals(indexType) ? 4 : 16,
FrontCodedIndexed.V1
);
frontCodedIndexedWriterIncrementalBuckets.open();
@ -166,11 +166,11 @@ public class FrontCodedIndexedBenchmark
fileGeneric = File.createTempFile("genericIndexedBenchmark", "meta");
smooshDirFrontCodedIncrementalBuckets = FileUtils.createTempDir();
fileFrontCodedIncrementalBuckets = File.createTempFile("frontCodedIndexedBenchmarkIncrementalBuckets", "meta");
fileFrontCodedIncrementalBuckets = File.createTempFile("frontCodedIndexedBenchmarkv1Buckets", "meta");
EncodingSizeProfiler.encodedSize = (int) ("generic".equals(indexType)
? genericIndexedWriter.getSerializedSize()
: indexType.startsWith("front-coded-incremental-buckets")
: indexType.startsWith("front-coded-v1")
? frontCodedIndexedWriterIncrementalBuckets.getSerializedSize()
: frontCodedIndexedWriter.getSerializedSize());
try (
@ -286,7 +286,7 @@ public class FrontCodedIndexedBenchmark
}
if ("generic".equals(indexType)) {
indexed = genericIndexed.singleThreaded();
} else if (indexType.startsWith("front-coded-incremental-buckets")) {
} else if (indexType.startsWith("front-coded-v1")) {
indexed = frontCodedIndexedIncrementalBuckets;
} else {
indexed = frontCodedIndexed;

View File

@ -59,7 +59,7 @@ import org.apache.druid.query.DefaultQueryRunnerFactoryConglomerate;
import org.apache.druid.query.DruidProcessingConfig;
import org.apache.druid.query.Druids;
import org.apache.druid.query.FinalizeResultsQueryRunner;
import org.apache.druid.query.FluentQueryRunnerBuilder;
import org.apache.druid.query.FluentQueryRunner;
import org.apache.druid.query.Query;
import org.apache.druid.query.QueryContexts;
import org.apache.druid.query.QueryPlus;
@ -480,8 +480,11 @@ public class CachingClusteredClientBenchmark
private <T> List<T> runQuery()
{
//noinspection unchecked
QueryRunner<T> theRunner = new FluentQueryRunnerBuilder<T>(toolChestWarehouse.getToolChest(query))
.create(cachingClusteredClient.getQueryRunnerForIntervals(query, query.getIntervals()))
QueryRunner<T> theRunner = FluentQueryRunner
.create(
cachingClusteredClient.getQueryRunnerForIntervals(query, query.getIntervals()),
toolChestWarehouse.getToolChest(query)
)
.applyPreMergeDecoration()
.mergeResults()
.applyPostMergeDecoration();

View File

@ -27,7 +27,7 @@ import org.apache.druid.server.coordination.DruidServerMetadata;
import org.apache.druid.server.coordination.ServerType;
import org.apache.druid.server.coordinator.balancer.BalancerSegmentHolder;
import org.apache.druid.server.coordinator.balancer.ReservoirSegmentSampler;
import org.apache.druid.server.coordinator.loading.LoadQueuePeonTester;
import org.apache.druid.server.coordinator.loading.TestLoadQueuePeon;
import org.apache.druid.timeline.DataSegment;
import org.joda.time.Interval;
import org.openjdk.jmh.annotations.Benchmark;
@ -105,7 +105,7 @@ public class BalancerStrategyBenchmark
ImmutableMap.of("test", new ImmutableDruidDataSource("test", Collections.emptyMap(), segments)),
segments.size()
),
new LoadQueuePeonTester()
new TestLoadQueuePeon()
)
);
}

View File

@ -28,7 +28,7 @@
<parent>
<groupId>org.apache.druid</groupId>
<artifactId>druid</artifactId>
<version>27.0.0-SNAPSHOT</version>
<version>28.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@ -28,7 +28,7 @@
<parent>
<groupId>org.apache.druid</groupId>
<artifactId>druid</artifactId>
<version>27.0.0-SNAPSHOT</version>
<version>28.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@ -258,7 +258,7 @@ It is also the release managers responsibility for correctly assigning all PRs m
| [get-milestone-contributors](bin/get-milestone-contributors.py) | lists github users who contributed to a milestone |
| [get-milestone-prs](bin/get-milestone-prs.py) | lists PRs between tags or commits and the milestone associated with them. |
| [tag-missing-milestones](bin/tag-missing-milestones.py) | Find pull requests which the milestone is missing and tag them properly. |
| [find-missing-backports](bin/find-missing-backports.py) | Find PRs which have been back-ported to one release branch but not another. Useful if a bug fix release based on the previous release is required during a release cycle. |
| [find-missing-backports](bin/find-missing-backports.py) | Find PRs which have been back-ported to one release branch but not another. Useful if a bug fix release based on the previous release is required during a release cycle. Make sure to fetch remote commits before running this command. |
| [make-linkable-release-notes](bin/make-linkable-release-notes.py) | given input of a version, input markdown file path, and output markdown file path, will rewrite markdown headers of the input file to have embedded links in the release notes style. |

View File

@ -22,37 +22,34 @@ import subprocess
import sys
pr_number_pattern = r'\(#(\d+)\)'
backport_pattern = r'\[Backport[^\]]*\]'
def extract_pr_title_from_commit_message(commit_msg):
# Extract commit message except the pr number
commit_msg = re.sub(backport_pattern, '', commit_msg)
pr_num_pos = commit_msg.find("(#")
if pr_num_pos < 0:
pr_num_pos = len(commit_msg)
backport_pos = commit_msg.find("[Backport]")
if backport_pos < 0:
backport_pos = 0
else:
backport_pos = backport_pos + len("[Backport]")
return commit_msg[backport_pos:pr_num_pos].strip()
return commit_msg[:pr_num_pos].strip()
def extract_pr_numbers_from_commit_message(commit_msg):
extracted_numbers = re.findall(pr_number_pattern, commit_msg)
return extracted_numbers
def extract_pr_title(pr_json):
commit_url = pr_json['commits_url']
resp = requests.get(commit_url, auth=(github_username, os.environ["GIT_TOKEN"]))
title_candidates = [extract_pr_title_from_commit_message(pr_json['title'])]
if len(resp.json()) == 1:
title_candidates.append(extract_pr_title_from_commit_message(resp.json()[0]['commit']['message']))
return title_candidates
def find_missing_backports(pr_jsons, release_pr_subjects):
def find_missing_backports(pr_jsons, release_pr_subjects, release_pr_numbers):
for pr in pr_jsons:
if pr['milestone'] is not None:
if pr['milestone']['number'] == milestone_number:
for pr_title_candidate in extract_pr_title(pr):
if pr_title_candidate in release_pr_subjects:
return
print("Missing backport found for PR {}, url: {}".format(pr['number'], pr['html_url']))
backport_found = False
for label in pr['labels']:
if label['name'] == 'Backport':
backport_found = True
pr_title_candidate = extract_pr_title_from_commit_message(pr['title'])
if pr_title_candidate in release_pr_subjects:
backport_found = True
if str(pr['number']) in release_pr_numbers:
backport_found = True
if backport_found == False:
print("Missing backport found for PR {}, url: {}".format(pr['number'], pr['html_url']))
def find_next_url(links):
for link in links:
@ -95,15 +92,33 @@ command = "git log --pretty=tformat:%s {}..{}".format(previous_branch_first_comm
all_release_commits = subprocess.check_output(command, shell=True).decode('UTF-8')
release_pr_subjects = set()
release_pr_numbers = set()
for commit_msg in all_release_commits.splitlines():
title = extract_pr_title_from_commit_message(commit_msg)
pr_numbers = extract_pr_numbers_from_commit_message(commit_msg)
release_pr_subjects.add(title)
release_pr_numbers.update(pr_numbers)
print("Number of release PR subjects: {}".format(len(release_pr_subjects)))
# Get all closed PRs and filter out with milestone
next_url = "https://api.github.com/repos/apache/druid/pulls?state=closed"
while next_url is not None:
resp = requests.get(next_url, auth=(github_username, os.environ["GIT_TOKEN"]))
find_missing_backports(resp.json(), release_pr_subjects)
links = resp.headers['Link'].split(',')
next_url = find_next_url(links)
milestone_url = "https://api.github.com/repos/apache/druid/milestones/{}".format(milestone_number)
resp = requests.get(milestone_url, auth=(github_username, os.environ["GIT_TOKEN"])).json()
milestone_title = resp['title']
pr_items = []
page = 0
while True:
page = page + 1
pr_url = "https://api.github.com/search/issues?per_page=50&page={}&q=milestone:{}+type:pr+is:merged+is:closed+repo:apache/druid".format(page,milestone_title)
pr_resp = requests.get(pr_url, auth=(github_username, os.environ["GIT_TOKEN"])).json()
if pr_resp['incomplete_results']:
sys.stderr.write('This script cannot handle incomplete results')
sys.exit(1)
pr_items.extend(pr_resp['items'])
if len(pr_resp['items']) < 50:
print("Total PRs for current milestone: {}".format(len(pr_items)))
print("Total expected count: {}".format(pr_resp['total_count']))
if pr_resp['total_count'] != len(pr_items):
sys.stderr.write('Expected PR count does not match with number of PRs fetched')
sys.exit(1)
break
find_missing_backports(pr_items, release_pr_subjects, release_pr_numbers)

View File

@ -51,7 +51,7 @@ services:
- ZOO_MY_ID=1
coordinator:
image: apache/druid:27.0.0
image: apache/druid:28.0.0
container_name: coordinator
volumes:
- druid_shared:/opt/shared
@ -67,7 +67,7 @@ services:
- environment
broker:
image: apache/druid:27.0.0
image: apache/druid:28.0.0
container_name: broker
volumes:
- broker_var:/opt/druid/var
@ -83,7 +83,7 @@ services:
- environment
historical:
image: apache/druid:27.0.0
image: apache/druid:28.0.0
container_name: historical
volumes:
- druid_shared:/opt/shared
@ -100,7 +100,7 @@ services:
- environment
middlemanager:
image: apache/druid:27.0.0
image: apache/druid:28.0.0
container_name: middlemanager
volumes:
- druid_shared:/opt/shared
@ -118,7 +118,7 @@ services:
- environment
router:
image: apache/druid:27.0.0
image: apache/druid:28.0.0
container_name: router
volumes:
- router_var:/opt/druid/var

View File

@ -152,4 +152,4 @@ fi
# take the ${TASK_JSON} environment variable and base64 decode, unzip and throw it in ${TASK_DIR}/task.json
mkdir -p ${TASK_DIR}; echo ${TASK_JSON} | base64 -d | gzip -d > ${TASK_DIR}/task.json;
exec java ${JAVA_OPTS} -cp $COMMON_CONF_DIR:$SERVICE_CONF_DIR:lib/*: org.apache.druid.cli.Main internal peon $@
exec bin/run-java ${JAVA_OPTS} -cp $COMMON_CONF_DIR:$SERVICE_CONF_DIR:lib/*: org.apache.druid.cli.Main internal peon $@

View File

@ -30,7 +30,7 @@
<parent>
<artifactId>druid</artifactId>
<groupId>org.apache.druid</groupId>
<version>27.0.0-SNAPSHOT</version>
<version>28.0.0-SNAPSHOT</version>
</parent>
<dependencies>
@ -633,6 +633,8 @@
<argument>org.apache.druid.extensions.contrib:aliyun-oss-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions.contrib:opentelemetry-emitter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-iceberg-extensions</argument>
</arguments>
</configuration>
</execution>

View File

@ -86,7 +86,7 @@
<includes>
<include>*</include>
</includes>
<fileMode>744</fileMode>
<fileMode>755</fileMode>
<outputDirectory>bin</outputDirectory>
</fileSet>
</fileSets>

View File

@ -23,14 +23,876 @@ sidebar_label: JSON querying
~ under the License.
-->
This document describes the API endpoints to submit JSON-based [native queries](../querying/querying.md) to Apache Druid.
This topic describes the API endpoints to submit JSON-based [native queries](../querying/querying.md) to Apache Druid.
## Queries
In this topic, `http://SERVICE_IP:SERVICE_PORT` is a placeholder for the server address of deployment and the service port. For example, on the quickstart configuration, replace `http://ROUTER_IP:ROUTER_PORT` with `http://localhost:8888`.
`POST /druid/v2/`
The endpoint for submitting queries. Accepts an option `?pretty` that pretty prints the results.
## Submit a query
`POST /druid/v2/candidates/`
Submits a JSON-based native query. The body of the request is the native query itself.
Returns segment information lists including server locations for the given query.
Druid supports different types of queries for different use cases. All queries require the following properties:
* `queryType`: A string representing the type of query. Druid supports the following native query types: `timeseries`, `topN`, `groupBy`, `timeBoundary`, `segmentMetadata`, `dataSourceMetadata`, `scan`, and `search`.
* `dataSource`: A string or object defining the source of data to query. The most common value is the name of the datasource to query. For more information, see [Datasources](../querying/datasource.md).
For additional properties based on your query type or use case, see [available native queries](../querying/querying.md#available-queries).
### URL
<code class="postAPI">POST</code> <code>/druid/v2/</code>
### Query parameters
* `pretty` (optional)
* Druid returns the response in a pretty-printed format using indentation and line breaks.
### Responses
<!--DOCUSAURUS_CODE_TABS-->
<!--200 SUCCESS-->
*Successfully submitted query*
<!--400 BAD REQUEST-->
*Error thrown due to bad query. Returns a JSON object detailing the error with the following format:*
```json
{
"error": "A well-defined error code.",
"errorMessage": "A message with additional details about the error.",
"errorClass": "Class of exception that caused this error.",
"host": "The host on which the error occurred."
}
```
For more information on possible error messages, see [query execution failures](../querying/querying.md#query-execution-failures).
<!--END_DOCUSAURUS_CODE_TABS-->
---
### Example query: `topN`
The following example shows a `topN` query. The query analyzes the `social_media` datasource to return the top five users from the `username` dimension with the highest number of views from the `views` metric.
<!--DOCUSAURUS_CODE_TABS-->
<!--cURL-->
```shell
curl "http://ROUTER_IP:ROUTER_PORT/druid/v2?pretty=null" \
--header 'Content-Type: application/json' \
--data '{
"queryType": "topN",
"dataSource": "social_media",
"dimension": "username",
"threshold": 5,
"metric": "views",
"granularity": "all",
"aggregations": [
{
"type": "longSum",
"name": "views",
"fieldName": "views"
}
],
"intervals": [
"2022-01-01T00:00:00.000/2024-01-01T00:00:00.000"
]
}'
```
<!--HTTP-->
```HTTP
POST /druid/v2?pretty=null HTTP/1.1
Host: http://ROUTER_IP:ROUTER_PORT
Content-Type: application/json
Content-Length: 336
{
"queryType": "topN",
"dataSource": "social_media",
"dimension": "username",
"threshold": 5,
"metric": "views",
"granularity": "all",
"aggregations": [
{
"type": "longSum",
"name": "views",
"fieldName": "views"
}
],
"intervals": [
"2022-01-01T00:00:00.000/2024-01-01T00:00:00.000"
]
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
#### Example response: `topN`
<details>
<summary>Click to show sample response</summary>
```json
[
{
"timestamp": "2023-07-03T18:49:54.848Z",
"result": [
{
"views": 11591218026,
"username": "gus"
},
{
"views": 11578638578,
"username": "miette"
},
{
"views": 11561618880,
"username": "leon"
},
{
"views": 11552609824,
"username": "mia"
},
{
"views": 11551537517,
"username": "milton"
}
]
}
]
```
</details>
### Example query: `groupBy`
The following example submits a JSON query of the `groupBy` type to retrieve the `username` with the highest votes to posts ratio from the `social_media` datasource.
In this query:
* The `upvoteSum` aggregation calculates the sum of the `upvotes` for each user.
* The `postCount` aggregation calculates the sum of posts for each user.
* The `upvoteToPostRatio` is a post-aggregation of the `upvoteSum` and the `postCount`, divided to calculate the ratio.
* The result is sorted based on the `upvoteToPostRatio` in descending order.
<!--DOCUSAURUS_CODE_TABS-->
<!--cURL-->
```shell
curl "http://ROUTER_IP:ROUTER_PORT/druid/v2" \
--header 'Content-Type: application/json' \
--data '{
"queryType": "groupBy",
"dataSource": "social_media",
"dimensions": ["username"],
"granularity": "all",
"aggregations": [
{ "type": "doubleSum", "name": "upvoteSum", "fieldName": "upvotes" },
{ "type": "count", "name": "postCount", "fieldName": "post_title" }
],
"postAggregations": [
{
"type": "arithmetic",
"name": "upvoteToPostRatio",
"fn": "/",
"fields": [
{ "type": "fieldAccess", "name": "upvoteSum", "fieldName": "upvoteSum" },
{ "type": "fieldAccess", "name": "postCount", "fieldName": "postCount" }
]
}
],
"intervals": ["2022-01-01T00:00:00.000/2024-01-01T00:00:00.000"],
"limitSpec": {
"type": "default",
"limit": 1,
"columns": [
{ "dimension": "upvoteToPostRatio", "direction": "descending" }
]
}
}'
```
<!--HTTP-->
```HTTP
POST /druid/v2?pretty=null HTTP/1.1
Host: http://ROUTER_IP:ROUTER_PORT
Content-Type: application/json
Content-Length: 817
{
"queryType": "groupBy",
"dataSource": "social_media",
"dimensions": ["username"],
"granularity": "all",
"aggregations": [
{ "type": "doubleSum", "name": "upvoteSum", "fieldName": "upvotes" },
{ "type": "count", "name": "postCount", "fieldName": "post_title" }
],
"postAggregations": [
{
"type": "arithmetic",
"name": "upvoteToPostRatio",
"fn": "/",
"fields": [
{ "type": "fieldAccess", "name": "upvoteSum", "fieldName": "upvoteSum" },
{ "type": "fieldAccess", "name": "postCount", "fieldName": "postCount" }
]
}
],
"intervals": ["2022-01-01T00:00:00.000/2024-01-01T00:00:00.000"],
"limitSpec": {
"type": "default",
"limit": 1,
"columns": [
{ "dimension": "upvoteToPostRatio", "direction": "descending" }
]
}
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
#### Example response: `groupBy`
<details>
<summary>Click to show sample response</summary>
```json
[
{
"version": "v1",
"timestamp": "2022-01-01T00:00:00.000Z",
"event": {
"upvoteSum": 8.0419541E7,
"upvoteToPostRatio": 69.53014661762697,
"postCount": 1156614,
"username": "miette"
}
}
]
```
</details>
## Get segment information for query
Retrieves an array that contains objects with segment information, including the server locations associated with the query provided in the request body.
### URL
<code class="postAPI">POST</code> <code>/druid/v2/candidates/</code>
### Query parameters
* `pretty` (optional)
* Druid returns the response in a pretty-printed format using indentation and line breaks.
### Responses
<!--DOCUSAURUS_CODE_TABS-->
<!--200 SUCCESS-->
*Successfully retrieved segment information*
<!--400 BAD REQUEST-->
*Error thrown due to bad query. Returns a JSON object detailing the error with the following format:*
```json
{
"error": "A well-defined error code.",
"errorMessage": "A message with additional details about the error.",
"errorClass": "Class of exception that caused this error.",
"host": "The host on which the error occurred."
}
```
For more information on possible error messages, see [query execution failures](../querying/querying.md#query-execution-failures).
<!--END_DOCUSAURUS_CODE_TABS-->
---
### Sample request
<!--DOCUSAURUS_CODE_TABS-->
<!--cURL-->
```shell
curl "http://ROUTER_IP:ROUTER_PORT/druid/v2/candidates" \
--header 'Content-Type: application/json' \
--data '{
"queryType": "topN",
"dataSource": "social_media",
"dimension": "username",
"threshold": 5,
"metric": "views",
"granularity": "all",
"aggregations": [
{
"type": "longSum",
"name": "views",
"fieldName": "views"
}
],
"intervals": [
"2022-01-01T00:00:00.000/2024-01-01T00:00:00.000"
]
}'
```
<!--HTTP-->
```HTTP
POST /druid/v2/candidates HTTP/1.1
Host: http://ROUTER_IP:ROUTER_PORT
Content-Type: application/json
Content-Length: 336
{
"queryType": "topN",
"dataSource": "social_media",
"dimension": "username",
"threshold": 5,
"metric": "views",
"granularity": "all",
"aggregations": [
{
"type": "longSum",
"name": "views",
"fieldName": "views"
}
],
"intervals": [
"2020-01-01T00:00:00.000/2024-01-01T00:00:00.000"
]
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
### Sample response
<details>
<summary>Click to show sample response</summary>
```json
[
{
"interval": "2023-07-03T18:00:00.000Z/2023-07-03T19:00:00.000Z",
"version": "2023-07-03T18:51:18.905Z",
"partitionNumber": 0,
"size": 21563693,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-03T19:00:00.000Z/2023-07-03T20:00:00.000Z",
"version": "2023-07-03T19:00:00.657Z",
"partitionNumber": 0,
"size": 6057236,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-05T21:00:00.000Z/2023-07-05T22:00:00.000Z",
"version": "2023-07-05T21:09:58.102Z",
"partitionNumber": 0,
"size": 223926186,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-05T21:00:00.000Z/2023-07-05T22:00:00.000Z",
"version": "2023-07-05T21:09:58.102Z",
"partitionNumber": 1,
"size": 20244827,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-05T22:00:00.000Z/2023-07-05T23:00:00.000Z",
"version": "2023-07-05T22:00:00.524Z",
"partitionNumber": 0,
"size": 104628051,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-05T22:00:00.000Z/2023-07-05T23:00:00.000Z",
"version": "2023-07-05T22:00:00.524Z",
"partitionNumber": 1,
"size": 1603995,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-05T23:00:00.000Z/2023-07-06T00:00:00.000Z",
"version": "2023-07-05T23:21:55.242Z",
"partitionNumber": 0,
"size": 181506843,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T00:00:00.000Z/2023-07-06T01:00:00.000Z",
"version": "2023-07-06T00:02:08.498Z",
"partitionNumber": 0,
"size": 9170974,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T00:00:00.000Z/2023-07-06T01:00:00.000Z",
"version": "2023-07-06T00:02:08.498Z",
"partitionNumber": 1,
"size": 23969632,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T01:00:00.000Z/2023-07-06T02:00:00.000Z",
"version": "2023-07-06T01:13:53.982Z",
"partitionNumber": 0,
"size": 599895,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T01:00:00.000Z/2023-07-06T02:00:00.000Z",
"version": "2023-07-06T01:13:53.982Z",
"partitionNumber": 1,
"size": 1627041,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T02:00:00.000Z/2023-07-06T03:00:00.000Z",
"version": "2023-07-06T02:55:50.701Z",
"partitionNumber": 0,
"size": 629753,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T02:00:00.000Z/2023-07-06T03:00:00.000Z",
"version": "2023-07-06T02:55:50.701Z",
"partitionNumber": 1,
"size": 1342360,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T04:00:00.000Z/2023-07-06T05:00:00.000Z",
"version": "2023-07-06T04:02:36.562Z",
"partitionNumber": 0,
"size": 2131434,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T05:00:00.000Z/2023-07-06T06:00:00.000Z",
"version": "2023-07-06T05:23:27.856Z",
"partitionNumber": 0,
"size": 797161,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T05:00:00.000Z/2023-07-06T06:00:00.000Z",
"version": "2023-07-06T05:23:27.856Z",
"partitionNumber": 1,
"size": 1176858,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T06:00:00.000Z/2023-07-06T07:00:00.000Z",
"version": "2023-07-06T06:46:34.638Z",
"partitionNumber": 0,
"size": 2148760,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T07:00:00.000Z/2023-07-06T08:00:00.000Z",
"version": "2023-07-06T07:38:28.050Z",
"partitionNumber": 0,
"size": 2040748,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T08:00:00.000Z/2023-07-06T09:00:00.000Z",
"version": "2023-07-06T08:27:31.407Z",
"partitionNumber": 0,
"size": 678723,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T08:00:00.000Z/2023-07-06T09:00:00.000Z",
"version": "2023-07-06T08:27:31.407Z",
"partitionNumber": 1,
"size": 1437866,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T10:00:00.000Z/2023-07-06T11:00:00.000Z",
"version": "2023-07-06T10:02:42.079Z",
"partitionNumber": 0,
"size": 1671296,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T11:00:00.000Z/2023-07-06T12:00:00.000Z",
"version": "2023-07-06T11:27:23.902Z",
"partitionNumber": 0,
"size": 574893,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T11:00:00.000Z/2023-07-06T12:00:00.000Z",
"version": "2023-07-06T11:27:23.902Z",
"partitionNumber": 1,
"size": 1427384,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T12:00:00.000Z/2023-07-06T13:00:00.000Z",
"version": "2023-07-06T12:52:00.846Z",
"partitionNumber": 0,
"size": 2115172,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T14:00:00.000Z/2023-07-06T15:00:00.000Z",
"version": "2023-07-06T14:32:33.926Z",
"partitionNumber": 0,
"size": 589108,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T14:00:00.000Z/2023-07-06T15:00:00.000Z",
"version": "2023-07-06T14:32:33.926Z",
"partitionNumber": 1,
"size": 1392649,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T15:00:00.000Z/2023-07-06T16:00:00.000Z",
"version": "2023-07-06T15:53:25.467Z",
"partitionNumber": 0,
"size": 2037851,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T16:00:00.000Z/2023-07-06T17:00:00.000Z",
"version": "2023-07-06T16:02:26.568Z",
"partitionNumber": 0,
"size": 230400650,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T16:00:00.000Z/2023-07-06T17:00:00.000Z",
"version": "2023-07-06T16:02:26.568Z",
"partitionNumber": 1,
"size": 38209056,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
},
{
"interval": "2023-07-06T17:00:00.000Z/2023-07-06T18:00:00.000Z",
"version": "2023-07-06T17:00:02.391Z",
"partitionNumber": 0,
"size": 211099463,
"locations": [
{
"name": "localhost:8083",
"host": "localhost:8083",
"hostAndTlsPort": null,
"maxSize": 300000000000,
"type": "historical",
"tier": "_default_tier",
"priority": 0
}
]
}
]
```
</details>

File diff suppressed because it is too large

View File

@ -186,4 +186,815 @@ Druid returns an HTTP 404 response in the following cases:
- `sqlQueryId` is incorrect.
- The query completes before your cancellation request is processed.
Druid returns an HTTP 403 response for authorization failure.
Druid returns an HTTP 403 response for authorization failure.
## Query from deep storage
> Query from deep storage is an [experimental feature](../development/experimental.md).
You can use the `sql/statements` endpoint to query segments that exist only in deep storage and are not loaded onto your Historical processes as determined by your load rules.
Note that at least one segment of a datasource must be available on a Historical process so that the Broker can plan your query. A quick way to check if this is true is whether or not a datasource is visible in the Druid console.
For more information, see [Query from deep storage](../querying/query-from-deep-storage.md).
### Submit a query
Submit a query for data stored in deep storage. Any data ingested into Druid is placed into deep storage. The query is contained in the "query" field in the JSON object within the request payload.
Note that at least part of a datasource must be available on a Historical process so that Druid can plan your query. Only the user who submits the query can see its results.
#### URL
<code class="postAPI">POST</code> <code>/druid/v2/sql/statements</code>
#### Request body
Generally, the `sql` and `sql/statements` endpoints support the same response body fields with minor differences. For general information about the available fields, see [Submit a query to the `sql` endpoint](#submit-a-query).
Keep the following in mind when submitting queries to the `sql/statements` endpoint:
- There are additional context parameters for `sql/statements`:
- `executionMode` determines how query results are fetched. Druid currently only supports `ASYNC`. You must manually retrieve your results after the query completes.
- `selectDestination` determines where final results get written. By default, results are written to task reports. Set this parameter to `durableStorage` to instruct Druid to write the results from SELECT queries to durable storage, which allows you to fetch larger result sets. Note that this requires you to have [durable storage for MSQ enabled](../operations/durable-storage.md). See the sketch after this list for an example request.
- The only supported value for `resultFormat` is JSON LINES.
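For illustration only, a minimal sketch of a request that combines these context parameters; it assumes durable storage for MSQ is already enabled and reuses the placeholder `ROUTER_IP:ROUTER_PORT` address from the examples above:

```shell
# Sketch: write the SELECT results to durable storage instead of task reports.
# Assumes durable storage for MSQ is enabled; ROUTER_IP:ROUTER_PORT is a placeholder.
curl "http://ROUTER_IP:ROUTER_PORT/druid/v2/sql/statements" \
  --header 'Content-Type: application/json' \
  --data '{
    "query": "SELECT * FROM wikipedia LIMIT 10",
    "context": {
      "executionMode": "ASYNC",
      "selectDestination": "durableStorage"
    }
  }'
```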
#### Responses
<!--DOCUSAURUS_CODE_TABS-->
<!--200 SUCCESS-->
*Successfully queried from deep storage*
<!--400 BAD REQUEST-->
*Error thrown due to bad query. Returns a JSON object detailing the error with the following format:*
```json
{
"error": "Summary of the encountered error.",
"errorClass": "Class of exception that caused this error.",
"host": "The host on which the error occurred.",
"errorCode": "Well-defined error code.",
"persona": "Role or persona associated with the error.",
"category": "Classification of the error.",
"errorMessage": "Summary of the encountered issue with expanded information.",
"context": "Additional context about the error."
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
---
#### Sample request
<!--DOCUSAURUS_CODE_TABS-->
<!--cURL-->
```shell
curl "http://ROUTER_IP:ROUTER_PORT/druid/v2/sql/statements" \
--header 'Content-Type: application/json' \
--data '{
"query": "SELECT * FROM wikipedia WHERE user='\''BlueMoon2662'\''",
"context": {
"executionMode":"ASYNC"
}
}'
```
<!--HTTP-->
```HTTP
POST /druid/v2/sql/statements HTTP/1.1
Host: http://ROUTER_IP:ROUTER_PORT
Content-Type: application/json
Content-Length: 134
{
"query": "SELECT * FROM wikipedia WHERE user='BlueMoon2662'",
"context": {
"executionMode":"ASYNC"
}
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
#### Sample response
<details>
<summary>Click to show sample response</summary>
```json
{
"queryId": "query-b82a7049-b94f-41f2-a230-7fef94768745",
"state": "ACCEPTED",
"createdAt": "2023-07-26T21:16:25.324Z",
"schema": [
{
"name": "__time",
"type": "TIMESTAMP",
"nativeType": "LONG"
},
{
"name": "channel",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "cityName",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "comment",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "countryIsoCode",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "countryName",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "isAnonymous",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "isMinor",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "isNew",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "isRobot",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "isUnpatrolled",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "metroCode",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "namespace",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "page",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "regionIsoCode",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "regionName",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "user",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "delta",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "added",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "deleted",
"type": "BIGINT",
"nativeType": "LONG"
}
],
"durationMs": -1
}
```
</details>
### Get query status
Retrieves information about the query associated with the given query ID. The response matches the response from the POST API if the query is accepted or running and the execution mode is `ASYNC`. In addition to the fields that this endpoint shares with `POST /sql/statements`, a completed query's status includes the following:
- A `result` object that summarizes information about your results, such as the total number of rows and sample records.
- A `pages` object that includes the following information for each page of results:
- `numRows`: the number of rows in that page of results.
- `sizeInBytes`: the size of the page.
- `id`: the page number that you can use to reference a specific page when you get query results.
#### URL
<code class="getAPI">GET</code> <code>/druid/v2/sql/statements/:queryId</code>
#### Responses
<!--DOCUSAURUS_CODE_TABS-->
<!--200 SUCCESS-->
*Successfully retrieved query status*
<!--400 BAD REQUEST-->
*Error thrown due to bad query. Returns a JSON object detailing the error with the following format:*
```json
{
"error": "Summary of the encountered error.",
"errorCode": "Well-defined error code.",
"persona": "Role or persona associated with the error.",
"category": "Classification of the error.",
"errorMessage": "Summary of the encountered issue with expanded information.",
"context": "Additional context about the error."
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
#### Sample request
The following example retrieves the status of a query with specified ID `query-9b93f6f7-ab0e-48f5-986a-3520f84f0804`.
<!--DOCUSAURUS_CODE_TABS-->
<!--cURL-->
```shell
curl "http://ROUTER_IP:ROUTER_PORT/druid/v2/sql/statements/query-9b93f6f7-ab0e-48f5-986a-3520f84f0804"
```
<!--HTTP-->
```HTTP
GET /druid/v2/sql/statements/query-9b93f6f7-ab0e-48f5-986a-3520f84f0804 HTTP/1.1
Host: http://ROUTER_IP:ROUTER_PORT
```
<!--END_DOCUSAURUS_CODE_TABS-->
#### Sample response
<details>
<summary>Click to show sample response</summary>
```json
{
"queryId": "query-9b93f6f7-ab0e-48f5-986a-3520f84f0804",
"state": "SUCCESS",
"createdAt": "2023-07-26T22:57:46.620Z",
"schema": [
{
"name": "__time",
"type": "TIMESTAMP",
"nativeType": "LONG"
},
{
"name": "channel",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "cityName",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "comment",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "countryIsoCode",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "countryName",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "isAnonymous",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "isMinor",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "isNew",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "isRobot",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "isUnpatrolled",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "metroCode",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "namespace",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "page",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "regionIsoCode",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "regionName",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "user",
"type": "VARCHAR",
"nativeType": "STRING"
},
{
"name": "delta",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "added",
"type": "BIGINT",
"nativeType": "LONG"
},
{
"name": "deleted",
"type": "BIGINT",
"nativeType": "LONG"
}
],
"durationMs": 25591,
"result": {
"numTotalRows": 1,
"totalSizeInBytes": 375,
"dataSource": "__query_select",
"sampleRecords": [
[
1442018873259,
"#ja.wikipedia",
"",
"/* 対戦通算成績と得失点 */",
"",
"",
0,
1,
0,
0,
0,
0,
"Main",
"アルビレックス新潟の年度別成績一覧",
"",
"",
"BlueMoon2662",
14,
14,
0
]
],
"pages": [
{
"id": 0,
"numRows": 1,
"sizeInBytes": 375
}
]
}
}
```
</details>
### Get query results
Retrieves results for completed queries. Results are separated into pages, so you can use the optional `page` parameter to refine the results you get. Druid returns information about the composition of each page and its page number (`id`). For information about pages, see [Get query status](#get-query-status).
If a page number isn't passed, all results are returned sequentially in the same response. If you have large result sets, you may encounter timeouts based on the value configured for `druid.router.http.readTimeout`.
When getting query results, keep the following in mind:
- JSON Lines is the only supported result format.
- Getting the query results for an ingestion query returns an empty response.
#### URL
<code class="getAPI">GET</code> <code>/druid/v2/sql/statements/:queryId/results</code>
#### Query parameters
* `page`
* Int (optional)
* Refine paginated results (see the sketch below)
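As an illustration, a minimal sketch of fetching a single page of results; it reuses the query ID from the sample request below and assumes the query has already completed:

```shell
# Sketch: fetch only page 0 of the results for a completed query.
# The query ID is a placeholder taken from the sample request below.
curl "http://ROUTER_IP:ROUTER_PORT/druid/v2/sql/statements/query-f3bca219-173d-44d4-bdc7-5002e910352f/results?page=0"
```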
#### Responses
<!--DOCUSAURUS_CODE_TABS-->
<!--200 SUCCESS-->
*Successfully retrieved query results*
<!--400 BAD REQUEST-->
*Query in progress. Returns a JSON object detailing the error with the following format:*
```json
{
"error": "Summary of the encountered error.",
"errorCode": "Well-defined error code.",
"persona": "Role or persona associated with the error.",
"category": "Classification of the error.",
"errorMessage": "Summary of the encountered issue with expanded information.",
"context": "Additional context about the error."
}
```
<!--404 NOT FOUND-->
*Query not found, failed or canceled*
<!--500 SERVER ERROR-->
*Error thrown due to bad query. Returns a JSON object detailing the error with the following format:*
```json
{
"error": "Summary of the encountered error.",
"errorCode": "Well-defined error code.",
"persona": "Role or persona associated with the error.",
"category": "Classification of the error.",
"errorMessage": "Summary of the encountered issue with expanded information.",
"context": "Additional context about the error."
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
---
#### Sample request
The following example retrieves the status of a query with specified ID `query-f3bca219-173d-44d4-bdc7-5002e910352f`.
<!--DOCUSAURUS_CODE_TABS-->
<!--cURL-->
```shell
curl "http://ROUTER_IP:ROUTER_PORT/druid/v2/sql/statements/query-f3bca219-173d-44d4-bdc7-5002e910352f/results"
```
<!--HTTP-->
```HTTP
GET /druid/v2/sql/statements/query-f3bca219-173d-44d4-bdc7-5002e910352f/results HTTP/1.1
Host: http://ROUTER_IP:ROUTER_PORT
```
<!--END_DOCUSAURUS_CODE_TABS-->
#### Sample response
<details>
<summary>Click to show sample response</summary>
```json
[
{
"__time": 1442018818771,
"channel": "#en.wikipedia",
"cityName": "",
"comment": "added project",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 0,
"isNew": 0,
"isRobot": 0,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "Talk",
"page": "Talk:Oswald Tilghman",
"regionIsoCode": "",
"regionName": "",
"user": "GELongstreet",
"delta": 36,
"added": 36,
"deleted": 0
},
{
"__time": 1442018820496,
"channel": "#ca.wikipedia",
"cityName": "",
"comment": "Robot inserta {{Commonscat}} que enllaça amb [[commons:category:Rallicula]]",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 1,
"isNew": 0,
"isRobot": 1,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "Main",
"page": "Rallicula",
"regionIsoCode": "",
"regionName": "",
"user": "PereBot",
"delta": 17,
"added": 17,
"deleted": 0
},
{
"__time": 1442018825474,
"channel": "#en.wikipedia",
"cityName": "Auburn",
"comment": "/* Status of peremptory norms under international law */ fixed spelling of 'Wimbledon'",
"countryIsoCode": "AU",
"countryName": "Australia",
"isAnonymous": 1,
"isMinor": 0,
"isNew": 0,
"isRobot": 0,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "Main",
"page": "Peremptory norm",
"regionIsoCode": "NSW",
"regionName": "New South Wales",
"user": "60.225.66.142",
"delta": 0,
"added": 0,
"deleted": 0
},
{
"__time": 1442018828770,
"channel": "#vi.wikipedia",
"cityName": "",
"comment": "fix Lỗi CS1: ngày tháng",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 1,
"isNew": 0,
"isRobot": 1,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "Main",
"page": "Apamea abruzzorum",
"regionIsoCode": "",
"regionName": "",
"user": "Cheers!-bot",
"delta": 18,
"added": 18,
"deleted": 0
},
{
"__time": 1442018831862,
"channel": "#vi.wikipedia",
"cityName": "",
"comment": "clean up using [[Project:AWB|AWB]]",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 0,
"isNew": 0,
"isRobot": 1,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "Main",
"page": "Atractus flammigerus",
"regionIsoCode": "",
"regionName": "",
"user": "ThitxongkhoiAWB",
"delta": 18,
"added": 18,
"deleted": 0
},
{
"__time": 1442018833987,
"channel": "#vi.wikipedia",
"cityName": "",
"comment": "clean up using [[Project:AWB|AWB]]",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 0,
"isNew": 0,
"isRobot": 1,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "Main",
"page": "Agama mossambica",
"regionIsoCode": "",
"regionName": "",
"user": "ThitxongkhoiAWB",
"delta": 18,
"added": 18,
"deleted": 0
},
{
"__time": 1442018837009,
"channel": "#ca.wikipedia",
"cityName": "",
"comment": "/* Imperi Austrohongarès */",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 0,
"isNew": 0,
"isRobot": 0,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "Main",
"page": "Campanya dels Balcans (1914-1918)",
"regionIsoCode": "",
"regionName": "",
"user": "Jaumellecha",
"delta": -20,
"added": 0,
"deleted": 20
},
{
"__time": 1442018839591,
"channel": "#en.wikipedia",
"cityName": "",
"comment": "adding comment on notability and possible COI",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 0,
"isNew": 1,
"isRobot": 0,
"isUnpatrolled": 1,
"metroCode": 0,
"namespace": "Talk",
"page": "Talk:Dani Ploeger",
"regionIsoCode": "",
"regionName": "",
"user": "New Media Theorist",
"delta": 345,
"added": 345,
"deleted": 0
},
{
"__time": 1442018841578,
"channel": "#en.wikipedia",
"cityName": "",
"comment": "Copying assessment table to wiki",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 0,
"isNew": 0,
"isRobot": 1,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "User",
"page": "User:WP 1.0 bot/Tables/Project/Pubs",
"regionIsoCode": "",
"regionName": "",
"user": "WP 1.0 bot",
"delta": 121,
"added": 121,
"deleted": 0
},
{
"__time": 1442018845821,
"channel": "#vi.wikipedia",
"cityName": "",
"comment": "clean up using [[Project:AWB|AWB]]",
"countryIsoCode": "",
"countryName": "",
"isAnonymous": 0,
"isMinor": 0,
"isNew": 0,
"isRobot": 1,
"isUnpatrolled": 0,
"metroCode": 0,
"namespace": "Main",
"page": "Agama persimilis",
"regionIsoCode": "",
"regionName": "",
"user": "ThitxongkhoiAWB",
"delta": 18,
"added": 18,
"deleted": 0
}
]
```
</details>
### Cancel a query
Cancels a running or accepted query.
#### URL
<code class="deleteAPI">DELETE</code> <code>/druid/v2/sql/statements/:queryId</code>
#### Responses
<!--DOCUSAURUS_CODE_TABS-->
<!--200 OK-->
*A no-op operation, because the query is not in a state that can be cancelled*
<!--202 ACCEPTED-->
*Successfully accepted query for cancellation*
<!--404 NOT FOUND-->
*Invalid query ID. Returns a JSON object detailing the error with the following format:*
```json
{
"error": "Summary of the encountered error.",
"errorCode": "Well-defined error code.",
"persona": "Role or persona associated with the error.",
"category": "Classification of the error.",
"errorMessage": "Summary of the encountered issue with expanded information.",
"context": "Additional context about the error."
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
---
#### Sample request
The following example cancels a query with the ID `query-945c9633-2fa2-49ab-80ae-8221c38c024da`.
<!--DOCUSAURUS_CODE_TABS-->
<!--cURL-->
```shell
curl --request DELETE "http://ROUTER_IP:ROUTER_PORT/druid/v2/sql/statements/query-945c9633-2fa2-49ab-80ae-8221c38c024da"
```
<!--HTTP-->
```HTTP
DELETE /druid/v2/sql/statements/query-945c9633-2fa2-49ab-80ae-8221c38c024da HTTP/1.1
Host: http://ROUTER_IP:ROUTER_PORT
```
<!--END_DOCUSAURUS_CODE_TABS-->
#### Sample response
A successful request returns an HTTP `202 ACCEPTED` response code and an empty response body.

View File

@ -108,7 +108,7 @@ For more information about the connection options, see [Client Reference](https:
Make sure you meet the following requirements before trying these examples:
- A supported Java version, such as Java 8
- A supported [Java version](../operations/java.md)
- [Avatica JDBC driver](https://calcite.apache.org/avatica/downloads/). You can add the JAR to your `CLASSPATH` directly or manage it externally, such as through Maven and a `pom.xml` file.
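To make these requirements concrete, the following is a minimal sketch of a JDBC connection, assuming a Router reachable at `ROUTER_IP:ROUTER_PORT`, the standard Avatica remote endpoint path, and the `wikipedia` sample datasource; adapt the URL and query to your deployment.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Properties;

public class DruidJdbcExample
{
  public static void main(String[] args) throws Exception
  {
    // Avatica remote URL pointing at the Router's SQL endpoint (assumed host and port).
    String url = "jdbc:avatica:remote:url=http://ROUTER_IP:ROUTER_PORT/druid/v2/sql/avatica/";
    Properties connectionProperties = new Properties();

    try (Connection connection = DriverManager.getConnection(url, connectionProperties);
         Statement statement = connection.createStatement();
         // Hypothetical query against the wikipedia sample datasource.
         ResultSet resultSet = statement.executeQuery("SELECT COUNT(*) FROM wikipedia")) {
      while (resultSet.next()) {
        System.out.println(resultSet.getLong(1));
      }
    }
  }
}
```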

View File

@ -887,7 +887,7 @@ These Coordinator static configurations can be defined in the `coordinator/runti
|Property|Possible Values|Description|Default|
|--------|---------------|-----------|-------|
|`druid.serverview.type`|batch or http|Segment discovery method to use. "http" enables discovering segments using HTTP instead of ZooKeeper.|http|
|`druid.coordinator.loadqueuepeon.type`|curator or http|Whether to use "http" or "curator" implementation to assign segment loads/drops to historical|http|
|`druid.coordinator.loadqueuepeon.type`|curator or http|Implementation to use to assign segment loads and drops to historicals. Curator-based implementation is now deprecated, so you should transition to using HTTP-based segment assignments.|http|
|`druid.coordinator.segment.awaitInitializationOnStart`|true or false|Whether the Coordinator will wait for its view of segments to fully initialize before starting up. If set to 'true', the Coordinator's HTTP server will not start up, and the Coordinator will not announce itself as available, until the server view is initialized.|true|
###### Additional config when "http" loadqueuepeon is used
@ -907,7 +907,7 @@ These Coordinator static configurations can be defined in the `coordinator/runti
#### Dynamic Configuration
The Coordinator has dynamic configuration to change certain behavior on the fly.
The Coordinator has dynamic configurations to tune certain behavior on the fly, without requiring a service restart.
It is recommended that you use the [web console](../operations/web-console.md) to configure these parameters.
However, if you need to do it via HTTP, the JSON object can be submitted to the Coordinator via a POST request at:
@ -949,10 +949,11 @@ Issuing a GET request at the same URL will return the spec that is currently in
|`millisToWaitBeforeDeleting`|How long the Coordinator needs to be a leader before it can start marking overshadowed segments as unused in metadata storage.|900000 (15 mins)|
|`mergeBytesLimit`|The maximum total uncompressed size in bytes of segments to merge.|524288000L|
|`mergeSegmentsLimit`|The maximum number of segments that can be in a single [append task](../ingestion/tasks.md).|100|
|`smartSegmentLoading`|Enables ["smart" segment loading mode](#smart-segment-loading) which dynamically computes the optimal values of several properties that maximize Coordinator performance.|true|
|`maxSegmentsToMove`|The maximum number of segments that can be moved at any given time.|100|
|`replicantLifetime`|The maximum number of Coordinator runs for a segment to be replicated before we start alerting.|15|
|`replicationThrottleLimit`|The maximum number of segments that can be in the replication queue of a historical tier at any given time.|500|
|`balancerComputeThreads`|Thread pool size for computing moving cost of segments in segment balancing. Consider increasing this if you have a lot of segments and moving segments starts to get stuck.|1|
|`replicantLifetime`|The maximum number of Coordinator runs for which a segment can wait in the load queue of a Historical before Druid raises an alert.|15|
|`replicationThrottleLimit`|The maximum number of segment replicas that can be assigned to a historical tier in a single Coordinator run. This property prevents historicals from becoming overwhelmed when loading extra replicas of segments that are already available in the cluster.|500|
|`balancerComputeThreads`|Thread pool size for computing moving cost of segments during segment balancing. Consider increasing this if you have a lot of segments and moving segments begins to stall.|1|
|`killDataSourceWhitelist`|List of specific data sources for which kill tasks are sent if property `druid.coordinator.kill.on` is true. This can be a list of comma-separated data source names or a JSON array.|none|
|`killPendingSegmentsSkipList`|List of data sources for which pendingSegments are _NOT_ cleaned up if property `druid.coordinator.kill.pendingSegments.on` is true. This can be a list of comma-separated data sources or a JSON array.|none|
|`maxSegmentsInNodeLoadingQueue`|The maximum number of segments allowed in the load queue of any given server. Use this parameter to load segments faster if, for example, the cluster contains slow-loading nodes or if there are too many segments to be replicated to a particular node (when faster loading is preferred to better segments distribution). The optimal value depends on the loading speed of segments, acceptable replication time and number of nodes. |500|
@ -961,9 +962,29 @@ Issuing a GET request at the same URL will return the spec that is currently in
|`decommissioningMaxPercentOfMaxSegmentsToMove`| Upper limit of segments the Coordinator can move from decommissioning servers to active non-decommissioning servers during a single run. This value is relative to the total maximum number of segments that can be moved at any given time based upon the value of `maxSegmentsToMove`.<br /><br />If `decommissioningMaxPercentOfMaxSegmentsToMove` is 0, the Coordinator does not move segments to decommissioning servers, effectively putting them in a type of "maintenance" mode. In this case, decommissioning servers do not participate in balancing or assignment by load rules. The Coordinator still considers segments on decommissioning servers as candidates to replicate on active servers.<br /><br />Decommissioning can stall if there are no available active servers to move the segments to. You can use the maximum percent of decommissioning segment movements to prioritize balancing or to decrease commissioning time to prevent active servers from being overloaded. The value must be between 0 and 100.|70|
|`pauseCoordination`| Boolean flag for whether or not the coordinator should execute its various duties of coordinating the cluster. Setting this to true essentially pauses all coordination work while allowing the API to remain up. Duties that are paused include all classes that implement the `CoordinatorDuty` Interface. Such duties include: Segment balancing, Segment compaction, Submitting kill tasks for unused segments (if enabled), Logging of used segments in the cluster, Marking of newly unused or overshadowed segments, Matching and execution of load/drop rules for used segments, Unloading segments that are no longer marked as used from Historical servers. An example of when an admin may want to pause coordination would be if they are doing deep storage maintenance on HDFS Name Nodes with downtime and don't want the coordinator to be directing Historical Nodes to hit the Name Node with API requests until maintenance is done and the deep store is declared healthy for use again. |false|
|`replicateAfterLoadTimeout`| Boolean flag for whether or not additional replication is needed for segments that have failed to load due to the expiry of `druid.coordinator.load.timeout`. If this is set to true, the coordinator will attempt to replicate the failed segment on a different historical server. This helps improve the segment availability if there are a few slow historicals in the cluster. However, the slow historical may still load the segment later and the coordinator may issue drop requests if the segment is over-replicated.|false|
|`maxNonPrimaryReplicantsToLoad`|This is the maximum number of non-primary segment replicants to load per Coordination run. This number can be set to put a hard upper limit on the number of replicants loaded. It is a tool that can help prevent long delays in new data being available for query after events that require many non-primary replicants to be loaded by the cluster; such as a Historical node disconnecting from the cluster. The default value essentially means there is no limit on the number of replicants loaded per coordination cycle. If you want to use a non-default value for this config, you may want to start with it being `~20%` of the number of segments found on your Historical server with the most segments. You can use the Druid metric, `coordinator/time` with the filter `duty=org.apache.druid.server.coordinator.duty.RunRules` to see how different values of this config impact your Coordinator execution time.|`Integer.MAX_VALUE`|
|`maxNonPrimaryReplicantsToLoad`|The maximum number of replicas that can be assigned across all tiers in a single Coordinator run. This parameter serves the same purpose as `replicationThrottleLimit` except this limit applies at the cluster-level instead of per tier. The default value does not apply a limit to the number of replicas assigned per coordination cycle. If you want to use a non-default value for this property, you may want to start with `~20%` of the number of segments found on the historical server with the most segments. Use the Druid metric, `coordinator/time` with the filter `duty=org.apache.druid.server.coordinator.duty.RunRules` to see how different values of this property impact your Coordinator execution time.|`Integer.MAX_VALUE` (no limit)|
##### Smart segment loading
The `smartSegmentLoading` mode simplifies Coordinator configuration for segment loading and balancing.
If you enable this mode, do not provide values for the properties in the table below as the Coordinator computes them automatically.
Druid computes the values to optimize Coordinator performance, based on the current state of the cluster.
> If you enable `smartSegmentLoading` mode, Druid ignores any value you provide for the following properties.
|Property|Computed value|Description|
|--------|--------------|-----------|
|`useRoundRobinSegmentAssignment`|true|Speeds up segment assignment.|
|`maxSegmentsInNodeLoadingQueue`|0|Removes the limit on load queue size.|
|`replicationThrottleLimit`|2% of used segments, minimum value 100|Prevents aggressive replication when a historical disappears only intermittently.|
|`replicantLifetime`|60|Allows segments to wait about an hour (assuming a Coordinator period of 1 minute) in the load queue before an alert is raised. In `smartSegmentLoading` mode, load queues are not limited by size. Segments might therefore be assigned to a load queue even if the corresponding server is slow to load them.|
|`maxNonPrimaryReplicantsToLoad`|`Integer.MAX_VALUE` (no limit)|This throttling is already handled by `replicationThrottleLimit`.|
|`maxSegmentsToMove`|2% of used segments, minimum value 100, maximum value 1000|Ensures that some segments are always moving in the cluster to keep it well balanced. The maximum value keeps the Coordinator run times bounded.|
|`decommissioningMaxPercentOfMaxSegmentsToMove`|100|Prioritizes the move of segments from decommissioning servers so that they can be terminated quickly.|
When `smartSegmentLoading` is disabled, Druid uses the configured values of these properties.
Disable `smartSegmentLoading` only if you want to explicitly set the values of any of the above properties.
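As an illustrative sketch only, the following dynamic configuration payload disables `smartSegmentLoading` and supplies explicit values for some of the properties the Coordinator would otherwise compute; the numbers are placeholders, not tuning advice.

```json
{
  "smartSegmentLoading": false,
  "useRoundRobinSegmentAssignment": true,
  "maxSegmentsToMove": 100,
  "replicationThrottleLimit": 500,
  "replicantLifetime": 60,
  "maxSegmentsInNodeLoadingQueue": 500
}
```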
##### Audit history
To view the audit history of Coordinator dynamic config, issue a GET request to the URL -
```
@ -1887,7 +1908,7 @@ client has the following configuration options.
|`druid.broker.http.compressionCodec`|Compression codec the Broker uses to communicate with Historical and real-time processes. May be "gzip" or "identity".|`gzip`|
|`druid.broker.http.readTimeout`|The timeout for data reads from Historical servers and real-time tasks.|`PT15M`|
|`druid.broker.http.unusedConnectionTimeout`|The timeout for idle connections in connection pool. The connection in the pool will be closed after this timeout and a new one will be established. This timeout should be less than `druid.broker.http.readTimeout`. Set this timeout = ~90% of `druid.broker.http.readTimeout`|`PT4M`|
|`druid.broker.http.maxQueuedBytes`|Maximum number of bytes queued per query before exerting backpressure on channels to the data servers.<br /><br />Similar to `druid.server.http.maxScatterGatherBytes`, except unlike that configuration, this one will trigger backpressure rather than query failure. Zero means disabled. Can be overridden by the ["maxQueuedBytes" query context parameter](../querying/query-context.md). Human-readable format is supported, see [here](human-readable-byte.md). |`25MB` or 2% of maximum Broker heap size, whichever is greater|
|`druid.broker.http.maxQueuedBytes`|Maximum number of bytes queued per query before exerting [backpressure](../operations/basic-cluster-tuning.md#broker-backpressure) on channels to the data servers.<br /><br />Similar to `druid.server.http.maxScatterGatherBytes`, except that `maxQueuedBytes` triggers [backpressure](../operations/basic-cluster-tuning.md#broker-backpressure) instead of query failure. Set to zero to disable. You can override this setting by using the [`maxQueuedBytes` query context parameter](../querying/query-context.md). Druid supports [human-readable](human-readable-byte.md) format. |`25MB` or 2% of maximum Broker heap size, whichever is greater.|
|`druid.broker.http.numMaxThreads`|Maximum number of I/O worker threads.|`max(10, ((number of cores * 17) / 16 + 2) + 30)`|
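As a small hedged example, you can also set `maxQueuedBytes` per query through the query context of a SQL request; the datasource and value below are placeholders (the value equals 25 MB).

```json
{
  "query": "SELECT COUNT(*) FROM wikipedia",
  "context": {
    "maxQueuedBytes": 26214400
  }
}
```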
##### Retry Policy
@ -1992,7 +2013,7 @@ You can optionally only configure caching to be enabled on the Broker by setting
See [cache configuration](#cache-configuration) for how to configure cache settings.
> Note: Even if cache is enabled, for [groupBy v2](../querying/groupbyquery.md#strategies) queries, both of non-result level cache and result level cache do not work on Brokers.
> Note: Even if cache is enabled, the segment-level cache does not work on Brokers for [groupBy v2](../querying/groupbyquery.md#strategies) queries.
> See [Differences between v1 and v2](../querying/groupbyquery.md#differences-between-v1-and-v2) and [Query caching](../querying/caching.md) for more information.
#### Segment Discovery

View File

@ -134,19 +134,6 @@ Java runtime itself.
This file is not rotated, but it is generally small due to the low volume of messages.
If necessary, you can truncate it using the Linux command `truncate --size 0 log/historical.stdout.log`.
## Avoid reflective access warnings in logs
On Java 11, you may see warnings like the following in the logs:
```
WARNING: An illegal reflective access operation has occurred
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
```
To avoid these, add the `--add-exports` and `--add-opens` command line parameters described in the documentation section
about [Java strong encapsulation](../operations/java.md#strong-encapsulation).
## Set the logs to asynchronously write
If your logs are really chatty, you can set them to write asynchronously.

View File

@ -95,9 +95,20 @@ The available grammar is:
"id": <task_id>,
"dataSource": <task_datasource>,
"interval" : <all_unused_segments_in_this_interval_will_die!>,
"context": <task context>
"context": <task context>,
"batchSize": <optional_batch size>,
"limit": <the maximum number of segments to delete>
}
```
Some of the parameters used in the task payload are further explained below:
| Parameter | Default | Explanation |
|-------------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `batchSize` |100 | Maximum number of segments that are deleted in one kill batch. Some operations on the Overlord may get stuck while a `kill` task is in progress due to concurrency constraints (such as in `TaskLockbox`). Thus, a `kill` task splits the list of unused segments to be deleted into smaller batches to yield the Overlord resources intermittently to other task operations.|
| `limit` | null - no limit | Maximum number of segments for the kill task to delete.|
**WARNING:** The `kill` task permanently removes all information about the affected segments from the metadata store and
deep storage. This operation cannot be undone.
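To tie these parameters together, here is a hypothetical `kill` task payload; the task ID, datasource, and interval are placeholders.

```json
{
  "type": "kill",
  "id": "kill_wikipedia_2023-01-01_2023-02-01",
  "dataSource": "wikipedia",
  "interval": "2023-01-01/2023-02-01",
  "batchSize": 100,
  "limit": 500
}
```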

View File

@ -70,12 +70,20 @@ Druid uses deep storage to store any data that has been ingested into the system
storage accessible by every Druid server. In a clustered deployment, this is typically a distributed object store like S3 or
HDFS, or a network mounted filesystem. In a single-server deployment, this is typically local disk.
Druid uses deep storage only as a backup of your data and as a way to transfer data in the background between
Druid processes. Druid stores data in files called _segments_. Historical processes cache data segments on
local disk and serve queries from that cache as well as from an in-memory cache.
This means that Druid never needs to access deep storage
during a query, helping it offer the best query latencies possible. It also means that you must have enough disk space
both in deep storage and across your Historical servers for the data you plan to load.
Druid uses deep storage for the following purposes:
- To store all the data you ingest. Segments that get loaded onto Historical processes for low latency queries are also kept in deep storage for backup purposes. Additionally, segments that are only in deep storage can be used for [queries from deep storage](../querying/query-from-deep-storage.md).
- As a way to transfer data in the background between Druid processes. Druid stores data in files called _segments_.
Historical processes cache data segments on local disk and serve queries from that cache as well as from an in-memory cache.
Segments on disk for Historical processes provide the low latency querying performance Druid is known for.
You can also query directly from deep storage. When you query segments that exist only in deep storage, you trade some performance for the ability to query more of your data without necessarily having to scale your Historical processes.
When determining sizing for your storage, keep the following in mind:
- Deep storage needs to be able to hold all the data that you ingest into Druid.
- On-disk storage for Historical processes needs to be able to accommodate the data you want to load onto them to run queries. The data on Historical processes should be data that you access frequently and need to query with low latency.
Deep storage is an important part of Druid's elastic, fault-tolerant design. Druid bootstraps from deep storage even
if every single data server is lost and re-provisioned.
@ -210,8 +218,7 @@ available before they are published, since they are only published when the segm
any additional rows of data.
2. **Deep storage:** Segment data files are pushed to deep storage once a segment is done being constructed. This
happens immediately before publishing metadata to the metadata store.
3. **Availability for querying:** Segments are available for querying on some Druid data server, like a realtime task
or a Historical process.
3. **Availability for querying:** Segments are available for querying on some Druid data server, like a realtime task or a Historical process, or directly from deep storage.
You can inspect the state of currently active segments using the Druid SQL
[`sys.segments` table](../querying/sql-metadata-tables.md#segments-table). It includes the following flags:

View File

@ -23,9 +23,15 @@ title: "Deep storage"
-->
Deep storage is where segments are stored. It is a storage mechanism that Apache Druid does not provide. This deep storage infrastructure defines the level of durability of your data, as long as Druid processes can see this storage infrastructure and get at the segments stored on it, you will not lose data no matter how many Druid nodes you lose. If segments disappear from this storage layer, then you will lose whatever data those segments represented.
Deep storage is where segments are stored. It is a storage mechanism that Apache Druid does not provide. This deep storage infrastructure defines the level of durability of your data. As long as Druid processes can see this storage infrastructure and get at the segments stored on it, you will not lose data no matter how many Druid nodes you lose. If segments disappear from this storage layer, then you will lose whatever data those segments represented.
## Local
In addition to using deep storage as the backing store for segments, you can [query from deep storage](#querying-from-deep-storage) to run queries against segments stored primarily in deep storage. The [load rules](../operations/rule-configuration.md#load-rules) you configure determine whether segments exist primarily in deep storage or in a combination of deep storage and Historical processes.
## Deep storage options
Druid supports multiple options for deep storage, including blob storage from major cloud providers. Select the one that fits your environment.
### Local
Local storage is intended for use in the following situations:
@ -55,22 +61,28 @@ druid.storage.storageDirectory=/tmp/druid/localStorage
The `druid.storage.storageDirectory` must be set to a different path than `druid.segmentCache.locations` or
`druid.segmentCache.infoDir`.
## Amazon S3 or S3-compatible
### Amazon S3 or S3-compatible
See [`druid-s3-extensions`](../development/extensions-core/s3.md).
## Google Cloud Storage
### Google Cloud Storage
See [`druid-google-extensions`](../development/extensions-core/google.md).
## Azure Blob Storage
### Azure Blob Storage
See [`druid-azure-extensions`](../development/extensions-core/azure.md).
## HDFS
### HDFS
See [druid-hdfs-storage extension documentation](../development/extensions-core/hdfs.md).
## Additional options
### Additional options
For additional deep storage options, please see our [extensions list](../configuration/extensions.md).
## Querying from deep storage
Although not as performant as querying segments stored on local disk for Historical processes, querying from deep storage lets you access segments that you don't need to query frequently or with the extremely low latency that Druid queries traditionally provide. You trade some performance for a lower total storage cost because you can access more of your data without needing to increase the number or capacity of your Historical processes.
For information about how to run queries, see [Query from deep storage](../querying/query-from-deep-storage.md).

View File

@ -36,6 +36,8 @@ Common application areas for Druid include:
- Digital marketing/advertising analytics
- Business intelligence/OLAP
If you are experimenting with a new use case for Druid or have questions about Druid's capabilities and features, join the [Apache Druid Slack](http://apachedruidworkspace.slack.com/) channel. There, you can connect with Druid experts, ask questions, and get help in real time.
## Key features of Druid
Druid's core architecture combines ideas from data warehouses, timeseries databases, and logsearch systems. Some of

View File

@ -0,0 +1,117 @@
---
id: iceberg
title: "Iceberg extension"
---
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
## Iceberg Ingest extension
Apache Iceberg is an open table format for huge analytic datasets. [IcebergInputSource](../../ingestion/input-sources.md#iceberg-input-source) lets you ingest data stored in the Iceberg table format into Apache Druid. To use the Iceberg extension, add `druid-iceberg-extensions` to the list of loaded extensions. See [Loading extensions](../../configuration/extensions.md#loading-extensions) for more information.
Iceberg manages most of its metadata in metadata files in the object storage. However, it is still dependent on a metastore to manage a certain amount of metadata.
Iceberg refers to these metastores as catalogs. The Iceberg extension lets you connect to the following Iceberg catalog types:
* Hive metastore catalog
* Local catalog
Druid does not yet support AWS Glue or REST-based catalogs.
For a given catalog, the Iceberg input source reads the table name from the catalog, applies any filters, and extracts all the underlying live data files up to the latest snapshot.
The data files can be in Parquet, ORC, or Avro formats. The data files typically reside in a warehouse location, which can be in HDFS, S3, or the local filesystem.
The `druid-iceberg-extensions` extension relies on the existing input source connectors in Druid to read the data files from the warehouse. Therefore, the Iceberg input source can be considered as an intermediate input source, which provides the file paths for other input source implementations.
## Hive metastore catalog
For Druid to seamlessly talk to the Hive metastore, ensure that the Hive configuration files such as `hive-site.xml` and `core-site.xml` are available in the Druid classpath for peon processes.
You can also specify Hive properties under the `catalogProperties` object in the ingestion spec.
The `druid-iceberg-extensions` extension presently only supports HDFS, S3 and local warehouse directories.
### Read from HDFS warehouse
To read from an HDFS warehouse, load the `druid-hdfs-storage` extension. Druid extracts data file paths from the Hive metastore catalog and uses the [HDFS input source](../../ingestion/input-sources.md#hdfs-input-source) to ingest these files.
The `warehouseSource` type in the ingestion spec should be `hdfs`.
For authenticating with Kerberized clusters, include `principal` and `keytab` properties in the `catalogProperties` object:
```json
"catalogProperties": {
"principal": "krb_principal",
"keytab": "/path/to/keytab"
}
```
Only Kerberos-based authentication is currently supported.
### Read from S3 warehouse
To read from an S3 warehouse, load the `druid-s3-extensions` extension. Druid extracts the data file paths from the Hive metastore catalog and uses `S3InputSource` to ingest these files.
Set the `type` property of the `warehouseSource` object to `s3` in the ingestion spec. If the S3 endpoint for the warehouse is different from the endpoint configured as the deep storage, include the following properties in the `warehouseSource` object to define the S3 endpoint settings:
```json
"warehouseSource": {
"type": "s3",
"endpointConfig": {
"url": "S3_ENDPOINT_URL",
"signingRegion": "us-east-1"
},
"clientConfig": {
"protocol": "http",
"disableChunkedEncoding": true,
"enablePathStyleAccess": true,
"forceGlobalBucketAccessEnabled": false
},
"properties": {
"accessKeyId": {
"type": "default",
"password": "<ACCESS_KEY_ID"
},
"secretAccessKey": {
"type": "default",
"password": "<SECRET_ACCESS_KEY>"
}
}
}
```
This extension uses the [Hadoop AWS module](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/) to connect to S3 and retrieve the metadata and data file paths.
The following properties are required in the `catalogProperties`:
```json
"catalogProperties": {
"fs.s3a.access.key" : "S3_ACCESS_KEY",
"fs.s3a.secret.key" : "S3_SECRET_KEY",
"fs.s3a.endpoint" : "S3_API_ENDPOINT"
}
```
Since the Hadoop AWS connector uses the `s3a` filesystem client, specify the warehouse path with the `s3a://` protocol instead of `s3://`.
## Local catalog
The local catalog type can be used for catalogs configured on the local filesystem. Set the `icebergCatalog` type to `local`. You can use this catalog for demos or localized tests. It is not recommended for production use cases.
The `warehouseSource` is set to `local` because this catalog only supports reading from a local filesystem.
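The following is a rough sketch of what the relevant input source fragment might look like for a local catalog; field names other than `icebergCatalog` and `warehouseSource` (for example `tableName`, `namespace`, and `warehousePath`) are assumptions here, so check the [Iceberg input source](../../ingestion/input-sources.md#iceberg-input-source) reference for the exact schema.

```json
"inputSource": {
  "type": "iceberg",
  "tableName": "my_iceberg_table",
  "namespace": "my_namespace",
  "icebergCatalog": {
    "type": "local",
    "warehousePath": "/path/to/warehouse"
  },
  "warehouseSource": {
    "type": "local"
  }
}
```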
## Known limitations
This section lists the known limitations that apply to the Iceberg extension.
- This extension does not fully utilize Iceberg features such as snapshotting or schema evolution.
- The Iceberg input source reads every live file in the Iceberg table up to the latest snapshot, which makes the table scan less performant. Use Iceberg filters on partition columns in the ingestion spec to limit the number of data files retrieved. Because Druid doesn't store the last ingested Iceberg snapshot ID, it cannot identify only the files created between that snapshot and the latest snapshot on Iceberg.
- It does not handle Iceberg [schema evolution](https://iceberg.apache.org/docs/latest/evolution/) yet. If an existing Iceberg table column is deleted and recreated with the same name, ingesting this table into Druid may also bring in data for this column from before it was deleted.
- The Hive catalog has not been tested on Hadoop 2.x.x and is not guaranteed to work with Hadoop 2.

View File

@ -28,73 +28,33 @@ Consider this an [EXPERIMENTAL](../experimental.md) feature mostly because it ha
## How it works
The K8s extension builds a pod spec using the specified pod adapter, the default implementation takes the podSpec of your `Overlord` pod and creates a kubernetes job from this podSpec. Thus if you have sidecars such as Splunk or Istio it can optionally launch a task as a K8s job. All jobs are natively restorable, they are decoupled from the druid deployment, thus restarting pods or doing upgrades has no affect on tasks in flight. They will continue to run and when the overlord comes back up it will start tracking them again.
The K8s extension builds a pod spec for each task using the specified pod adapter. All jobs are natively restorable and decoupled from the Druid deployment, so restarting pods or doing upgrades has no effect on tasks in flight. They continue to run, and when the Overlord comes back up, it starts tracking them again.
## Pod Adapters
The logic defining how the pod template is built for your kubernetes job depends on which pod adapter you have specified.
### Overlord Single Container Pod Adapter
The overlord single container pod adapter takes the podSpec of your `Overlord` pod and creates a kubernetes job from this podSpec. This is the default pod adapter implementation, to explicitly enable it you can specify the runtime property `druid.indexer.runner.k8s.adapter.type: overlordSingleContainer`
### Overlord Multi Container Pod Adapter
The overlord multi container pod adapter takes the podSpec of your `Overlord` pod and creates a kubernetes job from this podSpec. It uses kubexit to manage dependency ordering between the main container that runs your druid peon and other sidecars defined in the `Overlord` pod spec. To enable this pod adapter you can specify the runtime property `druid.indexer.runner.k8s.adapter.type: overlordMultiContainer`
### Custom Template Pod Adapter
The custom template pod adapter allows you to specify a pod template file per task type. This adapter requires you to specify a `base` pod spec which will be used in the case that a task specific pod spec has not been defined. To enable this pod adapter you can specify the runtime property `druid.indexer.runner.k8s.adapter.type: customTemplateAdapter`
The base pod template must be specified as the runtime property `druid.indexer.runner.k8s.podTemplate.base: /path/to/basePodSpec.yaml`
Task specific pod templates must be specified as the runtime property `druid.indexer.runner.k8s.podTemplate.{taskType}: /path/to/taskSpecificPodSpec.yaml` where {taskType} is the name of the task type i.e `index_parallel`
## Configuration
To use this extension please make sure to [include](../../configuration/extensions.md#loading-extensions)`druid-kubernetes-overlord-extensions` in the extensions load list for your overlord process.
The extension uses the task queue to limit how many concurrent tasks (K8s jobs) are in flight so it is required you have a reasonable value for `druid.indexer.queue.maxSize`. Additionally set the variable `druid.indexer.runner.namespace` to the namespace in which you are running druid.
The extension uses `druid.indexer.runner.capacity` to limit the number of k8s jobs in flight. A good initial value for this would be the sum of the total task slots of all the middle managers you were running before switching to K8s based ingestion. The K8s task runner uses one thread per Job that is created, so setting this number too large can cause memory issues on the overlord. Additionally set the variable `druid.indexer.runner.namespace` to the namespace in which you are running druid.
Other configurations required are:
`druid.indexer.runner.type: k8s` and `druid.indexer.task.encapsulatedTask: true`
You can add optional labels to your K8s jobs / pods if you need them by using the following configuration:
`druid.indexer.runner.labels: '{"key":"value"}'`
Annotations are the same with:
`druid.indexer.runner.annotations: '{"key":"value"}'`
## Pod Adapters
The logic defining how the pod template is built for your Kubernetes Job depends on which pod adapter you have specified.
All other configurations you had for the middle manager tasks must be moved under the overlord with one caveat, you must specify javaOpts as an array:
`druid.indexer.runner.javaOptsArray`, `druid.indexer.runner.javaOpts` is no longer supported.
### Overlord Single Container Pod Adapter/Overlord Multi Container Pod Adapter
The overlord single container pod adapter takes the podSpec of your `Overlord` pod and creates a Kubernetes job from this podSpec. This is the default pod adapter implementation; to explicitly enable it, specify the runtime property `druid.indexer.runner.k8s.adapter.type: overlordSingleContainer`.
If you are running without a middle manager you need to also use `druid.processing.intermediaryData.storage.type=deepstore`
The overlord multi container pod adapter takes the podSpec of your `Overlord` pod and creates a Kubernetes job from this podSpec. It uses kubexit to manage dependency ordering between the main container that runs your Druid peon and other sidecars defined in the `Overlord` pod spec, so sidecars such as Splunk or Istio are handled. To enable this pod adapter, specify the runtime property `druid.indexer.runner.k8s.adapter.type: overlordMultiContainer`.
Additional Configuration
For the sidecar support to work for the multi container pod adapter, your entry point / command in Docker must be explicitly defined in your spec.
### Properties
|Property|Possible Values|Description|Default|required|
|--------|---------------|-----------|-------|--------|
|`druid.indexer.runner.debugJobs`|`boolean`|Clean up K8s jobs after tasks complete.|False|No|
|`druid.indexer.runner.sidecarSupport`|`boolean`|Deprecated, specify adapter type as runtime property `druid.indexer.runner.k8s.adapter.type: overlordMultiContainer` instead. If your overlord pod has sidecars, this will attempt to start the task with the same sidecars as the overlord pod.|False|No|
|`druid.indexer.runner.primaryContainerName`|`String`|If running with sidecars, the `primaryContainerName` should be that of your druid container like `druid-overlord`.|First container in `podSpec` list|No|
|`druid.indexer.runner.kubexitImage`|`String`|Used kubexit project to help shutdown sidecars when the main pod completes. Otherwise jobs with sidecars never terminate.|karlkfi/kubexit:latest|No|
|`druid.indexer.runner.disableClientProxy`|`boolean`|Use this if you have a global http(s) proxy and you wish to bypass it.|false|No|
|`druid.indexer.runner.maxTaskDuration`|`Duration`|Max time a task is allowed to run for before getting killed|`PT4H`|No|
|`druid.indexer.runner.taskCleanupDelay`|`Duration`|How long do jobs stay around before getting reaped from K8s|`P2D`|No|
|`druid.indexer.runner.taskCleanupInterval`|`Duration`|How often to check for jobs to be reaped|`PT10M`|No|
|`druid.indexer.runner.K8sjobLaunchTimeout`|`Duration`|How long to wait to launch a K8s task before marking it as failed, on a resource constrained cluster it may take some time.|`PT1H`|No|
|`druid.indexer.runner.javaOptsArray`|`JsonArray`|java opts for the task.|`-Xmx1g`|No|
|`druid.indexer.runner.labels`|`JsonObject`|Additional labels you want to add to peon pod|`{}`|No|
|`druid.indexer.runner.annotations`|`JsonObject`|Additional annotations you want to add to peon pod|`{}`|No|
|`druid.indexer.runner.peonMonitors`|`JsonArray`|Overrides `druid.monitoring.monitors`. Use this property if you don't want to inherit monitors from the Overlord.|`[]`|No|
|`druid.indexer.runner.graceTerminationPeriodSeconds`|`Long`|Number of seconds you want to wait after a sigterm for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods.|`PT30S` (K8s default)|No|
### Gotchas
- You must have in your role the ability to launch jobs.
- All Druid Pods belonging to one Druid cluster must be inside same kubernetes namespace.
- For the sidecar support to work, your entry point / command in docker must be explicitly defined your spec.
You can't have something like this:
Dockerfile:
``` ENTRYPOINT: ["foo.sh"] ```
and in your sidecar specs:
``` container:
name: foo
args:
@ -102,11 +62,11 @@ and in your sidecar specs:
- arg2
```
That will not work, because we cannot decipher what your command is; the extension needs to know it explicitly.
*Even for sidecars like Istio, which are dynamically created by the service mesh, this needs to happen.*
Instead do the following:
You can keep your Dockerfile the same but you must have a sidecar spec like so:
``` container:
name: foo
command: foo.sh
@ -115,33 +75,195 @@ You can keep your Dockerfile the same but you must have a sidecar spec like so:
- arg2
```
The following roles must also be accessible. An example spec could be:
For both of these adapters, you can add optional labels to your K8s jobs / pods if you need them by using the following configuration:
`druid.indexer.runner.labels: '{"key":"value"}'`
Annotations are the same with:
`druid.indexer.runner.annotations: '{"key":"value"}'`
All other configurations you had for the middle manager tasks must be moved under the overlord, with one caveat: you must specify javaOpts as an array.
Use `druid.indexer.runner.javaOptsArray`; `druid.indexer.runner.javaOpts` is no longer supported.
If you are running without a middle manager you need to also use `druid.processing.intermediaryData.storage.type=deepstore`
### Custom Template Pod Adapter
The custom template pod adapter allows you to specify a pod template file per task type for more flexibility on how to define your pods. This adapter expects a [Pod Template](https://kubernetes.io/docs/concepts/workloads/pods/#pod-templates) to be available on the overlord's file system. This pod template is used as the base of the pod spec for the Kubernetes Job. You can override things like labels, environment variables, resources, annotation, or even the base image with this template. To enable this pod adapter you can specify the runtime property `druid.indexer.runner.k8s.adapter.type: customTemplateAdapter`
The base pod template must be specified as the runtime property `druid.indexer.runner.k8s.podTemplate.base: /path/to/basePodSpec.yaml`
Task specific pod templates can be specified as the runtime property `druid.indexer.runner.k8s.podTemplate.{taskType}: /path/to/taskSpecificPodSpec.yaml` where {taskType} is the name of the task type i.e `index_parallel`
The following is an example Pod Template that uses the regular druid docker image.
```
apiVersion: "v1"
kind: "PodTemplate"
template:
metadata:
annotations:
sidecar.istio.io/proxyCPU: "512m" # to handle a injected istio sidecar
labels:
app.kubernetes.io/name: "druid-realtime-backend"
spec:
affinity: {}
containers:
- command:
- sh
- -c
- |
/peon.sh /druid/data 1
env:
- name: CUSTOM_ENV_VARIABLE
value: "hello"
image: apache/druid:{{DRUIDVERSION}}
name: main
ports:
- containerPort: 8091
name: druid-tls-port
protocol: TCP
- containerPort: 8100
name: druid-port
protocol: TCP
resources:
limits:
cpu: "1"
memory: 2400M
requests:
cpu: "1"
memory: 2400M
volumeMounts:
- mountPath: /opt/druid/conf/druid/cluster/master/coordinator-overlord # runtime props are still mounted in this location because that's where peon.sh looks for configs
name: nodetype-config-volume
readOnly: true
- mountPath: /druid/data
name: data-volume
- mountPath: /druid/deepstorage
name: deepstorage-volume
restartPolicy: "Never"
securityContext:
fsGroup: 1000
runAsGroup: 1000
runAsUser: 1000
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- configMap:
defaultMode: 420
name: druid-tiny-cluster-peons-config
name: nodetype-config-volume
- emptyDir: {}
name: data-volume
- emptyDir: {}
name: deepstorage-volume
```
The below runtime properties need to be passed to the Job's peon process.
```
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
druid.port=8100 (what port the peon should run on)
druid.peon.mode=remote
druid.service=druid/peon (for metrics reporting)
druid.indexer.task.baseTaskDir=/druid/data (this should match the argument to the ./peon.sh run command in the PodTemplate)
druid.indexer.runner.type=k8s
druid.indexer.task.encapsulatedTask=true
```
Any runtime property or JVM config used by the peon process can also be passed. For example, below is a ConfigMap that can be used to generate the `nodetype-config-volume` mount in the above template.
```
kind: ConfigMap
metadata:
name: druid-cluster
name: druid-tiny-cluster-peons-config
namespace: default
apiVersion: v1
data:
jvm.config: |-
-server
-XX:MaxDirectMemorySize=1000M
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Dlog4j.debug
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/druid/data
-Xmx1024M
-Xms1024M
log4j2.xml: |-
<?xml version="1.0" encoding="UTF-8" ?>
<Configuration status="WARN">
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/>
</Console>
</Appenders>
<Loggers>
<Root level="info">
<AppenderRef ref="Console"/>
</Root>
</Loggers>
</Configuration>
runtime.properties: |
druid.port=8100
druid.service=druid/peon
druid.server.http.numThreads=5
druid.indexer.task.baseTaskDir=/druid/data
druid.indexer.runner.type=k8s
druid.peon.mode=remote
druid.indexer.task.encapsulatedTask=true
```
### Properties
|Property| Possible Values | Description |Default|required|
|--------|-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------|--------|
|`druid.indexer.runner.debugJobs`| `boolean` | Clean up K8s jobs after tasks complete. |False|No|
|`druid.indexer.runner.sidecarSupport`| `boolean` | Deprecated, specify adapter type as runtime property `druid.indexer.runner.k8s.adapter.type: overlordMultiContainer` instead. If your overlord pod has sidecars, this will attempt to start the task with the same sidecars as the overlord pod. |False|No|
|`druid.indexer.runner.primaryContainerName`| `String` | If running with sidecars, the `primaryContainerName` should be that of your druid container like `druid-overlord`. |First container in `podSpec` list|No|
|`druid.indexer.runner.kubexitImage`| `String` | Used kubexit project to help shutdown sidecars when the main pod completes. Otherwise jobs with sidecars never terminate. |karlkfi/kubexit:latest|No|
|`druid.indexer.runner.disableClientProxy`| `boolean` | Use this if you have a global http(s) proxy and you wish to bypass it. |false|No|
|`druid.indexer.runner.maxTaskDuration`| `Duration` | Max time a task is allowed to run for before getting killed |`PT4H`|No|
|`druid.indexer.runner.taskCleanupDelay`| `Duration` | How long do jobs stay around before getting reaped from K8s |`P2D`|No|
|`druid.indexer.runner.taskCleanupInterval`| `Duration` | How often to check for jobs to be reaped |`PT10M`|No|
|`druid.indexer.runner.K8sjobLaunchTimeout`| `Duration` | How long to wait to launch a K8s task before marking it as failed, on a resource constrained cluster it may take some time. |`PT1H`|No|
|`druid.indexer.runner.javaOptsArray`| `JsonArray` | java opts for the task. |`-Xmx1g`|No|
|`druid.indexer.runner.labels`| `JsonObject` | Additional labels you want to add to peon pod |`{}`|No|
|`druid.indexer.runner.annotations`| `JsonObject` | Additional annotations you want to add to peon pod |`{}`|No|
|`druid.indexer.runner.peonMonitors`| `JsonArray` | Overrides `druid.monitoring.monitors`. Use this property if you don't want to inherit monitors from the Overlord. |`[]`|No|
|`druid.indexer.runner.graceTerminationPeriodSeconds`| `Long` | Number of seconds you want to wait after a sigterm for container lifecycle hooks to complete. Keep at a smaller value if you want tasks to hold locks for shorter periods. |`PT30S` (K8s default)|No|
|`druid.indexer.runner.capacity`| `Integer` | Number of concurrent jobs that can be sent to Kubernetes. |`2147483647`|No|
### Gotchas
- All Druid Pods belonging to one Druid cluster must be inside the same Kubernetes namespace.
- You must have a role binding for the overlord's service account that provides the needed permissions for interacting with Kubernetes. An example spec could be:
```
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
namespace: <druid-namespace>
name: druid-k8s-task-scheduler
rules:
- apiGroups:
- ""
- batch
resources:
- pods
- configmaps
- jobs
verbs:
- '*'
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "watch", "list", "delete", "create"]
- apiGroups: [""]
resources: ["pods", "pods/log"]
verbs: ["get", "watch", "list", "delete", "create"]
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: druid-cluster
name: druid-k8s-binding
namespace: <druid-namespace>
subjects:
- kind: ServiceAccount
name: default
- kind: ServiceAccount
name: <druid-overlord-k8s-service-account>
namespace: <druid-namespace>
roleRef:
kind: Role
name: druid-cluster
name: druid-k8s-task-scheduler
apiGroup: rbac.authorization.k8s.io
```

View File

@ -59,7 +59,7 @@ You also need to include the [Hadoop AWS module](https://hadoop.apache.org/docs/
Run the below command to install the `hadoop-aws.jar` file under `${DRUID_HOME}/extensions/druid-hdfs-storage` in all nodes.
```bash
java -classpath "${DRUID_HOME}lib/*" org.apache.druid.cli.Main tools pull-deps -h "org.apache.hadoop:hadoop-aws:${HADOOP_VERSION}";
${DRUID_HOME}/bin/run-java -classpath "${DRUID_HOME}/lib/*" org.apache.druid.cli.Main tools pull-deps -h "org.apache.hadoop:hadoop-aws:${HADOOP_VERSION}";
cp ${DRUID_HOME}/hadoop-dependencies/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar ${DRUID_HOME}/extensions/druid-hdfs-storage/
```

View File

@ -195,7 +195,7 @@ The `tuningConfig` is optional and default parameters will be used if no `tuning
| Field | Type | Description | Required |
|-----------------------------------|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------|
| `type` | String | The indexing task type, this should always be `kafka`. | yes |
| `maxRowsInMemory` | Integer | The number of rows to aggregate before persisting. This number is the post-aggregation rows, so it is not equivalent to the number of input events, but the number of aggregated rows that those events result in. This is used to manage the required JVM heap size. Maximum heap memory usage for indexing scales with `maxRowsInMemory` * (2 + `maxPendingPersists`). Normally user does not need to set this, but depending on the nature of data, if rows are short in terms of bytes, user may not want to store a million rows in memory and this value should be set. | no (default == 1000000) |
| `maxRowsInMemory` | Integer | The number of rows to aggregate before persisting. This number is the post-aggregation rows, so it is not equivalent to the number of input events, but the number of aggregated rows that those events result in. This is used to manage the required JVM heap size. Maximum heap memory usage for indexing scales with `maxRowsInMemory` * (2 + `maxPendingPersists`). Normally user does not need to set this, but depending on the nature of data, if rows are short in terms of bytes, user may not want to store a million rows in memory and this value should be set. | no (default == 150000) |
| `maxBytesInMemory` | Long | The number of bytes to aggregate in heap memory before persisting. This is based on a rough estimate of memory usage and not actual usage. Normally this is computed internally and user does not need to set it. The maximum heap memory usage for indexing is `maxBytesInMemory` * (2 + `maxPendingPersists`). | no (default == One-sixth of max JVM memory) |
| `maxRowsPerSegment` | Integer | The number of rows to aggregate into a segment; this number is post-aggregation rows. Handoff will happen either if `maxRowsPerSegment` or `maxTotalRows` is hit or every `intermediateHandoffPeriod`, whichever happens earlier. | no (default == 5000000) |
| `maxTotalRows` | Long | The number of rows to aggregate across all segments; this number is post-aggregation rows. Handoff will happen either if `maxRowsPerSegment` or `maxTotalRows` is hit or every `intermediateHandoffPeriod`, whichever happens earlier. | no (default == 20000000) |
@ -204,11 +204,9 @@ The `tuningConfig` is optional and default parameters will be used if no `tuning
| `indexSpec` | Object | Tune how data is indexed. See [IndexSpec](#indexspec) for more information. | no |
| `indexSpecForIntermediatePersists`| | Defines segment storage format options to be used at indexing time for intermediate persisted temporary segments. This can be used to disable dimension/metric compression on intermediate segments to reduce memory required for final merging. However, disabling compression on intermediate segments might increase page cache use while they are used before getting merged into final segment published, see [IndexSpec](#indexspec) for possible values. | no (default = same as `indexSpec`) |
| `reportParseExceptions` | Boolean | *DEPRECATED*. If true, exceptions encountered during parsing will be thrown and will halt ingestion; if false, unparseable rows and fields will be skipped. Setting `reportParseExceptions` to true will override existing configurations for `maxParseExceptions` and `maxSavedParseExceptions`, setting `maxParseExceptions` to 0 and limiting `maxSavedParseExceptions` to no more than 1. | no (default == false) |
| `handoffConditionTimeout` | Long | Milliseconds to wait for segment handoff. It must be >= 0, where 0 means to wait forever. | no (default == 0) |
| `handoffConditionTimeout` | Long | Number of milliseconds to wait for segment handoff. Set to a value >= 0, where 0 means to wait indefinitely. | no (default == 900000 [15 minutes]) |
| `resetOffsetAutomatically` | Boolean | Controls behavior when Druid needs to read Kafka messages that are no longer available (i.e. when `OffsetOutOfRangeException` is encountered).<br/><br/>If false, the exception will bubble up, which will cause your tasks to fail and ingestion to halt. If this occurs, manual intervention is required to correct the situation; potentially using the [Reset Supervisor API](../../api-reference/supervisor-api.md). This mode is useful for production, since it will make you aware of issues with ingestion.<br/><br/>If true, Druid will automatically reset to the earlier or latest offset available in Kafka, based on the value of the `useEarliestOffset` property (earliest if true, latest if false). Note that this can lead to data being _DROPPED_ (if `useEarliestOffset` is false) or _DUPLICATED_ (if `useEarliestOffset` is true) without your knowledge. Messages will be logged indicating that a reset has occurred, but ingestion will continue. This mode is useful for non-production situations, since it will make Druid attempt to recover from problems automatically, even if they lead to quiet dropping or duplicating of data.<br/><br/>This feature behaves similarly to the Kafka `auto.offset.reset` consumer property. | no (default == false) |
| `workerThreads` | Integer | The number of threads that the supervisor uses to handle requests/responses for worker tasks, along with any other internal asynchronous operation. | no (default == min(10, taskCount)) |
| `chatAsync` | Boolean | If true, use asynchronous communication with indexing tasks, and ignore the `chatThreads` parameter. If false, use synchronous communication in a thread pool of size `chatThreads`. | no (default == true) |
| `chatThreads` | Integer | The number of threads that will be used for communicating with indexing tasks. Ignored if `chatAsync` is `true` (the default). | no (default == min(10, taskCount * replicas)) |
| `chatRetries` | Integer | The number of times HTTP requests to indexing tasks will be retried before considering tasks unresponsive. | no (default == 8) |
| `httpTimeout` | ISO8601 Period | How long to wait for an HTTP response from an indexing task. | no (default == PT10S) |
| `shutdownTimeout` | ISO8601 Period | How long to wait for the supervisor to attempt a graceful shutdown of tasks before exiting. | no (default == PT80S) |
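
For orientation, a minimal `tuningConfig` sketch combining several of the settings above might look like the following; the values shown are illustrative placeholders rather than recommendations.

```json
{
  "type": "kafka",
  "maxRowsPerSegment": 5000000,
  "maxTotalRows": 20000000,
  "handoffConditionTimeout": 900000,
  "resetOffsetAutomatically": false,
  "chatRetries": 8,
  "httpTimeout": "PT10S",
  "shutdownTimeout": "PT80S"
}
```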


@ -23,154 +23,258 @@ sidebar_label: "Amazon Kinesis"
~ under the License.
-->
When you enable the Kinesis indexing service, you can configure *supervisors* on the Overlord to manage the creation and lifetime of Kinesis indexing tasks. These indexing tasks read events using Kinesis' own shard and sequence number mechanism to guarantee exactly-once ingestion. The supervisor oversees the state of the indexing tasks to:
When you enable the Kinesis indexing service, you can configure supervisors on the Overlord to manage the creation and lifetime of Kinesis indexing tasks. These indexing tasks read events using Kinesis' own shard and sequence number mechanism to guarantee exactly-once ingestion. The supervisor oversees the state of the indexing tasks to coordinate handoffs, manage failures, and ensure that scalability and replication requirements are maintained.
- coordinate handoffs
- manage failures
- ensure that scalability and replication requirements are maintained.
This topic contains configuration reference information for the Kinesis indexing service supervisor for Apache Druid.
To use the Kinesis indexing service, load the `druid-kinesis-indexing-service` core Apache Druid extension (see
[Including Extensions](../../configuration/extensions.md#loading-extensions)).
## Setup
> Before you deploy the Kinesis extension to production, read the [Kinesis known issues](#kinesis-known-issues).
To use the Kinesis indexing service, you must first load the `druid-kinesis-indexing-service` core extension on both the Overlord and the Middle Manager. See [Loading extensions](../../configuration/extensions.md#loading-extensions) for more information.
Review the [Kinesis known issues](#kinesis-known-issues) before deploying the `druid-kinesis-indexing-service` extension to production.
## Submitting a Supervisor Spec
## Supervisor spec
To use the Kinesis indexing service, load the `druid-kinesis-indexing-service` extension on both the Overlord and the MiddleManagers. Druid starts a supervisor for a dataSource when you submit a supervisor spec. Submit your supervisor spec to the following endpoint:
The following table outlines the high-level configuration options for the Kinesis supervisor object.
See [Supervisor API](../../api-reference/supervisor-api.md) for more information.
`http://<OVERLORD_IP>:<OVERLORD_PORT>/druid/indexer/v1/supervisor`
|Property|Type|Description|Required|
|--------|----|-----------|--------|
|`type`|String|The supervisor type; this should always be `kinesis`.|Yes|
|`spec`|Object|The container object for the supervisor configuration.|Yes|
|`ioConfig`|Object|The [I/O configuration](#supervisor-io-configuration) object for configuring Kinesis connection and I/O-related settings for the supervisor and indexing task.|Yes|
|`dataSchema`|Object|The schema used by the Kinesis indexing task during ingestion. See [`dataSchema`](../../ingestion/ingestion-spec.md#dataschema) for more information.|Yes|
|`tuningConfig`|Object|The [tuning configuration](#supervisor-tuning-configuration) object for configuring performance-related settings for the supervisor and indexing tasks.|No|
For example:
Druid starts a new supervisor when you define a supervisor spec.
To create a supervisor, send a `POST` request to the `/druid/indexer/v1/supervisor` endpoint.
Once created, the supervisor persists in the configured metadata database. There can only be a single supervisor per datasource, and submitting a second spec for the same datasource overwrites the previous one.
```sh
curl -X POST -H 'Content-Type: application/json' -d @supervisor-spec.json http://localhost:8090/druid/indexer/v1/supervisor
When an Overlord gains leadership, either by being started or as a result of another Overlord failing, it spawns
a supervisor for each supervisor spec in the metadata database. The supervisor then discovers running Kinesis indexing
tasks and attempts to adopt them if they are compatible with the supervisor's configuration. If they are not
compatible because they have a different ingestion spec or shard allocation, the tasks are killed and the
supervisor creates a new set of tasks. In this way, the supervisors persist across Overlord restarts and failovers.
The following example shows how to submit a supervisor spec for a stream with the name `KinesisStream`.
In this example, `http://SERVICE_IP:SERVICE_PORT` is a placeholder for the server address and service port of your deployment.
<!--DOCUSAURUS_CODE_TABS-->
<!--cURL-->
```shell
curl -X POST "http://SERVICE_IP:SERVICE_PORT/druid/indexer/v1/supervisor" \
-H "Content-Type: application/json" \
-d '{
"type": "kinesis",
"spec": {
"ioConfig": {
"type": "kinesis",
"stream": "KinesisStream",
"inputFormat": {
"type": "json"
},
"useEarliestSequenceNumber": true
},
"tuningConfig": {
"type": "kinesis"
},
"dataSchema": {
"dataSource": "KinesisStream",
"timestampSpec": {
"column": "timestamp",
"format": "iso"
},
"dimensionsSpec": {
"dimensions": [
"isRobot",
"channel",
"flags",
"isUnpatrolled",
"page",
"diffUrl",
{
"type": "long",
"name": "added"
},
"comment",
{
"type": "long",
"name": "commentLength"
},
"isNew",
"isMinor",
{
"type": "long",
"name": "delta"
},
"isAnonymous",
"user",
{
"type": "long",
"name": "deltaBucket"
},
{
"type": "long",
"name": "deleted"
},
"namespace",
"cityName",
"countryName",
"regionIsoCode",
"metroCode",
"countryIsoCode",
"regionName"
]
},
"granularitySpec": {
"queryGranularity": "none",
"rollup": false,
"segmentGranularity": "hour"
}
}
}
}'
```
<!--HTTP-->
```HTTP
POST /druid/indexer/v1/supervisor
HTTP/1.1
Host: http://SERVICE_IP:SERVICE_PORT
Content-Type: application/json
Where the file `supervisor-spec.json` contains a Kinesis supervisor spec:
```json
{
"type": "kinesis",
"spec": {
"ioConfig": {
"type": "kinesis",
"stream": "KinesisStream",
"inputFormat": {
"type": "json"
},
"useEarliestSequenceNumber": true
},
"tuningConfig": {
"type": "kinesis"
},
"dataSchema": {
"dataSource": "metrics-kinesis",
"dataSource": "KinesisStream",
"timestampSpec": {
"column": "timestamp",
"format": "auto"
"format": "iso"
},
"dimensionsSpec": {
"dimensions": [],
"dimensionExclusions": [
"timestamp",
"value"
]
"dimensionsSpec": {
"dimensions": [
"isRobot",
"channel",
"flags",
"isUnpatrolled",
"page",
"diffUrl",
{
"type": "long",
"name": "added"
},
"comment",
{
"type": "long",
"name": "commentLength"
},
"isNew",
"isMinor",
{
"type": "long",
"name": "delta"
},
"isAnonymous",
"user",
{
"type": "long",
"name": "deltaBucket"
},
{
"type": "long",
"name": "deleted"
},
"namespace",
"cityName",
"countryName",
"regionIsoCode",
"metroCode",
"countryIsoCode",
"regionName"
]
},
"metricsSpec": [
{
"name": "count",
"type": "count"
},
{
"name": "value_sum",
"fieldName": "value",
"type": "doubleSum"
},
{
"name": "value_min",
"fieldName": "value",
"type": "doubleMin"
},
{
"name": "value_max",
"fieldName": "value",
"type": "doubleMax"
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "HOUR",
"queryGranularity": "NONE"
}
},
"ioConfig": {
"stream": "metrics",
"inputFormat": {
"type": "json"
},
"endpoint": "kinesis.us-east-1.amazonaws.com",
"taskCount": 1,
"replicas": 1,
"taskDuration": "PT1H"
},
"tuningConfig": {
"type": "kinesis",
"maxRowsPerSegment": 5000000
}
"granularitySpec": {
"queryGranularity": "none",
"rollup": false,
"segmentGranularity": "hour"
}
}
}
}
```
<!--END_DOCUSAURUS_CODE_TABS-->
## Supervisor Spec
## Supervisor I/O configuration
|Field|Description|Required|
|--------|-----------|---------|
|`type`|The supervisor type; this should always be `kinesis`.|yes|
|`spec`|Container object for the supervisor configuration.|yes|
|`dataSchema`|The schema that will be used by the Kinesis indexing task during ingestion. See [`dataSchema`](../../ingestion/ingestion-spec.md#dataschema).|yes|
|`ioConfig`|An [`ioConfig`](#ioconfig) object for configuring Kinesis connection and I/O-related settings for the supervisor and indexing task.|yes|
|`tuningConfig`|A [`tuningConfig`](#tuningconfig) object for configuring performance-related settings for the supervisor and indexing tasks.|no|
The following table outlines the configuration options for `ioConfig`:
### `ioConfig`
|Property|Type|Description|Required|Default|
|--------|----|-----------|--------|-------|
|`stream`|String|The Kinesis stream to read.|Yes||
|`inputFormat`|Object|The [input format](../../ingestion/data-formats.md#input-format) to specify how to parse input data. See [Specify data format](#specify-data-format) for more information.|Yes||
|`endpoint`|String|The AWS Kinesis stream endpoint for a region. You can find a list of endpoints in the [AWS service endpoints](http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region) document.|No|`kinesis.us-east-1.amazonaws.com`|
|`replicas`|Integer|The number of replica sets, where 1 is a single set of tasks (no replication). Druid always assigns replicate tasks to different workers to provide resiliency against process failure.|No|1|
|`taskCount`|Integer|The maximum number of reading tasks in a replica set. Multiply `taskCount` by `replicas` to determine the maximum total number of reading tasks. <br />The total number of tasks (reading and publishing) is higher than the maximum number of reading tasks. See [Capacity planning](#capacity-planning) for more details. When `taskCount > {numKinesisShards}`, the actual number of reading tasks is less than the `taskCount` value.|No|1|
|`taskDuration`|ISO 8601 period|The length of time before tasks stop reading and begin publishing their segments.|No|PT1H|
|`startDelay`|ISO 8601 period|The period to wait before the supervisor starts managing tasks.|No|PT5S|
|`period`|ISO 8601 period|Determines how often the supervisor executes its management logic. Note that the supervisor also runs in response to certain events, such as tasks succeeding, failing, and reaching their task duration, so this value specifies the maximum time between iterations.|No|PT30S|
|`useEarliestSequenceNumber`|Boolean|If a supervisor is managing a datasource for the first time, it obtains a set of starting sequence numbers from Kinesis. This flag determines whether a supervisor retrieves the earliest or latest sequence numbers in Kinesis. Under normal circumstances, subsequent tasks start from where the previous segments ended so this flag is only used on the first run.|No|`false`|
|`completionTimeout`|ISO 8601 period|The length of time to wait before Druid declares a publishing task has failed and terminates it. If this is set too low, your tasks may never publish. The publishing clock for a task begins roughly after `taskDuration` elapses.|No|PT6H|
|`lateMessageRejectionPeriod`|ISO 8601 period|Configure tasks to reject messages with timestamps earlier than this period before the task is created. For example, if `lateMessageRejectionPeriod` is set to `PT1H` and the supervisor creates a task at `2016-01-01T12:00Z`, messages with timestamps earlier than `2016-01-01T11:00Z` are dropped. This may help prevent concurrency issues if your data stream has late messages and you have multiple pipelines that need to operate on the same segments, such as a streaming and a nightly batch ingestion pipeline.|No||
|`earlyMessageRejectionPeriod`|ISO 8601 period|Configure tasks to reject messages with timestamps later than this period after the task reached its `taskDuration`. For example, if `earlyMessageRejectionPeriod` is set to `PT1H`, the `taskDuration` is set to `PT1H` and the supervisor creates a task at `2016-01-01T12:00Z`. Messages with timestamps later than `2016-01-01T14:00Z` are dropped. **Note:** Tasks sometimes run past their task duration, for example, in cases of supervisor failover. Setting `earlyMessageRejectionPeriod` too low may cause messages to be dropped unexpectedly whenever a task runs past its originally configured task duration.|No||
|`recordsPerFetch`|Integer|The number of records to request per call to fetch records from Kinesis.|No| See [Determine fetch settings](#determine-fetch-settings) for defaults.|
|`fetchDelayMillis`|Integer|Time in milliseconds to wait between subsequent calls to fetch records from Kinesis. See [Determine fetch settings](#determine-fetch-settings).|No|0|
|`awsAssumedRoleArn`|String|The AWS assumed role to use for additional permissions.|No||
|`awsExternalId`|String|The AWS external ID to use for additional permissions.|No||
|`deaggregate`|Boolean|Whether to use the deaggregate function of the Kinesis Client Library (KCL).|No||
|`autoScalerConfig`|Object|Defines autoscaling behavior for Kinesis ingest tasks. See [Task autoscaler properties](#task-autoscaler-properties) for more information.|No|null|
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|`stream`|String|The Kinesis stream to read.|yes|
|`inputFormat`|Object|[`inputFormat`](../../ingestion/data-formats.md#input-format) to specify how to parse input data. See [Specifying data format](#specifying-data-format) for details about specifying the input format.|yes|
|`endpoint`|String|The AWS Kinesis stream endpoint for a region. You can find a list of endpoints [here](http://docs.aws.amazon.com/general/latest/gr/rande.html#ak_region).|no (default == kinesis.us-east-1.amazonaws.com)|
|`replicas`|Integer|The number of replica sets, where 1 means a single set of tasks (no replication). Replica tasks will always be assigned to different workers to provide resiliency against process failure.|no (default == 1)|
|`taskCount`|Integer|The maximum number of *reading* tasks in a *replica set*. This means that the maximum number of reading tasks will be `taskCount * replicas` and the total number of tasks (*reading* + *publishing*) will be higher than this. See [Capacity Planning](#capacity-planning) below for more details. The number of reading tasks will be less than `taskCount` if `taskCount > {numKinesisShards}`.|no (default == 1)|
|`taskDuration`|ISO8601 Period|The length of time before tasks stop reading and begin publishing their segment.|no (default == PT1H)|
|`startDelay`|ISO8601 Period|The period to wait before the supervisor starts managing tasks.|no (default == PT5S)|
|`period`|ISO8601 Period|How often the supervisor will execute its management logic. Note that the supervisor will also run in response to certain events (such as tasks succeeding, failing, and reaching their taskDuration) so this value specifies the maximum time between iterations.|no (default == PT30S)|
|`useEarliestSequenceNumber`|Boolean|If a supervisor is managing a dataSource for the first time, it will obtain a set of starting sequence numbers from Kinesis. This flag determines whether it retrieves the earliest or latest sequence numbers in Kinesis. Under normal circumstances, subsequent tasks will start from where the previous segments ended so this flag will only be used on first run.|no (default == false)|
|`completionTimeout`|ISO8601 Period|The length of time to wait before declaring a publishing task as failed and terminating it. If this is set too low, your tasks may never publish. The publishing clock for a task begins roughly after `taskDuration` elapses.|no (default == PT6H)|
|`lateMessageRejectionPeriod`|ISO8601 Period|Configure tasks to reject messages with timestamps earlier than this period before the task was created; for example if this is set to `PT1H` and the supervisor creates a task at *2016-01-01T12:00Z*, messages with timestamps earlier than *2016-01-01T11:00Z* will be dropped. This may help prevent concurrency issues if your data stream has late messages and you have multiple pipelines that need to operate on the same segments (e.g. a streaming and a nightly batch ingestion pipeline).|no (default == none)|
|`earlyMessageRejectionPeriod`|ISO8601 Period|Configure tasks to reject messages with timestamps later than this period after the task reached its taskDuration; for example if this is set to `PT1H`, the taskDuration is set to `PT1H` and the supervisor creates a task at *2016-01-01T12:00Z*. Messages with timestamps later than *2016-01-01T14:00Z* will be dropped. **Note:** Tasks sometimes run past their task duration, for example, in cases of supervisor failover. Setting `earlyMessageRejectionPeriod` too low may cause messages to be dropped unexpectedly whenever a task runs past its originally configured task duration.|no (default == none)|
|`recordsPerFetch`|Integer|The number of records to request per call to fetch records from Kinesis. See [Determining fetch settings](#determining-fetch-settings).|no (see [Determining fetch settings](#determining-fetch-settings) for defaults)|
|`fetchDelayMillis`|Integer|Time in milliseconds to wait between subsequent calls to fetch records from Kinesis. See [Determining fetch settings](#determining-fetch-settings).|no (default == 0)|
|`awsAssumedRoleArn`|String|The AWS assumed role to use for additional permissions.|no|
|`awsExternalId`|String|The AWS external id to use for additional permissions.|no|
|`deaggregate`|Boolean|Whether to use the de-aggregate function of the KCL. See below for details.|no|
|`autoScalerConfig`|Object|Defines auto scaling behavior for Kinesis ingest tasks. See [Tasks Autoscaler Properties](#task-autoscaler-properties).|no (default == null)|
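
To tie these options together, the following is a minimal `ioConfig` sketch; the stream name is a placeholder and only a handful of the optional fields are shown.

```json
{
  "type": "kinesis",
  "stream": "KinesisStream",
  "inputFormat": {
    "type": "json"
  },
  "endpoint": "kinesis.us-east-1.amazonaws.com",
  "taskCount": 1,
  "replicas": 1,
  "taskDuration": "PT1H",
  "useEarliestSequenceNumber": true
}
```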
### Task autoscaler properties
#### Task Autoscaler Properties
The following table outlines the configuration options for `autoScalerConfig`:
| Property | Description | Required |
| ------------- | ------------- | ------------- |
| `enableTaskAutoScaler` | Enable or disable the auto scaler. When false or absent, Druid disables the `autoScaler` even when `autoScalerConfig` is not null.| no (default == false) |
| `taskCountMax` | Maximum number of Kinesis ingestion tasks. Must be greater than or equal to `taskCountMin`. If greater than `{numKinesisShards}`, the maximum number of reading tasks is `{numKinesisShards}` and `taskCountMax` is ignored. | yes |
| `taskCountMin` | Minimum number of Kinesis ingestion tasks. When you enable the auto scaler, Druid ignores the value of `taskCount` in `IOConfig` and uses `taskCountMin` for the initial number of tasks to launch.| yes |
| `minTriggerScaleActionFrequencyMillis` | Minimum time interval between two scale actions. | no (default == 600000) |
| `autoScalerStrategy` | The algorithm of `autoScaler`. ONLY `lagBased` is supported for now. See [Lag Based AutoScaler Strategy Related Properties](#lag-based-autoscaler-strategy-related-properties) for details.| no (default == `lagBased`) |
|Property|Description|Required|Default|
|--------|-----------|--------|-------|
|`enableTaskAutoScaler`|Enables the auto scaler. If not specified, Druid disables the auto scaler even when `autoScalerConfig` is not null.|No|`false`|
|`taskCountMax`|Maximum number of Kinesis ingestion tasks. Must be greater than or equal to `taskCountMin`. If greater than `{numKinesisShards}`, Druid sets the maximum number of reading tasks to `{numKinesisShards}` and ignores `taskCountMax`.|Yes||
|`taskCountMin`|Minimum number of Kinesis ingestion tasks. When you enable the auto scaler, Druid ignores the value of `taskCount` in `IOConfig` and uses `taskCountMin` for the initial number of tasks to launch.|Yes||
|`minTriggerScaleActionFrequencyMillis`|Minimum time interval between two scale actions.| No|600000|
|`autoScalerStrategy`|The algorithm of `autoScaler`. Druid only supports the `lagBased` strategy. See [Lag based autoscaler strategy related properties](#lag-based-autoscaler-strategy-related-properties) for more information.|No|Defaults to `lagBased`.|
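
For illustration, an `autoScalerConfig` that scales between one and four tasks might look like the following sketch; the task counts and interval are arbitrary example values, and the `lagBased` strategy options are described in the next section.

```json
{
  "enableTaskAutoScaler": true,
  "taskCountMin": 1,
  "taskCountMax": 4,
  "minTriggerScaleActionFrequencyMillis": 600000,
  "autoScalerStrategy": "lagBased"
}
```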
##### Lag Based AutoScaler Strategy Related Properties
### Lag based autoscaler strategy related properties
The Kinesis indexing service reports lag metrics measured in milliseconds rather than the message count used by Kafka.
Unlike the Kafka indexing service, Kinesis reports lag metrics measured in time difference in milliseconds between the current sequence number and latest sequence number, rather than message count.
| Property | Description | Required |
| ------------- | ------------- | ------------- |
| `lagCollectionIntervalMillis` | Period of lag points collection. | no (default == 30000) |
| `lagCollectionRangeMillis` | The total time window of lag collection. Used together with `lagCollectionIntervalMillis`, it means that within the most recent `lagCollectionRangeMillis`, lag metric points are collected every `lagCollectionIntervalMillis`. | no (default == 600000) |
| `scaleOutThreshold` | The threshold of the scale out action. | no (default == 6000000) |
| `triggerScaleOutFractionThreshold` | If `triggerScaleOutFractionThreshold` percent of lag points are higher than `scaleOutThreshold`, then trigger a scale out action. | no (default == 0.3) |
| `scaleInThreshold` | The threshold of the scale in action. | no (default == 1000000) |
| `triggerScaleInFractionThreshold` | If `triggerScaleInFractionThreshold` percent of lag points are lower than `scaleOutThreshold`, then trigger a scale in action. | no (default == 0.9) |
| `scaleActionStartDelayMillis` | Number of milliseconds to delay after the supervisor starts before the first scale logic check. | no (default == 300000) |
| `scaleActionPeriodMillis` | Frequency in milliseconds at which to check whether a scale action is triggered. | no (default == 60000) |
| `scaleInStep` | Number of tasks to reduce at a time when scaling down. | no (default == 1) |
| `scaleOutStep` | Number of tasks to add at a time when scaling out. | no (default == 2) |
The following table outlines the configuration options for `autoScalerStrategy`:
The following example demonstrates a supervisor spec with `lagBased` autoScaler enabled:
|Property|Description|Required|Default|
|--------|-----------|--------|-------|
|`lagCollectionIntervalMillis`|The time period during which Druid collects lag metric points.|No|30000|
|`lagCollectionRangeMillis`|The total time window of lag collection. Use with `lagCollectionIntervalMillis` to specify the intervals at which to collect lag metric points.|No|600000|
|`scaleOutThreshold`|The threshold of scale out action. |No|6000000|
|`triggerScaleOutFractionThreshold`|Enables scale out action if `triggerScaleOutFractionThreshold` percent of lag points is higher than `scaleOutThreshold`.|No|0.3|
|`scaleInThreshold`|The threshold of scale in action.|No|1000000|
|`triggerScaleInFractionThreshold`|Enables scale in action if `triggerScaleInFractionThreshold` percent of lag points is lower than `scaleOutThreshold`.|No|0.9|
|`scaleActionStartDelayMillis`|The number of milliseconds to delay after the supervisor starts before the first scale logic check.|No|300000|
|`scaleActionPeriodMillis`|The frequency in milliseconds to check if a scale action is triggered.|No|60000|
|`scaleInStep`|The number of tasks to reduce at once when scaling down.|No|1|
|`scaleOutStep`|The number of tasks to add at once when scaling out.|No|2|
The following example shows a supervisor spec with `lagBased` auto scaler enabled.
<details>
<summary>Click to view the example</summary>
```json
{
@ -249,10 +353,12 @@ The following example demonstrates a supervisor spec with `lagBased` autoScaler
}
```
#### Specifying data format
</details>
Kinesis indexing service supports both [`inputFormat`](../../ingestion/data-formats.md#input-format) and [`parser`](../../ingestion/data-formats.md#parser) to specify the data format.
Use the `inputFormat` to specify the data format for Kinesis indexing service unless you need a format only supported by the legacy `parser`.
### Specify data format
The Kinesis indexing service supports both [`inputFormat`](../../ingestion/data-formats.md#input-format) and [`parser`](../../ingestion/data-formats.md#parser) to specify the data format.
Use the `inputFormat` to specify the data format for the Kinesis indexing service unless you need a format only supported by the legacy `parser`.
Supported values for `inputFormat` include:
@ -265,104 +371,83 @@ Supported values for `inputFormat` include:
For more information, see [Data formats](../../ingestion/data-formats.md). You can also read [`thrift`](../extensions-contrib/thrift.md) formats using `parser`.
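
For example, a delimited-text stream could be described with a `csv` input format along the following lines; the column names are hypothetical, and the complete set of options for each format is covered in [Data formats](../../ingestion/data-formats.md).

```json
{
  "type": "csv",
  "columns": ["timestamp", "page", "added"],
  "findColumnsFromHeader": false,
  "skipHeaderRows": 0
}
```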
<a name="tuningconfig"></a>
## Supervisor tuning configuration
### `tuningConfig`
The `tuningConfig` object is optional. If you don't specify the `tuningConfig` object, Druid uses the default configuration settings.
The `tuningConfig` is optional. If no `tuningConfig` is specified, default parameters are used.
The following table outlines the configuration options for `tuningConfig`:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|`type`| String|The indexing task type, this should always be `kinesis`.|yes|
|`maxRowsInMemory`|Integer|The number of rows to aggregate before persisting. This number is the post-aggregation rows, so it is not equivalent to the number of input events, but the number of aggregated rows that those events result in. This is used to manage the required JVM heap size. Maximum heap memory usage for indexing scales with `maxRowsInMemory * (2 + maxPendingPersists)`.|no (default == 100000)|
|`maxBytesInMemory`|Long| The number of bytes to aggregate in heap memory before persisting. This is based on a rough estimate of memory usage and not actual usage. Normally, this is computed internally and user does not need to set it. The maximum heap memory usage for indexing is `maxBytesInMemory * (2 + maxPendingPersists)`.|no (default == One-sixth of max JVM memory)|
|`maxRowsPerSegment`|Integer|The number of rows to aggregate into a segment; this number is post-aggregation rows. Handoff will happen either if `maxRowsPerSegment` or `maxTotalRows` is hit or every `intermediateHandoffPeriod`, whichever happens earlier.|no (default == 5000000)|
|`maxTotalRows`|Long|The number of rows to aggregate across all segments; this number is post-aggregation rows. Handoff will happen either if `maxRowsPerSegment` or `maxTotalRows` is hit or every `intermediateHandoffPeriod`, whichever happens earlier.|no (default == unlimited)|
|`intermediatePersistPeriod`|ISO8601 Period|The period that determines the rate at which intermediate persists occur.|no (default == PT10M)|
|`maxPendingPersists`|Integer|Maximum number of persists that can be pending but not started. If this limit would be exceeded by a new intermediate persist, ingestion will block until the currently-running persist finishes. Maximum heap memory usage for indexing scales with `maxRowsInMemory * (2 + maxPendingPersists)`.|no (default == 0, meaning one persist can be running concurrently with ingestion, and none can be queued up)|
|`indexSpec`|Object|Tune how data is indexed. See [IndexSpec](#indexspec) for more information.|no|
|`indexSpecForIntermediatePersists`|Object|Defines segment storage format options to be used at indexing time for intermediate persisted temporary segments. This can be used to disable dimension/metric compression on intermediate segments to reduce memory required for final merging. However, disabling compression on intermediate segments might increase page cache use while they are used before getting merged into final segment published, see [IndexSpec](#indexspec) for possible values.| no (default = same as `indexSpec`)|
|`reportParseExceptions`|Boolean|If true, exceptions encountered during parsing will be thrown and will halt ingestion; if false, unparseable rows and fields will be skipped.|no (default == false)|
|`handoffConditionTimeout`|Long| Milliseconds to wait for segment handoff. It must be >= 0, where 0 means to wait forever.| no (default == 0)|
|`resetOffsetAutomatically`|Boolean|Controls behavior when Druid needs to read Kinesis messages that are no longer available.<br/><br/>If false, the exception bubbles up, causing tasks to fail and ingestion to halt. If this occurs, manual intervention is required to correct the situation, potentially using the [Reset Supervisor API](../../api-reference/supervisor-api.md). This mode is useful for production, since it highlights issues with ingestion.<br/><br/>If true, Druid automatically resets to the earliest or latest sequence number available in Kinesis, based on the value of the `useEarliestSequenceNumber` property (earliest if true, latest if false). Note that this can lead to data being *DROPPED* (if `useEarliestSequenceNumber` is false) or *DUPLICATED* (if `useEarliestSequenceNumber` is true) without your knowledge. Druid will log messages indicating that a reset has occurred without interrupting ingestion. This mode is useful for non-production situations since it enables Druid to recover from problems automatically, even if they lead to quiet dropping or duplicating of data.|no (default == false)|
|`skipSequenceNumberAvailabilityCheck`|Boolean|Whether to enable checking if the current sequence number is still available in a particular Kinesis shard. If set to false, the indexing task will attempt to reset the current sequence number (or not), depending on the value of `resetOffsetAutomatically`.|no (default == false)|
|`workerThreads`|Integer|The number of threads that the supervisor uses to handle requests/responses for worker tasks, along with any other internal asynchronous operation.|no (default == min(10, taskCount))|
|`chatAsync`|Boolean| If true, the supervisor uses asynchronous communication with indexing tasks and ignores the `chatThreads` parameter. If false, the supervisor uses synchronous communication in a thread pool of size `chatThreads`.| no (default == true)|
|`chatThreads`|Integer| The number of threads that will be used for communicating with indexing tasks. Ignored if `chatAsync` is `true` (the default).| no (default == min(10, taskCount * replicas))|
|`chatRetries`|Integer|The number of times HTTP requests to indexing tasks will be retried before considering tasks unresponsive.| no (default == 8)|
|`httpTimeout`|ISO8601 Period|How long to wait for a HTTP response from an indexing task.|no (default == PT10S)|
|`shutdownTimeout`|ISO8601 Period|How long to wait for the supervisor to attempt a graceful shutdown of tasks before exiting.|no (default == PT80S)|
|`recordBufferSize`|Integer|Size of the buffer (number of events) used between the Kinesis fetch threads and the main ingestion thread.|no (see [Determining fetch settings](#determining-fetch-settings) for defaults)|
|`recordBufferOfferTimeout`|Integer|Length of time in milliseconds to wait for space to become available in the buffer before timing out.| no (default == 5000)|
|`recordBufferFullWait`|Integer|Length of time in milliseconds to wait for the buffer to drain before attempting to fetch records from Kinesis again.|no (default == 5000)|
|`fetchThreads`|Integer|Size of the pool of threads fetching data from Kinesis. There is no benefit in having more threads than Kinesis shards.|no (default == procs * 2, where `procs` is the number of processors available to the task)|
|`segmentWriteOutMediumFactory`|Object|Segment write-out medium to use when creating segments. See below for more information.|no (not specified by default, the value from `druid.peon.defaultSegmentWriteOutMediumFactory.type` is used)|
|`intermediateHandoffPeriod`|ISO8601 Period|How often the tasks should hand off segments. Handoff will happen either if `maxRowsPerSegment` or `maxTotalRows` is hit or every `intermediateHandoffPeriod`, whichever happens earlier.| no (default == P2147483647D)|
|`logParseExceptions`|Boolean|If true, log an error message when a parsing exception occurs, containing information about the row where the error occurred.|no, default == false|
|`maxParseExceptions`|Integer|The maximum number of parse exceptions that can occur before the task halts ingestion and fails. Overridden if `reportParseExceptions` is set.|no, unlimited default|
|`maxSavedParseExceptions`|Integer|When a parse exception occurs, Druid can keep track of the most recent parse exceptions. "maxSavedParseExceptions" limits how many exception instances will be saved. These saved exceptions will be made available after the task finishes in the [task completion report](../../ingestion/tasks.md#task-reports). Overridden if `reportParseExceptions` is set.|no, default == 0|
|`maxRecordsPerPoll`|Integer|The maximum number of records/events to be fetched from buffer per poll. The actual maximum will be `Max(maxRecordsPerPoll, Max(bufferSize, 1))`|no (see [Determining fetch settings](#determining-fetch-settings) for defaults)|
|`repartitionTransitionDuration`|ISO8601 period|When shards are split or merged, the supervisor recomputes shard to task group mappings. The supervisor also signals any running tasks created under the old mappings to stop early at (current time + `repartitionTransitionDuration`). Stopping the tasks early allows Druid to begin reading from the new shards more quickly. The repartition transition wait time controlled by this property gives the stream additional time to write records to the new shards after the split or merge, which helps avoid issues with [empty shard handling](https://github.com/apache/druid/issues/7600).|no, (default == PT2M)|
|`offsetFetchPeriod`|ISO8601 period|How often the supervisor queries Kinesis and the indexing tasks to fetch current offsets and calculate lag. If the user-specified value is below the minimum value (`PT5S`), the supervisor ignores the value and uses the minimum value instead.|no (default == PT30S, min == PT5S)|
|`useListShards`|Boolean|Indicates if `listShards` API of AWS Kinesis SDK can be used to prevent `LimitExceededException` during ingestion. Please note that the necessary `IAM` permissions must be set for this to work.|no (default == false)|
|Property|Type|Description|Required|Default|
|--------|----|-----------|--------|-------|
|`type`|String|The indexing task type. This should always be `kinesis`.|Yes||
|`maxRowsInMemory`|Integer|The number of rows to aggregate before persisting. This number represents the post-aggregation rows. It is not equivalent to the number of input events, but the resulting number of aggregated rows. Druid uses `maxRowsInMemory` to manage the required JVM heap size. The maximum heap memory usage for indexing is `maxRowsInMemory * (2 + maxPendingPersists)`.|No|100000|
|`maxBytesInMemory`|Long| The number of bytes to aggregate in heap memory before persisting. This is based on a rough estimate of memory usage and not actual usage. Normally, this is computed internally. The maximum heap memory usage for indexing is `maxBytesInMemory * (2 + maxPendingPersists)`.|No|One-sixth of max JVM memory|
|`skipBytesInMemoryOverheadCheck`|Boolean|The calculation of `maxBytesInMemory` takes into account overhead objects created during ingestion and each intermediate persist. To exclude the bytes of these overhead objects from the `maxBytesInMemory` check, set `skipBytesInMemoryOverheadCheck` to `true`.|No|`false`|
|`maxRowsPerSegment`|Integer|The number of rows to aggregate into a segment; this number represents the post-aggregation rows. Handoff occurs when `maxRowsPerSegment` or `maxTotalRows` is reached or every `intermediateHandoffPeriod`, whichever happens first.|No|5000000|
|`maxTotalRows`|Long|The number of rows to aggregate across all segments; this number represents the post-aggregation rows. Handoff occurs when `maxRowsPerSegment` or `maxTotalRows` is reached or every `intermediateHandoffPeriod`, whichever happens first.|No|unlimited|
|`intermediateHandoffPeriod`|ISO 8601 period|The period that determines how often tasks hand off segments. Handoff occurs if `maxRowsPerSegment` or `maxTotalRows` is reached or every `intermediateHandoffPeriod`, whichever happens first.|No|P2147483647D|
|`intermediatePersistPeriod`|ISO 8601 period|The period that determines the rate at which intermediate persists occur.|No|PT10M|
|`maxPendingPersists`|Integer|Maximum number of persists that can be pending but not started. If a new intermediate persist would exceed this limit, Druid blocks ingestion until the currently running persist finishes. With the default of 0, one persist can run concurrently with ingestion and none can be queued up. The maximum heap memory usage for indexing is `maxRowsInMemory * (2 + maxPendingPersists)`.|No|0|
|`indexSpec`|Object|Defines how Druid indexes the data. See [IndexSpec](#indexspec) for more information.|No||
|`indexSpecForIntermediatePersists`|Object|Defines segment storage format options to use at indexing time for intermediate persisted temporary segments. You can use `indexSpecForIntermediatePersists` to disable dimension/metric compression on intermediate segments to reduce memory required for final merging. However, disabling compression on intermediate segments might increase page cache use while they are used before getting merged into final segment published. See [IndexSpec](#indexspec) for possible values.|No|Same as `indexSpec`|
|`reportParseExceptions`|Boolean|If `true`, Druid throws exceptions encountered during parsing causing ingestion to halt. If `false`, Druid skips unparseable rows and fields.|No|`false`|
|`handoffConditionTimeout`|Long|Number of milliseconds to wait for segment handoff. Set to a value >= 0, where 0 means to wait indefinitely.|No|0|
|`resetOffsetAutomatically`|Boolean|Controls behavior when Druid needs to read Kinesis messages that are no longer available.<br/>If `false`, the exception bubbles up causing tasks to fail and ingestion to halt. If this occurs, manual intervention is required to correct the situation, potentially using the [Reset Supervisor API](../../api-reference/supervisor-api.md). This mode is useful for production, since it highlights issues with ingestion.<br/>If `true`, Druid automatically resets to the earliest or latest sequence number available in Kinesis, based on the value of the `useEarliestSequenceNumber` property (earliest if `true`, latest if `false`). Note that this can lead to dropping data (if `useEarliestSequenceNumber` is `false`) or duplicating data (if `useEarliestSequenceNumber` is `true`) without your knowledge. Druid logs messages indicating that a reset has occurred without interrupting ingestion. This mode is useful for non-production situations since it enables Druid to recover from problems automatically, even if they lead to quiet dropping or duplicating of data.|No|`false`|
|`skipSequenceNumberAvailabilityCheck`|Boolean|Whether to enable checking if the current sequence number is still available in a particular Kinesis shard. If `false`, the indexing task attempts to reset the current sequence number, depending on the value of `resetOffsetAutomatically`.|No|`false`|
|`workerThreads`|Integer|The number of threads that the supervisor uses to handle requests/responses for worker tasks, along with any other internal asynchronous operation.|No| `min(10, taskCount)`|
|`chatRetries`|Integer|The number of times Druid retries HTTP requests to indexing tasks before considering tasks unresponsive.|No|8|
|`httpTimeout`|ISO 8601 period|The period of time to wait for an HTTP response from an indexing task.|No|PT10S|
|`shutdownTimeout`|ISO 8601 period|The period of time to wait for the supervisor to attempt a graceful shutdown of tasks before exiting.|No|PT80S|
|`recordBufferSize`|Integer|The size of the buffer (number of events) Druid uses between the Kinesis fetch threads and the main ingestion thread.|No|See [Determine fetch settings](#determine-fetch-settings) for defaults.|
|`recordBufferOfferTimeout`|Integer|The number of milliseconds to wait for space to become available in the buffer before timing out.|No|5000|
|`recordBufferFullWait`|Integer|The number of milliseconds to wait for the buffer to drain before Druid attempts to fetch records from Kinesis again.|No|5000|
|`fetchThreads`|Integer|The size of the pool of threads fetching data from Kinesis. There is no benefit in having more threads than Kinesis shards.|No| `procs * 2`, where `procs` is the number of processors available to the task.|
|`segmentWriteOutMediumFactory`|Object|The segment write-out medium to use when creating segments. See [Additional Peon configuration: SegmentWriteOutMediumFactory](../../configuration/index.md#segmentwriteoutmediumfactory) for explanation and available options.|No|If not specified, Druid uses the value from `druid.peon.defaultSegmentWriteOutMediumFactory.type`.|
|`logParseExceptions`|Boolean|If `true`, Druid logs an error message when a parsing exception occurs, containing information about the row where the error occurred.|No|`false`|
|`maxParseExceptions`|Integer|The maximum number of parse exceptions that can occur before the task halts ingestion and fails. Overridden if `reportParseExceptions` is set.|No|unlimited|
|`maxSavedParseExceptions`|Integer|When a parse exception occurs, Druid keeps track of the most recent parse exceptions. `maxSavedParseExceptions` limits the number of saved exception instances. These saved exceptions are available after the task finishes in the [task completion report](../../ingestion/tasks.md#task-reports). Overridden if `reportParseExceptions` is set.|No|0|
|`maxRecordsPerPoll`|Integer|The maximum number of records to be fetched from buffer per poll. The actual maximum will be `Max(maxRecordsPerPoll, Max(bufferSize, 1))`.|No| See [Determine fetch settings](#determine-fetch-settings) for defaults.|
|`repartitionTransitionDuration`|ISO 8601 period|When shards are split or merged, the supervisor recomputes shard to task group mappings. The supervisor also signals any running tasks created under the old mappings to stop early at current time + `repartitionTransitionDuration`. Stopping the tasks early allows Druid to begin reading from the new shards more quickly. The repartition transition wait time controlled by this property gives the stream additional time to write records to the new shards after the split or merge, which helps avoid issues with [empty shard handling](https://github.com/apache/druid/issues/7600).|No|PT2M|
|`offsetFetchPeriod`|ISO 8601 period|Determines how often the supervisor queries Kinesis and the indexing tasks to fetch current offsets and calculate lag. If the user-specified value is below the minimum value of PT5S, the supervisor ignores the value and uses the minimum value instead.|No|PT30S|
|`useListShards`|Boolean|Indicates if `listShards` API of AWS Kinesis SDK can be used to prevent `LimitExceededException` during ingestion. You must set the necessary `IAM` permissions.|No|`false`|
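
As a rough sketch, a `tuningConfig` that adjusts a few of the memory and fetch settings above might look like the following; every value shown is illustrative rather than a recommendation.

```json
{
  "type": "kinesis",
  "maxRowsInMemory": 100000,
  "maxRowsPerSegment": 5000000,
  "intermediatePersistPeriod": "PT10M",
  "fetchThreads": 2,
  "recordBufferSize": 10000,
  "logParseExceptions": true
}
```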
#### IndexSpec
### IndexSpec
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|bitmap|Object|Compression format for bitmap indexes. Should be a JSON object. See [Bitmap types](#bitmap-types) below for options.|no (defaults to Roaring)|
|dimensionCompression|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|no (default == `LZ4`)|
|metricCompression|String|Compression format for primitive type metric columns. Choose from `LZ4`, `LZF`, `uncompressed`, or `none`.|no (default == `LZ4`)|
|longEncoding|String|Encoding format for metric and dimension columns with type long. Choose from `auto` or `longs`. `auto` encodes the values using sequence number or lookup table depending on column cardinality, and stores them with variable sizes. `longs` stores the value as is with 8 bytes each.|no (default == `longs`)|
The following table outlines the configuration options for `indexSpec`:
##### Bitmap types
For Roaring bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|`type`|String|Must be `roaring`.|yes|
For Concise bitmaps:
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|`type`|String|Must be `concise`.|yes|
#### SegmentWriteOutMediumFactory
|Field|Type|Description|Required|
|-----|----|-----------|--------|
|`type`|String|See [Additional Peon Configuration: SegmentWriteOutMediumFactory](../../configuration/index.md#segmentwriteoutmediumfactory) for explanation and available options.|yes|
|Property|Type|Description|Required|Default|
|--------|----|-----------|--------|-------|
|`bitmap`|Object|Compression format for bitmap indexes. Druid supports roaring and concise bitmap types.|No|Roaring|
|`dimensionCompression`|String|Compression format for dimension columns. Choose from `LZ4`, `LZF`, or `uncompressed`.|No|`LZ4`|
|`metricCompression`|String|Compression format for primitive type metric columns. Choose from `LZ4`, `LZF`, `uncompressed`, or `none`.|No|`LZ4`|
|`longEncoding`|String|Encoding format for metric and dimension columns with type long. Choose from `auto` or `longs`. `auto` encodes the values using sequence number or lookup table depending on column cardinality and stores them with variable sizes. `longs` stores the value as is with 8 bytes each.|No|`longs`|
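
For instance, an `indexSpec` that keeps the default roaring bitmaps but disables metric compression could be written as follows; this is only a sketch of the syntax.

```json
{
  "bitmap": {
    "type": "roaring"
  },
  "dimensionCompression": "LZ4",
  "metricCompression": "none",
  "longEncoding": "longs"
}
```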
## Operations
This section describes how some supervisor APIs work in the Kinesis Indexing Service.
For all supervisor APIs, check [Supervisor API reference](../../api-reference/supervisor-api.md).
This section describes how to use the [Supervisor API](../../api-reference/supervisor-api.md) with the Kinesis indexing service.
### AWS Authentication
### AWS authentication
To authenticate with AWS, you must provide your AWS access key and AWS secret key via `runtime.properties`, for example:
To authenticate with AWS, you must provide your AWS access key and AWS secret key using `runtime.properties`, for example:
```text
-Ddruid.kinesis.accessKey=123 -Ddruid.kinesis.secretKey=456
druid.kinesis.accessKey=AKIAWxxxxxxxxxx4NCKS
druid.kinesis.secretKey=Jbytxxxxxxxxxxx2+555
```
The AWS access key ID and secret access key are used for Kinesis API requests. If this is not provided, the service will
look for credentials set in environment variables, via [Web Identity Token](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_providers_oidc.html), in the default profile configuration file, and from the EC2 instance
profile provider (in this order).
Druid uses the AWS access key and AWS secret key to authenticate Kinesis API requests. If not provided, the service looks for credentials set in environment variables, via [Web Identity Token](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_providers_oidc.html), in the default profile configuration file, and from the EC2 instance profile provider (in this order).
To ingest data from Kinesis, ensure that the policy attached to your IAM role contains the necessary permissions.
The required permissions depend on the value of `useListShards`.
If the `useListShards` flag is set to `true`, you need the following permissions:
- `ListStreams`: required to list your data streams
- `Get*`: required for `GetShardIterator`
- `GetRecords`: required to get data records from a data stream's shard
- `ListShards` : required to get the shards for a stream of interest
- `ListStreams` to list your data streams.
- `Get*` required for `GetShardIterator`.
- `GetRecords` to get data records from a data stream's shard.
- `ListShards` to get the shards for a stream of interest.
**Example policy**
The following is an example policy:
```json
[
@ -381,12 +466,12 @@ If the `useListShards` flag is set to `true`, you need following permissions:
If the `useListShards` flag is set to `false`, you need the following permissions:
- `ListStreams`: required to list your data streams
- `Get*`: required for `GetShardIterator`
- `GetRecords`: required to get data records from a data stream's shard
- `DescribeStream`: required to describe the specified data stream
- `ListStreams` to list your data streams.
- `Get*` required for `GetShardIterator`.
- `GetRecords` to get data records from a data stream's shard.
- `DescribeStream` to describe the specified data stream.
**Example policy**
The following is an example policy:
```json
[
@ -408,128 +493,106 @@ If the `useListShards` flag is set to `false`, you need following permissions:
]
```
### Getting Supervisor Status Report
### Get supervisor status report
`GET /druid/indexer/v1/supervisor/<supervisorId>/status` returns a snapshot report of the current state of the tasks
managed by the given supervisor. This includes the latest sequence numbers as reported by Kinesis. Unlike the Kafka
Indexing Service, Kinesis reports lag metrics measured in time difference in milliseconds between the current sequence number and latest sequence number, rather than message count.
To retrieve the current status report for a single supervisor, send a `GET` request to the `/druid/indexer/v1/supervisor/:supervisorId/status` endpoint.
The status report also contains the supervisor's state and a list of recently thrown exceptions (reported as
`recentErrors`, whose max size can be controlled using the `druid.supervisor.maxStoredExceptionEvents` configuration).
There are two fields related to the supervisor's state - `state` and `detailedState`. The `state` field will always be
one of a small number of generic states that apply to any type of supervisor, while the `detailedState` field
will contain a more descriptive, implementation-specific state that may provide more insight into the supervisor's
activities than the generic `state` field.
The report contains the state of the supervisor tasks, the latest sequence numbers, and an array of recently thrown exceptions reported as `recentErrors`. You can control the maximum size of the exceptions using the `druid.supervisor.maxStoredExceptionEvents` configuration.
The list of possible `state` values are: [`PENDING`, `RUNNING`, `SUSPENDED`, `STOPPING`, `UNHEALTHY_SUPERVISOR`, `UNHEALTHY_TASKS`]
The two properties related to the supervisor's state are `state` and `detailedState`. The `state` property contains a small number of generic states that apply to any type of supervisor, while the `detailedState` property contains a more descriptive, implementation-specific state that may provide more insight into the supervisor's activities.
The list of `detailedState` values and their corresponding `state` mapping is as follows:
Possible `state` values are `PENDING`, `RUNNING`, `SUSPENDED`, `STOPPING`, `UNHEALTHY_SUPERVISOR`, and `UNHEALTHY_TASKS`.
|Detailed State|Corresponding State|Description|
The following table lists `detailedState` values and their corresponding `state` mapping:
|Detailed state|Corresponding state|Description|
|--------------|-------------------|-----------|
|UNHEALTHY_SUPERVISOR|UNHEALTHY_SUPERVISOR|The supervisor has encountered errors on the past `druid.supervisor.unhealthinessThreshold` iterations|
|UNHEALTHY_TASKS|UNHEALTHY_TASKS|The last `druid.supervisor.taskUnhealthinessThreshold` tasks have all failed|
|UNABLE_TO_CONNECT_TO_STREAM|UNHEALTHY_SUPERVISOR|The supervisor is encountering connectivity issues with Kinesis and has not successfully connected in the past|
|LOST_CONTACT_WITH_STREAM|UNHEALTHY_SUPERVISOR|The supervisor is encountering connectivity issues with Kinesis but has successfully connected in the past|
|PENDING (first iteration only)|PENDING|The supervisor has been initialized and hasn't started connecting to the stream|
|CONNECTING_TO_STREAM (first iteration only)|RUNNING|The supervisor is trying to connect to the stream and update partition data|
|DISCOVERING_INITIAL_TASKS (first iteration only)|RUNNING|The supervisor is discovering already-running tasks|
|CREATING_TASKS (first iteration only)|RUNNING|The supervisor is creating tasks and discovering state|
|RUNNING|RUNNING|The supervisor has started tasks and is waiting for taskDuration to elapse|
|SUSPENDED|SUSPENDED|The supervisor has been suspended|
|STOPPING|STOPPING|The supervisor is stopping|
|`UNHEALTHY_SUPERVISOR`|`UNHEALTHY_SUPERVISOR`|The supervisor encountered errors on previous `druid.supervisor.unhealthinessThreshold` iterations.|
|`UNHEALTHY_TASKS`|`UNHEALTHY_TASKS`|The last `druid.supervisor.taskUnhealthinessThreshold` tasks all failed.|
|`UNABLE_TO_CONNECT_TO_STREAM`|`UNHEALTHY_SUPERVISOR`|The supervisor is encountering connectivity issues with Kinesis and has not successfully connected in the past.|
|`LOST_CONTACT_WITH_STREAM`|`UNHEALTHY_SUPERVISOR`|The supervisor is encountering connectivity issues with Kinesis but has successfully connected in the past.|
|`PENDING` (first iteration only)|`PENDING`|The supervisor has been initialized but hasn't started connecting to the stream.|
|`CONNECTING_TO_STREAM` (first iteration only)|`RUNNING`|The supervisor is trying to connect to the stream and update partition data.|
|`DISCOVERING_INITIAL_TASKS` (first iteration only)|`RUNNING`|The supervisor is discovering already-running tasks.|
|`CREATING_TASKS` (first iteration only)|`RUNNING`|The supervisor is creating tasks and discovering state.|
|`RUNNING`|`RUNNING`|The supervisor has started tasks and is waiting for `taskDuration` to elapse.|
|`SUSPENDED`|`SUSPENDED`|The supervisor is suspended.|
|`STOPPING`|`STOPPING`|The supervisor is stopping.|
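
Putting these together, a heavily abridged sketch of the state-related fields in a status response might look like the following; a real response also includes task lists, lag metrics, and other fields not shown here, so treat this purely as an illustration of `state` versus `detailedState`.

```json
{
  "state": "RUNNING",
  "detailedState": "CREATING_TASKS",
  "recentErrors": []
}
```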
On each iteration of the supervisor's run loop, the supervisor completes the following tasks in sequence:
1) Fetch the list of shards from Kinesis and determine the starting sequence number for each shard (either based on the
last processed sequence number if continuing, or starting from the beginning or ending of the stream if this is a new stream).
2) Discover any running indexing tasks that are writing to the supervisor's datasource and adopt them if they match
the supervisor's configuration, else signal them to stop.
3) Send a status request to each supervised task to update our view of the state of the tasks under our supervision.
4) Handle tasks that have exceeded `taskDuration` and should transition from the reading to publishing state.
5) Handle tasks that have finished publishing and signal redundant replica tasks to stop.
6) Handle tasks that have failed and clean up the supervisor's internal state.
7) Compare the list of healthy tasks to the requested `taskCount` and `replicas` configurations and create additional tasks if required.
1. Fetch the list of shards from Kinesis and determine the starting sequence number for each shard (either based on the last processed sequence number if continuing, or starting from the beginning or ending of the stream if this is a new stream).
2. Discover any running indexing tasks that are writing to the supervisor's datasource and adopt them if they match the supervisor's configuration, else signal them to stop.
3. Send a status request to each supervised task to update the view of the state of the tasks under supervision.
4. Handle tasks that have exceeded `taskDuration` and should transition from the reading to publishing state.
5. Handle tasks that have finished publishing and signal redundant replica tasks to stop.
6. Handle tasks that have failed and clean up the supervisor's internal state.
7. Compare the list of healthy tasks to the requested `taskCount` and `replicas` configurations and create additional tasks if required.
The `detailedState` field will show additional values (those marked with "first iteration only") the first time the
The `detailedState` property shows additional values (marked with "first iteration only" in the preceding table) the first time the
supervisor executes this run loop after startup or after resuming from a suspension. This is intended to surface
initialization-type issues, where the supervisor is unable to reach a stable state (perhaps because it can't connect to
Kinesis, it can't read from the stream, or it can't communicate with existing tasks). Once the supervisor is stable -
that is, once it has completed a full execution without encountering any issues - `detailedState` will show a `RUNNING`
initialization-type issues, where the supervisor is unable to reach a stable state, for example because it cannot connect to
Kinesis, cannot read from the stream, or cannot communicate with existing tasks. Once the supervisor is stable,
that is, once it has completed a full execution without encountering any issues, `detailedState` shows a `RUNNING`
state until it is stopped, suspended, or hits a failure threshold and transitions to an unhealthy state.
### Update existing supervisors
To update an existing supervisor spec, send a `POST` request to the `/druid/indexer/v1/supervisor` endpoint.
When you call this endpoint on an existing supervisor for the same datasource, the running supervisor signals its tasks to stop reading and begin publishing their segments, then exits. Druid then uses the configuration provided in the request body to create a new supervisor. The new supervisor retains the existing publishing tasks and creates new tasks that start reading from the sequence numbers where the previous, now-publishing tasks left off, but using the updated schema.
In this way, configuration changes can be applied without requiring any pause in ingestion.
You can achieve seamless schema migrations by submitting the new schema using the `/druid/indexer/v1/supervisor` endpoint.
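For example, the following sketch submits an updated spec from a local file; the file name `updated-kinesis-supervisor.json` and the Router address are placeholders:
```bash
# Submit the updated supervisor spec; the existing supervisor hands over to a new one
curl -X POST -H 'Content-Type: application/json' \
  -d @updated-kinesis-supervisor.json \
  "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor"
```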
### Suspend and resume a supervisor
To suspend a supervisor, send a `POST` request to the `/druid/indexer/v1/supervisor/:supervisorId/suspend` endpoint.
Suspending a supervisor does not prevent it from operating and emitting logs and metrics. It ensures that no indexing tasks are running until the supervisor resumes.
To resume a supervisor, send a `POST` request to the `/druid/indexer/v1/supervisor/:supervisorId/resume` endpoint.
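For example, using the same placeholder Router address and supervisor ID as in the earlier example:
```bash
# Suspend the supervisor and stop its indexing tasks
curl -X POST "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor/my_kinesis_supervisor/suspend"

# Resume it later
curl -X POST "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor/my_kinesis_supervisor/resume"
```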
### Reset a supervisor
To reset a supervisor, send a `POST` request to the `/druid/indexer/v1/supervisor/:supervisorId/reset` endpoint. This endpoint clears stored
sequence numbers, prompting the supervisor to start reading from either the earliest or the
latest sequence numbers in Kinesis (depending on the value of `useEarliestSequenceNumber`).
After clearing stored sequence numbers, the supervisor kills and recreates active tasks,
so that tasks begin reading from valid sequence numbers.
This endpoint is useful when you need to recover from a stopped state due to missing sequence numbers in Kinesis.
Use this endpoint with caution as it may result in skipped messages, leading to data loss or duplicate data.
The indexing service keeps track of the latest
persisted sequence number to provide exactly-once ingestion guarantees across
tasks.
Subsequent tasks must start reading from where the previous task completed
for the generated segments to be accepted. If the messages at the expected starting sequence numbers are
no longer available in Kinesis (typically because the message retention period has elapsed or the stream was
removed and re-created), the supervisor refuses to start and in-flight tasks fail. This endpoint enables you to recover from this condition.
Note that the supervisor must be running for this endpoint to be available.
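For example, again using placeholder values:
```bash
# Clear stored sequence numbers; active tasks are killed and recreated
curl -X POST "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor/my_kinesis_supervisor/reset"
```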
### Terminate a supervisor
To terminate a supervisor and its associated indexing tasks, send a `POST` request to the `/druid/indexer/v1/supervisor/:supervisorId/terminate` endpoint.
This places a tombstone marker in the database to prevent the supervisor from being reloaded on a restart and then gracefully
shuts down the currently running supervisor.
The tasks stop reading and begin publishing their segments immediately.
The call returns after all tasks have been signaled to stop but before the tasks finish publishing their segments.
The terminated supervisor continues to exist in the metadata store and you can retrieve its history, but it is no longer listed among active supervisors and you cannot retrieve its configuration or status report.
The only way to restart a terminated supervisor is by submitting a functioning supervisor spec to `/druid/indexer/v1/supervisor`.
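For example:
```bash
# Terminate the supervisor; its tasks stop reading and publish their segments
curl -X POST "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor/my_kinesis_supervisor/terminate"
```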
## Capacity planning
Kinesis indexing tasks run on Middle Managers and are limited by the resources available in the Middle Manager cluster. In particular, you should make sure that you have sufficient worker capacity, configured using the
`druid.worker.capacity` property, to handle the configuration in the supervisor spec. Note that worker capacity is
shared across all types of indexing tasks, so you should plan your worker capacity to handle your total indexing load, such as batch processing, streaming tasks, and merging tasks. If your workers run out of capacity, Kinesis indexing tasks queue and wait for the next available worker. This may cause queries to return partial results but will not result in data loss, assuming the tasks run before Kinesis purges those sequence numbers.
A running task can be in one of two states: reading or publishing. A task remains in reading state for the period defined in `taskDuration`, at which point it transitions to publishing state. A task remains in publishing state for as long as it takes to generate segments, push segments to deep storage, and have them loaded and served by a Historical process or until `completionTimeout` elapses.
The number of reading tasks is controlled by `replicas` and `taskCount`. In general, there are `replicas * taskCount` reading tasks. An exception occurs if `taskCount > {numKinesisShards}`, in which case Druid uses `{numKinesisShards}` tasks. When `taskDuration` elapses, these tasks transition to publishing state and `replicas * taskCount` new reading tasks are created. To allow for reading tasks and publishing tasks to run concurrently, there should be a minimum capacity of:
```text
workerCapacity = 2 * replicas * taskCount
```
For example, with `replicas` set to 2 and `taskCount` set to 3, plan for at least 2 * 2 * 3 = 12 available worker slots.
This value is for the ideal situation in which there is at most one set of tasks publishing while another set is reading.
In some circumstances, it is possible to have multiple sets of tasks publishing simultaneously. This would happen if the
time-to-publish (generate segment, push to deep storage, load on Historical) is greater than `taskDuration`. This is a valid and correct scenario but requires additional worker capacity to support. In general, it is a good idea to have `taskDuration` be large enough that the previous set of tasks finishes publishing before the current set begins.
## Shards and segment handoff
Each Kinesis indexing task writes the events it consumes from Kinesis shards into a single segment for the segment granularity interval until it reaches one of the following limits: `maxRowsPerSegment`, `maxTotalRows`, or `intermediateHandoffPeriod`.
At this point, the task creates a new shard for this segment granularity to contain subsequent events.
The Kinesis indexing task also performs incremental hand-offs so that the segments created by the task are not held up until the task duration is over.
When the task reaches one of the `maxRowsPerSegment`, `maxTotalRows`, or `intermediateHandoffPeriod` limits, it hands off all the segments and creates a new set of segments for further events. This allows the task to run for longer durations
without accumulating old segments locally on Middle Manager processes.
The Kinesis indexing service may still produce some small segments.
For example, consider the following scenario:
- Task duration is 4 hours
- Segment granularity is set to an HOUR
- The supervisor was started at 9:10
After 4 hours at 13:10, Druid starts a new set of tasks. The events for the interval 13:00 - 14:00 may be split across existing tasks and the new set of tasks which could result in small segments. To merge them together into new segments of an ideal size (in the range of ~500-700 MB per segment), you can schedule re-indexing tasks, optionally with a different segment granularity.
For more detail, see [Segment size optimization](../../operations/segment-optimization.md).
## Supervisor persistence
When a supervisor spec is submitted via the `POST /druid/indexer/v1/supervisor` endpoint, it is persisted in the configured metadata database. There can only be a single supervisor per datasource, and submitting a second spec for the same datasource overwrites the previous one.
When an Overlord gains leadership, either by being started or as a result of another Overlord failing, it spawns a supervisor for each supervisor spec in the metadata database. The supervisor then discovers running Kinesis indexing tasks and attempts to adopt them if they are compatible with the supervisor's configuration. If they are not compatible because they have a different ingestion spec or shard allocation, the tasks are killed and the supervisor creates a new set of tasks. In this way, supervisors persist across Overlord restarts and failovers.
## Determine fetch settings
Kinesis indexing tasks fetch records using `fetchThreads` threads.
If `fetchThreads` is higher than the number of Kinesis shards, the excess threads are unused.
The records fetched by each thread are pushed into a shared queue of size `recordBufferSize`.
The main runner thread for each task polls up to `maxRecordsPerPoll` records from the queue at once.
When using Kinesis Producer Library's aggregation feature, that is, when [`deaggregate`](#deaggregation) is set,
each of these parameters refers to aggregated records rather than individual records.
The default values for these parameters are:
Kinesis places the following restrictions on calls to fetch records:
- Each shard can read up to 2 MB per second.
- The maximum size of data that GetRecords can return is 10 MB.
If the above limits are exceeded, Kinesis throws `ProvisionedThroughputExceededException` errors. If this happens, Druid
Kinesis tasks pause by `fetchDelayMillis` or 3 seconds, whichever is larger, and then attempt the call again.
In most cases, the default settings for fetch parameters are sufficient to achieve good performance without excessive memory usage.
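As an illustrative sketch only, fetch-related parameters such as these are typically overridden in the supervisor's tuning configuration; the values below are placeholders, not recommendations:
```json
"tuningConfig": {
  "type": "kinesis",
  "fetchThreads": 2,
  "fetchDelayMillis": 0,
  "recordBufferSize": 10000,
  "maxRecordsPerPoll": 100
}
```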
## Resharding
When changing the shard count for a Kinesis stream, there is a window of time around the resharding operation with early shutdown of Kinesis ingestion tasks and possible task failures.
The early shutdowns and task failures are expected. They occur because the supervisor updates the shard to task group mappings as shards are closed and fully read. This ensures that tasks are not running
with an assignment of closed shards that have been fully read and balances distribution of active shards across tasks.
This window with early task shutdowns and possible task failures concludes when:
- All closed shards have been fully read and the Kinesis ingestion tasks have published the data from those shards, committing the "closed" state to metadata storage.
- Any remaining tasks that had inactive shards in the assignment have been shut down. These tasks would have been created before the closed shards were completely drained.
## Kinesis known issues
Before you deploy the Kinesis extension to production, consider the following known issues:
- Avoid implementing more than one Kinesis supervisor that reads from the same Kinesis stream for ingestion. Kinesis has a per-shard read throughput limit and having multiple supervisors on the same stream can reduce available read throughput for an individual supervisor's tasks. Multiple supervisors ingesting to the same Druid datasource can also cause increased contention for locks on the datasource.
- The only way to change the stream reset policy is to submit a new ingestion spec and set up a new supervisor.
- If ingestion tasks get stuck, the supervisor does not automatically recover. You should monitor ingestion tasks and investigate if your ingestion falls behind.
- A Kinesis supervisor can sometimes compare the checkpoint offset to the retention window of the stream to see if it has fallen behind. These checks fetch the earliest sequence number from Kinesis, which can result in `IteratorAgeMilliseconds` becoming very high in AWS CloudWatch.
This config file adds the configs below to enable a custom coordinator duty.
```
druid.coordinator.dutyGroups=["cleanupMetadata"]
druid.coordinator.cleanupMetadata.duties=["killSupervisors"]
druid.coordinator.cleanupMetadata.duty.killSupervisors.durationToRetain=PT0M
druid.coordinator.cleanupMetadata.period=PT10S
```
These configurations create a custom coordinator duty group called `cleanupMetadata` which runs a custom coordinator duty called `killSupervisors` every 10 seconds.
The custom coordinator duty `killSupervisors` also has a config called `durationToRetain` which is set to 0 minutes.
### Routing data through an HTTP proxy for your extension
|skipBytesInMemoryOverheadCheck|The calculation of maxBytesInMemory takes into account overhead objects created during ingestion and each intermediate persist. Setting this to true can exclude the bytes of these overhead objects from maxBytesInMemory check.|false|
|indexSpec|Defines segment storage format options to use at indexing time.|See [`indexSpec`](#indexspec) for more information.|
|indexSpecForIntermediatePersists|Defines segment storage format options to use at indexing time for intermediate persisted temporary segments.|See [`indexSpec`](#indexspec) for more information.|
|Other properties|Each ingestion method has its own list of additional tuning properties. See the documentation for each method for a full list: [Kafka indexing service](../development/extensions-core/kafka-supervisor-reference.md#tuningconfig), [Kinesis indexing service](../development/extensions-core/kinesis-ingestion.md#supervisor-tuning-configuration), [Native batch](native-batch.md#tuningconfig), and [Hadoop-based](hadoop.md#tuningconfig).||
### `indexSpec`
## Iceberg input source
> To use the Iceberg input source, add the `druid-iceberg-extensions` extension.
You use the Iceberg input source to read data stored in the Iceberg table format. For a given table, the input source scans up to the latest Iceberg snapshot from the configured Hive catalog. Druid ingests the underlying live data files using the existing input source formats.
The Iceberg input source cannot be used on its own; it relies on the existing input sources to read from the data files.
For example, if the warehouse associated with an Iceberg catalog is on S3, you must also load the [`druid-s3-extensions`](../development/extensions-core/s3.md) extension.
The following is a sample spec for an HDFS warehouse source:
```json
...
"ioConfig": {
"type": "index_parallel",
"inputSource": {
"type": "iceberg",
"tableName": "iceberg_table",
"namespace": "iceberg_namespace",
"icebergCatalog": {
"type": "hive",
"warehousePath": "hdfs://warehouse/path",
"catalogUri": "thrift://hive-metastore.x.com:8970",
"catalogProperties": {
"hive.metastore.connect.retries": "1",
"hive.metastore.execute.setugi": "false",
"hive.metastore.kerberos.principal": "KRB_PRINCIPAL",
"hive.metastore.sasl.enabled": "true",
"metastore.catalog.default": "catalog_test",
"hadoop.security.authentication": "kerberos",
"hadoop.security.authorization": "true"
}
},
"icebergFilter": {
"type": "interval",
"filterColumn": "event_time",
"intervals": [
"2023-05-10T19:00:00.000Z/2023-05-10T20:00:00.000Z"
]
},
"warehouseSource": {
"type": "hdfs"
}
},
"inputFormat": {
"type": "parquet"
}
},
...
},
...
```
The following is a sample spec for an S3 warehouse source:
```json
...
"ioConfig": {
"type": "index_parallel",
"inputSource": {
"type": "iceberg",
"tableName": "iceberg_table",
"namespace": "iceberg_namespace",
"icebergCatalog": {
"type": "hive",
"warehousePath": "hdfs://warehouse/path",
"catalogUri": "thrift://hive-metastore.x.com:8970",
"catalogProperties": {
"hive.metastore.connect.retries": "1",
"hive.metastore.execute.setugi": "false",
"hive.metastore.kerberos.principal": "KRB_PRINCIPAL",
"hive.metastore.sasl.enabled": "true",
"metastore.catalog.default": "default_catalog",
"fs.s3a.access.key" : "S3_ACCESS_KEY",
"fs.s3a.secret.key" : "S3_SECRET_KEY",
"fs.s3a.endpoint" : "S3_API_ENDPOINT"
}
},
"icebergFilter": {
"type": "interval",
"filterColumn": "event_time",
"intervals": [
"2023-05-10T19:00:00.000Z/2023-05-10T20:00:00.000Z"
]
},
"warehouseSource": {
"type": "s3",
"endpointConfig": {
"url": "teststore.aws.com",
"signingRegion": "us-west-2a"
},
"clientConfig": {
"protocol": "http",
"disableChunkedEncoding": true,
"enablePathStyleAccess": true,
"forceGlobalBucketAccessEnabled": false
},
"properties": {
"accessKeyId": {
"type": "default",
"password": "foo"
},
"secretAccessKey": {
"type": "default",
"password": "bar"
}
      }
}
},
"inputFormat": {
"type": "parquet"
}
},
...
},
```
|Property|Description|Required|
|--------|-----------|---------|
|type|Set the value to `iceberg`.|yes|
|tableName|The Iceberg table name configured in the catalog.|yes|
|namespace|The Iceberg namespace associated with the table.|yes|
|icebergFilter|The JSON object used to filter data files within a snapshot.|no|
|icebergCatalog|The JSON object used to define the catalog that manages the configured Iceberg table.|yes|
|warehouseSource|The JSON object that defines the native input source for reading the data files from the warehouse.|yes|
### Catalog object
The catalog object supports `local` and `hive` catalog types.
The following table lists the properties of a `local` catalog:
|Property|Description|Required|
|--------|-----------|---------|
|type|Set this value to `local`.|yes|
|warehousePath|The location of the warehouse associated with the catalog.|yes|
|catalogProperties|Map of any additional properties to attach to the catalog.|no|
The following table lists the properties of a `hive` catalog:
|Property|Description|Required|
|--------|-----------|---------|
|type|Set this value to `hive`.|yes|
|warehousePath|The location of the warehouse associated with the catalog.|yes|
|catalogUri|The URI associated with the Hive catalog.|yes|
|catalogProperties|Map of any additional properties to attach to the catalog.|no|
### Iceberg filter object
This input source provides the following filters: `and`, `equals`, `interval`, `not`, and `or`. You can use these filters to filter out data files from a snapshot, reducing the number of files Druid has to ingest.
`equals` Filter:
|Property|Description|Required|
|--------|-----------|---------|
|type|Set this value to `equals`.|yes|
|filterColumn|The name of the column from the Iceberg table schema to use for filtering.|yes|
|filterValue|The value to filter on.|yes|
`interval` Filter:
|Property|Description|Required|
|--------|-----------|---------|
|type|Set this value to `interval`.|yes|
|filterColumn|The name of the column from the Iceberg table schema to use for filtering.|yes|
|intervals|A JSON array containing ISO 8601 interval strings. This defines the time ranges to filter on. The start interval is inclusive and the end interval is exclusive. |yes|
`and` Filter:
|Property|Description|Required|
|--------|-----------|---------|
|type|Set this value to `and`.|yes|
|filters|List of Iceberg filters that need to be AND-ed together.|yes|
`or` Filter:
|Property|Description|Required|
|--------|-----------|---------|
|type|Set this value to `or`.|yes|
|filters|List of Iceberg filters that need to be OR-ed together.|yes|
`not` Filter:
|Property|Description|Required|
|--------|-----------|---------|
|type|Set this value to `not`.|yes|
|filter|The Iceberg filter on which logical NOT is applied.|yes|
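For example, a hypothetical filter that selects one hour of data for a single region could combine the filters above; the column names and values are placeholders:
```json
"icebergFilter": {
  "type": "and",
  "filters": [
    {
      "type": "interval",
      "filterColumn": "event_time",
      "intervals": ["2023-05-10T19:00:00.000Z/2023-05-10T20:00:00.000Z"]
    },
    {
      "type": "equals",
      "filterColumn": "region",
      "filterValue": "us-west-2"
    }
  ]
}
```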
The [secondary partitioning method](native-batch.md#partitionsspec) determines the requisite number of concurrent worker tasks that run in parallel to complete ingestion with the Combining input source.
Set this value in `maxNumConcurrentSubTasks` in `tuningConfig` based on the secondary partitioning method:
- `range` or `single_dim` partitioning: greater than or equal to 1
### `CLUSTERED BY`
The `CLUSTERED BY <column list>` clause is optional for [INSERT](#insert) and [REPLACE](#replace). It accepts a list of
column names or expressions. Druid's segment generation only supports ascending order, so an `INSERT` or `REPLACE` query with
`CLUSTERED BY` columns in `DESC` ordering is not allowed.
For more information about clustering, see [Clustering](concepts.md#clustering).
The following table lists the context parameters for the MSQ task engine:
|Parameter|Description|Default value|
|---------|-----------|-------------|
| `maxNumTasks` | SELECT, INSERT, REPLACE<br /><br />The maximum total number of tasks to launch, including the controller task. The lowest possible value for this setting is 2: one controller and one worker. All tasks must be able to launch simultaneously. If they cannot, the query returns a `TaskStartTimeout` error code after approximately 10 minutes.<br /><br />May also be provided as `numTasks`. If both are present, `maxNumTasks` takes priority. | 2 |
| `taskAssignment` | SELECT, INSERT, REPLACE<br /><br />Determines how many tasks to use. Possible values include: <ul><li>`max`: Uses as many tasks as possible, up to `maxNumTasks`.</li><li>`auto`: When file sizes can be determined through directory listing (for example: local files, S3, GCS, HDFS) uses as few tasks as possible without exceeding 512 MiB or 10,000 files per task, unless exceeding these limits is necessary to stay within `maxNumTasks`. When calculating the size of files, the weighted size is used, which considers the file format and compression format used if any. When file sizes cannot be determined through directory listing (for example: http), behaves the same as `max`.</li></ul> | `max` |
| `finalizeAggregations` | SELECT, INSERT, REPLACE<br /><br />Determines the type of aggregation to return. If true, Druid finalizes the results of complex aggregations that directly appear in query results. If false, Druid returns the aggregation's intermediate type rather than finalized type. This parameter is useful during ingestion, where it enables storing sketches directly in Druid tables. For more information about aggregations, see [SQL aggregation functions](../querying/sql-aggregations.md). | true |
| `sqlJoinAlgorithm` | SELECT, INSERT, REPLACE<br /><br />Algorithm to use for JOIN. Use `broadcast` (the default) for broadcast hash join or `sortMerge` for sort-merge join. Affects all JOIN operations in the query. This is a hint to the MSQ engine and the actual joins in the query may proceed in a different way than specified. See [Joins](#joins) for more details. | `broadcast` |
| `rowsInMemory` | INSERT or REPLACE<br /><br />Maximum number of rows to store in memory at once before flushing to disk during the segment generation process. Ignored for non-INSERT queries. In most cases, use the default value. You may need to override the default if you run into one of the [known issues](./known-issues.md) around memory usage. | 100,000 |
| `segmentSortOrder` | INSERT or REPLACE<br /><br />Normally, Druid sorts rows in individual segments using `__time` first, followed by the [CLUSTERED BY](#clustered-by) clause. When you set `segmentSortOrder`, Druid sorts rows in segments using this column list first, followed by the CLUSTERED BY order.<br /><br />You provide the column list as comma-separated values or as a JSON array in string form. If your query includes `__time`, then this list must begin with `__time`. For example, consider an INSERT query that uses `CLUSTERED BY country` and has `segmentSortOrder` set to `__time,city`. Within each time chunk, Druid assigns rows to segments based on `country`, and then within each of those segments, Druid sorts those rows by `__time` first, then `city`, then `country`. | empty list |
| `maxParseExceptions`| SELECT, INSERT, REPLACE<br /><br />Maximum number of parse exceptions that are ignored while executing the query before it stops with `TooManyWarningsFault`. To ignore all the parse exceptions, set the value to -1. | 0 |
| `indexSpec` | INSERT or REPLACE<br /><br />An [`indexSpec`](../ingestion/ingestion-spec.md#indexspec) to use when generating segments. May be a JSON string or object. See [Front coding](../ingestion/ingestion-spec.md#front-coding) for details on configuring an `indexSpec` with front coding. | See [`indexSpec`](../ingestion/ingestion-spec.md#indexspec). |
| `durableShuffleStorage` | SELECT, INSERT, REPLACE <br /><br />Whether to use durable storage for shuffle mesh. To use this feature, configure the durable storage at the server level using `druid.msq.intermediate.storage.enable=true`). If these properties are not configured, any query with the context variable `durableShuffleStorage=true` fails with a configuration error. <br /><br /> | `false` |
| `faultTolerance` | SELECT, INSERT, REPLACE<br /><br /> Whether to turn on fault tolerance mode or not. Failed workers are retried based on [Limits](#limits). Cannot be used when `durableShuffleStorage` is explicitly set to false. | `false` |
| `selectDestination` | SELECT<br /><br />Controls where the final result of the select query is written. <br />Use `taskReport` (the default) to write select results to the task report. <b>This is not scalable, since the task report grows very large for large result sets.</b><br />Use `durableStorage` to write results to a durable storage location. <b>For large result sets, it's recommended to use `durableStorage`.</b> To configure durable storage, see [this](#durable-storage) section. | `taskReport` |
## Joins
Joins in multi-stage queries use one of two algorithms based on what you set the `sqlJoinAlgorithm` context parameter to.
If you omit this context parameter, the MSQ task engine uses broadcast since it's the default join algorithm. The context parameter applies to the entire SQL statement, so you can't mix different
join algorithms in the same query.
`sqlJoinAlgorithm` is a hint to the planner to execute the join in the specified manner. The planner can choose to ignore
the hint if it determines beforehand that the specified algorithm would be detrimental to the performance of the join. This intelligence
is currently very limited, so the hint is respected in most cases; set it appropriately. Review the advantages and drawbacks of the
[broadcast](#broadcast) and [sort-merge](#sort-merge) joins to determine which algorithm to use.
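For example, a query context that requests the sort-merge algorithm might include the following; whether the planner honors it is subject to the caveats above:
```json
"context": {
  "sqlJoinAlgorithm": "sortMerge"
}
```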
### Broadcast
The default join algorithm for multi-stage queries is a broadcast hash join, which is similar to how
The context parameter that sets `sqlJoinAlgorithm` to `sortMerge` is not shown in the above example.
## Durable storage
SQL-based ingestion supports using durable storage to store intermediate files temporarily. Enabling it can improve reliability. For more information, see [Durable storage](../operations/durable-storage.md).
### Durable storage configurations
The following common service properties control how durable storage behaves:
|Parameter |Default | Description |
|-------------------|----------------------------------------|----------------------|
|`druid.msq.intermediate.storage.enable` | true | Required. Whether to enable durable storage for the cluster. For more information about enabling durable storage, see [Durable storage](../operations/durable-storage.md).|
|`druid.msq.intermediate.storage.type` | `s3` for Amazon S3 | Required. The type of storage to use. `s3` is the only supported storage type. |
|`druid.msq.intermediate.storage.bucket` | n/a | The S3 bucket to store intermediate files. |
|`druid.msq.intermediate.storage.prefix` | n/a | S3 prefix to store intermediate stage results. Provide a unique value for the prefix. Don't share the same prefix between clusters. If the location includes other files or directories, then they will get cleaned up as well. |
|`druid.msq.intermediate.storage.tempDir`| n/a | Required. Directory path on the local disk to temporarily store intermediate stage results. |
|`druid.msq.intermediate.storage.maxRetry` | 10 | Optional. Defines the maximum number of times to attempt S3 API calls to avoid failures due to transient errors. |
|`druid.msq.intermediate.storage.chunkSize` | 100MiB | Optional. Defines the size of each chunk to temporarily store in `druid.msq.intermediate.storage.tempDir`. The chunk size must be between 5 MiB and 5 GiB. A large chunk size reduces the API calls made to the durable storage, however it requires more disk space to store the temporary chunks. Druid uses a default of 100MiB if the value is not provided.|
In addition to the common service properties, there are certain properties that you configure on the Overlord specifically to clean up intermediate files:
The following table describes error codes you may encounter in the `multiStageQuery.payload.status.errorReport.error.errorCode` field:
| Code | Meaning | Additional fields |
|------|---------|-------------------|
| <a name="error_TooManyInputFiles">`TooManyInputFiles`</a> | Exceeded the maximum number of input files or segments per worker (10,000 files or segments).<br /><br />If you encounter this limit, consider adding more workers, or breaking up your query into smaller queries that process fewer files or segments per query. | `numInputFiles`: The total number of input files/segments for the stage.<br /><br />`maxInputFiles`: The maximum number of input files/segments per worker per stage.<br /><br />`minNumWorker`: The minimum number of workers required for a successful run. |
| <a name="error_TooManyPartitions">`TooManyPartitions`</a> | Exceeded the maximum number of partitions for a stage (25,000 partitions).<br /><br />This can occur with INSERT or REPLACE statements that generate large numbers of segments, since each segment is associated with a partition. If you encounter this limit, consider breaking up your INSERT or REPLACE statement into smaller statements that process less data per statement. | `maxPartitions`: The limit on partitions which was exceeded |
| <a name="error_TooManyClusteredByColumns">`TooManyClusteredByColumns`</a> | Exceeded the maximum number of clustering columns for a stage (1,500 columns).<br /><br />This can occur with `CLUSTERED BY`, `ORDER BY`, or `GROUP BY` with a large number of columns. | `numColumns`: The number of columns requested.<br /><br />`maxColumns`: The limit on columns which was exceeded.`stage`: The stage number exceeding the limit<br /><br /> |
| <a name="error_TooManyRowsWithSameKey">`TooManyRowsWithSameKey`</a> | The number of rows for a given key exceeded the maximum number of buffered bytes on both sides of a join. See the [Limits](#limits) table for the specific limit. Only occurs when `sqlJoinAlgorithm` is `sortMerge`. | `key`: The key that had a large number of rows.<br /><br />`numBytes`: Number of bytes buffered, which may include other keys.<br /><br />`maxBytes`: Maximum number of bytes buffered. |
| <a name="error_TooManyRowsWithSameKey">`TooManyRowsWithSameKey`</a> | The number of rows for a given key exceeded the maximum number of buffered bytes on both sides of a join. See the [Limits](#limits) table for the specific limit. Only occurs when join is executed via the sort-merge join algorithm. | `key`: The key that had a large number of rows.<br /><br />`numBytes`: Number of bytes buffered, which may include other keys.<br /><br />`maxBytes`: Maximum number of bytes buffered. |
| <a name="error_TooManyColumns">`TooManyColumns`</a> | Exceeded the maximum number of columns for a stage (2,000 columns). | `numColumns`: The number of columns requested.<br /><br />`maxColumns`: The limit on columns which was exceeded. |
| <a name="error_TooManyWarnings">`TooManyWarnings`</a> | Exceeded the maximum allowed number of warnings of a particular type. | `rootErrorCode`: The error code corresponding to the exception that exceeded the required limit. <br /><br />`maxWarnings`: Maximum number of warnings that are allowed for the corresponding `rootErrorCode`. |
| <a name="error_TooManyWorkers">`TooManyWorkers`</a> | Exceeded the maximum number of simultaneously-running workers. See the [Limits](#limits) table for more details. | `workers`: The number of simultaneously running workers that exceeded a hard or soft limit. This may be larger than the number of workers in any one stage if multiple stages are running simultaneously. <br /><br />`maxWorkers`: The hard or soft limit on workers that was exceeded. If this is lower than the hard limit (1,000 workers), then you can increase the limit by adding more memory to each task. |
| <a name="error_WorkerFailed">`WorkerFailed`</a> | A worker task failed unexpectedly. | `errorMsg`<br /><br />`workerTaskId`: The ID of the worker task. |
| <a name="error_WorkerRpcFailed">`WorkerRpcFailed`</a> | A remote procedure call to a worker task failed and could not recover. | `workerTaskId`: the id of the worker task |
| <a name="error_UnknownError">`UnknownError`</a> | All other errors. | `message` |
| <a name="error_InsertCannotOrderByDescending">`InsertCannotOrderByDescending`</a> | Deprecated. An INSERT query contained a `CLUSTERED BY` expression in descending order. Druid's segment generation code only supports ascending order. The query returns a `ValidationException` instead of the fault. | `columnName` |

View File

@ -0,0 +1,86 @@
---
id: durable-storage
title: "Durable storage for the multi-stage query engine"
sidebar_label: "Durable storage"
---
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
You can use durable storage to improve querying from deep storage and SQL-based ingestion.
> Note that only S3 is supported as a durable storage location.
Durable storage for queries from deep storage provides a location where you can write the results of deep storage queries to. Durable storage for SQL-based ingestion is used to temporarily house intermediate files, which can improve reliability.
Enabling durable storage also enables the use of local disk to store temporary files, such as the intermediate files produced
while sorting the data. Tasks will use whatever has been configured for their temporary usage as described in [Configuring task storage sizes](../ingestion/tasks.md#configuring-task-storage-sizes).
If the configured limit is too low, Druid may throw the error, `NotEnoughTemporaryStorageFault`.
## Enable durable storage
To enable durable storage, you need to set the following common service properties:
```
druid.msq.intermediate.storage.enable=true
druid.msq.intermediate.storage.type=s3
druid.msq.intermediate.storage.bucket=YOUR_BUCKET
druid.msq.intermediate.storage.prefix=YOUR_PREFIX
druid.msq.intermediate.storage.tempDir=/path/to/your/temp/dir
```
For detailed information about the settings related to durable storage, see [Durable storage configurations](../multi-stage-query/reference.md#durable-storage-configurations).
## Use durable storage for SQL-based ingestion queries
When you run a query, include the context parameter `durableShuffleStorage` and set it to `true`.
For queries where you want to use fault tolerance for workers, set `faultTolerance` to `true`, which automatically sets `durableShuffleStorage` to `true`.
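For example, the relevant portion of the query context might look like this sketch:
```json
"context": {
  "durableShuffleStorage": true
}
```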
## Use durable storage for queries from deep storage
Depending on the size of the results you're expecting, you may need to save the final results for queries from deep storage to durable storage.
By default, Druid saves the final results for queries from deep storage to task reports. Generally, this is acceptable for smaller result sets but may lead to timeouts for larger result sets.
When you run a query, include the context parameter `selectDestination` and set it to `DURABLESTORAGE`:
```json
"context":{
...
"selectDestination": "DURABLESTORAGE"
}
```
You can also write intermediate results to durable storage (`durableShuffleStorage`) for better reliability. The location where workers write intermediate results is different than the location where final results get stored. This means that durable storage for results can be enabled even if you don't write intermediate results to durable storage.
If you write the results for queries from deep storage to durable storage, the results are cleaned up when the task is removed from the metadata store.
## Durable storage cleanup
To prevent durable storage from getting filled up with temporary files in case the tasks fail to clean them up, you can schedule a periodic
cleaner to clean the directories for which there is no controller task running. The cleaner uses the storage connector to operate on the durable storage. Use the durable storage location only to store the output
for the cluster's MSQ tasks. If the location contains other files or directories, then they will get cleaned up as well.
Use `druid.msq.intermediate.storage.cleaner.enabled` and `druid.msq.intermediate.storage.cleaner.delaySeconds` to configure the cleaner. For more information, see [Durable storage configurations](../multi-stage-query/reference.md#durable-storage-configurations).
Note that if you choose to write query results to durable storage, the results are cleaned up when the task is removed from the metadata store.
## Selecting a Java runtime
Druid fully supports Java 8u92+, Java 11, and Java 17. The project team recommends Java 17.
The project team recommends using an OpenJDK-based Java distribution. There are many free and actively-supported
distributions available, including
[Amazon Corretto](https://docs.aws.amazon.com/corretto/latest/corretto-17-ug/what-is-corretto-17.html),
[Azul Zulu](https://www.azul.com/downloads/?version=java-17-lts&package=jdk), and
[Eclipse Temurin](https://adoptium.net/temurin/releases?version=17).
The project team does not recommend any specific distribution over any other.
Druid relies on the environment variables `JAVA_HOME` or `DRUID_JAVA_HOME` to find Java on the machine. You can set
## Garbage collection
In general, the project team recommends using the G1 collector with default settings. This is the default collector in
Java 11 and 17. To enable G1 on Java 8, use `-XX:+UseG1GC`. There is no harm in explicitly specifying this on Java 11
or 17 as well.
Garbage collector selection and tuning is a form of sport in the Java community. There may be situations where adjusting
garbage collection configuration improves or worsens performance. The project team's guidance is that most people do
not need to stray away from G1 with default settings.
## Strong encapsulation
Java 9 and beyond (including Java 11 and 17) include the capability for
[strong encapsulation](https://dev.java/learn/strong-encapsulation-\(of-jdk-internals\)/) of internal JDK APIs. Druid
uses certain internal JDK APIs, which must be added to `--add-exports` and `--add-opens` on the Java command line.
On Java 11, if these parameters are not included, you will see warnings like the following:
```
WARNING: An illegal reflective access operation has occurred
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
```
On Java 17, if these parameters are not included, you will see errors on startup like the following:
```
Exception in thread "main" java.lang.ExceptionInInitializerError
```
Druid's out-of-box configuration adds these parameters transparently when you use the bundled `bin/start-druid` or
similar commands. In this case, there is nothing special you need to do to run successfully on Java 11 or 17. However,
if you have customized your Druid service launching system, you will need to ensure the required Java parameters are
added. There are many ways of doing this. Choose the one that works best for you.
1. The simplest approach: use Druid's bundled `bin/start-druid` script to launch Druid.
2. If you launch Druid using `bin/supervise -c <config>`, ensure your config file uses `bin/run-druid`. This
script uses `bin/run-java` internally, and automatically adds the proper flags.
3. If you launch Druid using a `java` command, replace `java` with `bin/run-java`. Druid's bundled
`bin/run-java` script automatically adds the proper flags.
4. If you launch Druid without using its bundled scripts, ensure the following parameters are added to your Java
command line:
```
--add-exports=java.base/jdk.internal.misc=ALL-UNNAMED \
--add-exports=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/java.nio=ALL-UNNAMED \
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
--add-opens=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/java.io=ALL-UNNAMED \
--add-opens=java.base/java.lang=ALL-UNNAMED \
--add-opens=jdk.management/com.sun.management.internal=ALL-UNNAMED
```
@ -40,13 +40,13 @@ Metrics may have additional dimensions beyond those listed above.
## Query metrics
### Router
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`query/time`|Milliseconds taken to complete a query.|Native Query: `dataSource`, `type`, `interval`, `hasFilters`, `duration`, `context`, `remoteAddress`, `id`.|< 1s|
### Broker
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`query/time`|Milliseconds taken to complete a query.|<p>Common: `dataSource`, `type`, `interval`, `hasFilters`, `duration`, `context`, `remoteAddress`, `id`.</p><p>Aggregation Queries: `numMetrics`, `numComplexMetrics`.</p><p>GroupBy: `numDimensions`.</p><p> TopN: `threshold`, `dimension`.</p>|< 1s|
|`query/bytes`|The total number of bytes returned to the requesting client in the query response from the broker. Other services report the total bytes for their portion of the query. |<p>Common: `dataSource`, `type`, `interval`, `hasFilters`, `duration`, `context`, `remoteAddress`, `id`.</p><p> Aggregation Queries: `numMetrics`, `numComplexMetrics`.</p><p> GroupBy: `numDimensions`.</p><p> TopN: `threshold`, `dimension`.</p>| |
@ -64,16 +64,16 @@ Metrics may have additional dimensions beyond those listed above.
|`sqlQuery/time`|Milliseconds taken to complete a SQL query.|`id`, `nativeQueryIds`, `dataSource`, `remoteAddress`, `success`, `engine`|< 1s|
|`sqlQuery/planningTimeMs`|Milliseconds taken to plan a SQL to native query.|`id`, `nativeQueryIds`, `dataSource`, `remoteAddress`, `success`, `engine`| |
|`sqlQuery/bytes`|Number of bytes returned in the SQL query response.|`id`, `nativeQueryIds`, `dataSource`, `remoteAddress`, `success`, `engine`| |
|`init/serverview/time`|Time taken to initialize the broker server view. Useful to detect if brokers are taking too long to start.||Depends on the number of segments.|
|`init/metadatacache/time`|Time taken to initialize the broker segment metadata cache. Useful to detect if brokers are taking too long to start||Depends on the number of segments.|
|`segment/metadatacache/refresh/count`|Number of segments to refresh in broker segment metadata cache.|`dataSource`|
|`segment/metadatacache/refresh/time`|Time taken to refresh segments in broker segment metadata cache.|`dataSource`|
|`segment/serverview/sync/healthy`|Sync status of the Broker with a segment-loading server such as a Historical or Peon. Emitted only when [HTTP-based server view](../configuration/index.md#segment-management) is enabled. This metric can be used in conjunction with `segment/serverview/sync/unstableTime` to debug slow startup of Brokers.|`server`, `tier`|1 for fully synced servers, 0 otherwise|
|`segment/serverview/sync/unstableTime`|Time in milliseconds for which the Broker has been failing to sync with a segment-loading server. Emitted only when [HTTP-based server view](../configuration/index.md#segment-management) is enabled.|`server`, `tier`|Not emitted for synced servers.|
|`serverview/init/time`|Time taken to initialize the broker server view. Useful to detect if brokers are taking too long to start.||Depends on the number of segments.|
|`metadatacache/init/time`|Time taken to initialize the broker segment metadata cache. Useful to detect if brokers are taking too long to start||Depends on the number of segments.|
|`metadatacache/refresh/count`|Number of segments to refresh in broker segment metadata cache.|`dataSource`|
|`metadatacache/refresh/time`|Time taken to refresh segments in broker segment metadata cache.|`dataSource`|
|`serverview/sync/healthy`|Sync status of the Broker with a segment-loading server such as a Historical or Peon. Emitted only when [HTTP-based server view](../configuration/index.md#segment-management) is enabled. This metric can be used in conjunction with `serverview/sync/unstableTime` to debug slow startup of Brokers.|`server`, `tier`|1 for fully synced servers, 0 otherwise|
|`serverview/sync/unstableTime`|Time in milliseconds for which the Broker has been failing to sync with a segment-loading server. Emitted only when [HTTP-based server view](../configuration/index.md#segment-management) is enabled.|`server`, `tier`|Not emitted for synced servers.|
### Historical
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`query/time`|Milliseconds taken to complete a query.|<p>Common: `dataSource`, `type`, `interval`, `hasFilters`, `duration`, `context`, `remoteAddress`, `id`.</p><p> Aggregation Queries: `numMetrics`, `numComplexMetrics`.</p><p> GroupBy: `numDimensions`.</p><p> TopN: `threshold`, `dimension`.</p>|< 1s|
|`query/segment/time`|Milliseconds taken to query individual segment. Includes time to page in the segment from disk.|`id`, `status`, `segment`, `vectorized`.|several hundred milliseconds|
@ -89,7 +89,7 @@ Metrics may have additional dimensions beyond those listed above.
### Real-time
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`query/time`|Milliseconds taken to complete a query.|<p>Common: `dataSource`, `type`, `interval`, `hasFilters`, `duration`, `context`, `remoteAddress`, `id`.</p><p> Aggregation Queries: `numMetrics`, `numComplexMetrics`.</p><p> GroupBy: `numDimensions`.</p><p> TopN: `threshold`, `dimension`.</p>|< 1s|
|`query/wait/time`|Milliseconds spent waiting for a segment to be scanned.|`id`, `segment`|several hundred milliseconds|
@ -103,7 +103,7 @@ Metrics may have additional dimensions beyond those listed above.
### Jetty
|Metric|Description|Normal Value|
|Metric|Description|Normal value|
|------|-----------|------------|
|`jetty/numOpenConnections`|Number of open jetty connections.|Not much higher than number of jetty threads.|
|`jetty/threadPool/total`|Number of total workable threads allocated.|The number should equal to `threadPoolNumIdleThreads` + `threadPoolNumBusyThreads`.|
@ -116,7 +116,7 @@ Metrics may have additional dimensions beyond those listed above.
### Cache
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`query/cache/delta/*`|Cache metrics since the last emission.||N/A|
|`query/cache/total/*`|Total cache metrics.||N/A|
@ -137,7 +137,7 @@ Metrics may have additional dimensions beyond those listed above.
Memcached client metrics are reported as per the following. These metrics come directly from the client as opposed to from the cache retrieval layer.
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`query/cache/memcached/total`|Cache metrics unique to memcached (only if `druid.cache.type=memcached`) as their actual values.|Variable|N/A|
|`query/cache/memcached/delta`|Cache metrics unique to memcached (only if `druid.cache.type=memcached`) as their delta from the prior event emission.|Variable|N/A|
@ -146,7 +146,7 @@ Memcached client metrics are reported as per the following. These metrics come d
If SQL is enabled, the Broker will emit the following metrics for SQL.
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`sqlQuery/time`|Milliseconds taken to complete a SQL.|`id`, `nativeQueryIds`, `dataSource`, `remoteAddress`, `success`|< 1s|
|`sqlQuery/planningTimeMs`|Milliseconds taken to plan a SQL to native query.|`id`, `nativeQueryIds`, `dataSource`, `remoteAddress`, `success`| |
@ -156,8 +156,8 @@ If SQL is enabled, the Broker will emit the following metrics for SQL.
## General native ingestion metrics
|Metric|Description| Dimensions |Normal Value|
|------|-----------|---------------------------------------------------------|------------|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`ingest/count`|Count of `1` every time an ingestion job runs (includes compaction jobs). Aggregate using dimensions. | `dataSource`, `taskId`, `taskType`, `groupId`, `taskIngestionMode`, `tags` |Always `1`.|
|`ingest/segments/count`|Count of final segments created by job (includes tombstones). | `dataSource`, `taskId`, `taskType`, `groupId`, `taskIngestionMode`, `tags` |At least `1`.|
|`ingest/tombstones/count`|Count of tombstones created by job. | `dataSource`, `taskId`, `taskType`, `groupId`, `taskIngestionMode`, `tags` |Zero or more for replace. Always zero for non-replace tasks (always zero for legacy replace, see below).|
@ -171,12 +171,12 @@ The mode is decided using the values
of the `isAppendToExisting` and `isDropExisting` flags in the
task's `IOConfig` as follows:
| `isAppendToExisting` | `isDropExisting` | mode |
|----------------------|-------------------|------|
| `true` | `false` | `APPEND`|
| `true` | `true ` | Invalid combination, exception thrown. |
| `false` | `false` | `REPLACE_LEGACY` (this is the default for native batch ingestion). |
| `false` | `true` | `REPLACE`|
|`isAppendToExisting`|`isDropExisting`|Mode|
|--------------------|----------------|----|
|`true`|`false`|`APPEND`|
|`true`|`true `|Invalid combination, exception thrown.|
|`false`|`false`|`REPLACE_LEGACY`. The default for JSON-based batch ingestion. |
|`false`|`true`|`REPLACE`|
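
These flags correspond to the `appendToExisting` and `dropExisting` fields of a batch task's `ioConfig`. As a sketch (the input location and filter are placeholders), an `ioConfig` like the following would run in `REPLACE` mode:

```json
{
  "type": "index_parallel",
  "inputSource": {
    "type": "local",
    "baseDir": "/placeholder/input",
    "filter": "*.json"
  },
  "inputFormat": {
    "type": "json"
  },
  "appendToExisting": false,
  "dropExisting": true
}
```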
The `tags` dimension is reported only for metrics emitted from ingestion tasks whose ingest spec specifies the `tags`
field in the `context` field of the ingestion spec. `tags` is expected to be a map of string to object.
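
For example, here is a sketch of the relevant portion of a task payload, with illustrative tag names and values. Metrics emitted by this task would then carry the `tags` dimension with this map:

```json
{
  "context": {
    "tags": {
      "team": "analytics",
      "pipeline": "clickstream"
    }
  }
}
```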
@ -185,7 +185,7 @@ field in the `context` field of the ingestion spec. `tags` is expected to be a m
These metrics apply to the [Kafka indexing service](../development/extensions-core/kafka-ingestion.md).
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`ingest/kafka/lag`|Total lag between the offsets consumed by the Kafka indexing tasks and latest offsets in Kafka brokers across all partitions. Minimum emission period for this metric is a minute.|`dataSource`, `stream`, `tags`|Greater than 0, should not be a very high number. |
|`ingest/kafka/maxLag`|Max lag between the offsets consumed by the Kafka indexing tasks and latest offsets in Kafka brokers across all partitions. Minimum emission period for this metric is a minute.|`dataSource`, `stream`, `tags`|Greater than 0, should not be a very high number. |
@ -196,7 +196,7 @@ These metrics apply to the [Kafka indexing service](../development/extensions-co
These metrics apply to the [Kinesis indexing service](../development/extensions-core/kinesis-ingestion.md).
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`ingest/kinesis/lag/time`|Total lag time in milliseconds between the current message sequence number consumed by the Kinesis indexing tasks and latest sequence number in Kinesis across all shards. Minimum emission period for this metric is a minute.|`dataSource`, `stream`, `tags`|Greater than 0, up to max Kinesis retention period in milliseconds. |
|`ingest/kinesis/maxLag/time`|Max lag time in milliseconds between the current message sequence number consumed by the Kinesis indexing tasks and latest sequence number in Kinesis across all shards. Minimum emission period for this metric is a minute.|`dataSource`, `stream`, `tags`|Greater than 0, up to max Kinesis retention period in milliseconds. |
@ -208,35 +208,37 @@ These metrics apply to the [Kinesis indexing service](../development/extensions-
Streaming ingestion tasks and certain types of
batch ingestion emit the following metrics. These metrics are deltas for each emission period.
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`ingest/events/thrownAway`|Number of events rejected because they are either null, or filtered by the transform spec, or outside the windowPeriod.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|0|
|`ingest/events/processed`|Number of events processed per emission period.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Equal to the number of events per emission period.|
|`ingest/events/processedWithError`|Number of events processed with some partial errors per emission period. Events processed with partial errors are counted towards both this metric and `ingest/events/processed`.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|0|
|`ingest/events/unparseable`|Number of events rejected because the events are unparseable.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|0|
|`ingest/events/thrownAway`|Number of events rejected because they are null, or filtered by `transformSpec`, or outside one of `lateMessageRejectionPeriod`, `earlyMessageRejectionPeriod`, or `windowPeriod`.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|0|
|`ingest/events/duplicate`|Number of events rejected because the events are duplicated.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|0|
|`ingest/events/processed`|Number of events successfully processed per emission period.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Equal to the number of events per emission period.|
|`ingest/input/bytes`|Number of bytes read from input sources, after decompression but prior to parsing. This covers all data read, including data that does not end up being fully processed and ingested. For example, this includes data that ends up being rejected for being unparseable or filtered out.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on the amount of data read.|
|`ingest/rows/output`|Number of Druid rows persisted.|`dataSource`, `taskId`, `taskType`, `groupId`|Your number of events with rollup.|
|`ingest/persists/count`|Number of times persist occurred.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on configuration.|
|`ingest/persists/time`|Milliseconds spent doing intermediate persist.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on configuration. Generally a few minutes at most.|
|`ingest/persists/cpu`|Cpu time in Nanoseconds spent on doing intermediate persist.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on configuration. Generally a few minutes at most.|
|`ingest/persists/count`|Number of times persist occurred.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on the configuration.|
|`ingest/persists/time`|Milliseconds spent doing intermediate persist.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on the configuration. Generally a few minutes at most.|
|`ingest/persists/cpu`|CPU time in nanoseconds spent on doing intermediate persist.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on the configuration. Generally a few minutes at most.|
|`ingest/persists/backPressure`|Milliseconds spent creating persist tasks and blocking waiting for them to finish.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|0 or very low|
|`ingest/persists/failed`|Number of persists that failed.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|0|
|`ingest/handoff/failed`|Number of handoffs that failed.|`dataSource`, `taskId`, `taskType`, `groupId`,`tags`|0|
|`ingest/merge/time`|Milliseconds spent merging intermediate segments.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on configuration. Generally a few minutes at most.|
|`ingest/merge/cpu`|Cpu time in Nanoseconds spent on merging intermediate segments.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on configuration. Generally a few minutes at most.|
|`ingest/merge/time`|Milliseconds spent merging intermediate segments.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on the configuration. Generally a few minutes at most.|
|`ingest/merge/cpu`|CPU time in Nanoseconds spent on merging intermediate segments.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on the configuration. Generally a few minutes at most.|
|`ingest/handoff/count`|Number of handoffs that happened.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Varies. Generally greater than 0 once every segment granular period if cluster operating normally.|
|`ingest/sink/count`|Number of sinks not handoffed.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|1~3|
|`ingest/events/messageGap`|Time gap in milliseconds between the latest ingested event timestamp and the current system timestamp of metrics emission. If the value is increasing but lag is low, Druid may not be receiving new data. This metric is reset as new tasks spawn up.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Greater than 0, depends on the time carried in event. |
|`ingest/sink/count`|Number of sinks not handed off.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|1~3|
|`ingest/events/messageGap`|Time gap in milliseconds between the latest ingested event timestamp and the current system timestamp of metrics emission. If the value is increasing but lag is low, Druid may not be receiving new data. This metric is reset as new tasks spawn up.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Greater than 0, depends on the time carried in event.|
|`ingest/notices/queueSize`|Number of pending notices to be processed by the coordinator.|`dataSource`, `tags`|Typically 0 and occasionally in lower single digits. Should not be a very high number. |
|`ingest/notices/time`|Milliseconds taken to process a notice by the supervisor.|`dataSource`, `tags`| < 1s |
|`ingest/pause/time`|Milliseconds spent by a task in a paused state without ingesting.|`dataSource`, `taskId`, `tags`| < 10 seconds|
|`ingest/handoff/time`|Total number of milliseconds taken to handoff a set of segments.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on coordinator cycle time.|
|`ingest/handoff/time`|Total number of milliseconds taken to handoff a set of segments.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on the coordinator cycle time.|
Note: If the JVM does not support CPU time measurement for the current thread, `ingest/merge/cpu` and `ingest/persists/cpu` will be 0.
If the JVM does not support CPU time measurement for the current thread, `ingest/merge/cpu` and `ingest/persists/cpu` will be 0.
## Indexing service
|Metric|Description| Dimensions |Normal Value|
|------|-----------|------------------------------------------------------------|------------|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`task/run/time`|Milliseconds taken to run a task.| `dataSource`, `taskId`, `taskType`, `groupId`, `taskStatus`, `tags`|Varies|
|`task/pending/time`|Milliseconds taken for a task to wait for running.| `dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Varies|
|`task/action/log/time`|Milliseconds taken to log a task action to the audit log.| `dataSource`, `taskId`, `taskType`, `groupId`, `taskActionType`, `tags`|< 1000 (subsecond)|
@ -251,19 +253,19 @@ Note: If the JVM does not support CPU time measurement for the current thread, `
|`segment/added/bytes`|Size in bytes of new segments created.| `dataSource`, `taskId`, `taskType`, `groupId`, `interval`, `tags`|Varies|
|`segment/moved/bytes`|Size in bytes of segments moved/archived via the Move Task.| `dataSource`, `taskId`, `taskType`, `groupId`, `interval`, `tags`|Varies|
|`segment/nuked/bytes`|Size in bytes of segments deleted via the Kill Task.| `dataSource`, `taskId`, `taskType`, `groupId`, `interval`, `tags`|Varies|
|`task/success/count`|Number of successful tasks per emission period. This metric is only available if the TaskCountStatsMonitor module is included.| `dataSource`|Varies|
|`task/failed/count`|Number of failed tasks per emission period. This metric is only available if the TaskCountStatsMonitor module is included.|`dataSource`|Varies|
|`task/success/count`|Number of successful tasks per emission period. This metric is only available if the `TaskCountStatsMonitor` module is included.| `dataSource`|Varies|
|`task/failed/count`|Number of failed tasks per emission period. This metric is only available if the `TaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|`task/running/count`|Number of current running tasks. This metric is only available if the `TaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|`task/pending/count`|Number of current pending tasks. This metric is only available if the `TaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|`task/waiting/count`|Number of current waiting tasks. This metric is only available if the `TaskCountStatsMonitor` module is included.|`dataSource`|Varies|
|`taskSlot/total/count`|Number of total task slots per emission period. This metric is only available if the `TaskSlotCountStatsMonitor` module is included.| `category`|Varies|
|`taskSlot/idle/count`|Number of idle task slots per emission period. This metric is only available if the `TaskSlotCountStatsMonitor` module is included.| `category`|Varies|
|`taskSlot/used/count`|Number of busy task slots per emission period. This metric is only available if the `TaskSlotCountStatsMonitor` module is included.| `category`|Varies|
|`taskSlot/lazy/count`|Number of total task slots in lazy marked MiddleManagers and Indexers per emission period. This metric is only available if the `TaskSlotCountStatsMonitor` module is included.| `category`|Varies|
|`taskSlot/blacklisted/count`|Number of total task slots in blacklisted MiddleManagers and Indexers per emission period. This metric is only available if the `TaskSlotCountStatsMonitor` module is included.| `category`|Varies|
|`worker/task/failed/count`|Number of failed tasks run on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included, and is only supported for middleManager nodes.| `category`, `workerVersion`|Varies|
|`worker/task/success/count`|Number of successful tasks run on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included, and is only supported for middleManager nodes.| `category`,`workerVersion`|Varies|
|`worker/taskSlot/idle/count`|Number of idle task slots on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included, and is only supported for middleManager nodes.| `category`, `workerVersion`|Varies|
|`taskSlot/lazy/count`|Number of total task slots in lazy marked Middle Managers and Indexers per emission period. This metric is only available if the `TaskSlotCountStatsMonitor` module is included.| `category`|Varies|
|`taskSlot/blacklisted/count`|Number of total task slots in blacklisted Middle Managers and Indexers per emission period. This metric is only available if the `TaskSlotCountStatsMonitor` module is included.| `category`|Varies|
|`worker/task/failed/count`|Number of failed tasks run on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included, and is only supported for Middle Manager nodes.| `category`, `workerVersion`|Varies|
|`worker/task/success/count`|Number of successful tasks run on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included, and is only supported for Middle Manager nodes.| `category`,`workerVersion`|Varies|
|`worker/taskSlot/idle/count`|Number of idle task slots on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included, and is only supported for Middle Manager nodes.| `category`, `workerVersion`|Varies|
|`worker/taskSlot/total/count`|Number of total task slots on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included.| `category`, `workerVersion`|Varies|
|`worker/taskSlot/used/count`|Number of busy task slots on the reporting worker per emission period. This metric is only available if the `WorkerTaskCountStatsMonitor` module is included.| `category`, `workerVersion`|Varies|
@ -272,7 +274,7 @@ Note: If the JVM does not support CPU time measurement for the current thread, `
The shuffle metrics can be enabled by adding `org.apache.druid.indexing.worker.shuffle.ShuffleMonitor` to `druid.monitoring.monitors`.
See [Enabling Metrics](../configuration/index.md#enabling-metrics) for more details.
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`ingest/shuffle/bytes`|Number of bytes shuffled per emission period.|`supervisorTaskId`|Varies|
|`ingest/shuffle/requests`|Number of shuffle requests per emission period.|`supervisorTaskId`|Varies|
@ -281,32 +283,34 @@ See [Enabling Metrics](../configuration/index.md#enabling-metrics) for more deta
These metrics are for the Druid Coordinator and are reset each time the Coordinator runs the coordination logic.
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`segment/assigned/count`|Number of segments assigned to be loaded in the cluster.|`tier`|Varies|
|`segment/moved/count`|Number of segments moved in the cluster.|`tier`|Varies|
|`segment/unmoved/count`|Number of segments which were chosen for balancing but were found to be already optimally placed.|`tier`|Varies|
|`segment/dropped/count`|Number of segments chosen to be dropped from the cluster due to being over-replicated.|`tier`|Varies|
|`segment/deleted/count`|Number of segments marked as unused due to drop rules.| |Varies|
|`segment/unneeded/count`|Number of segments dropped due to being marked as unused.|`tier`|Varies|
|`segment/cost/raw`|Used in cost balancing. The raw cost of hosting segments.|`tier`|Varies|
|`segment/cost/normalization`|Used in cost balancing. The normalization of hosting segments.|`tier`|Varies|
|`segment/cost/normalized`|Used in cost balancing. The normalized cost of hosting segments.|`tier`|Varies|
|`segment/assigned/count`|Number of segments assigned to be loaded in the cluster.|`dataSource`, `tier`|Varies|
|`segment/moved/count`|Number of segments moved in the cluster.|`dataSource`, `tier`|Varies|
|`segment/dropped/count`|Number of segments chosen to be dropped from the cluster due to being over-replicated.|`dataSource`, `tier`|Varies|
|`segment/deleted/count`|Number of segments marked as unused due to drop rules.|`dataSource`|Varies|
|`segment/unneeded/count`|Number of segments dropped due to being marked as unused.|`dataSource`, `tier`|Varies|
|`segment/assignSkipped/count`|Number of segments that could not be assigned to any server for loading. This can occur due to replication throttling, no available disk space, or a full load queue.|`dataSource`, `tier`, `description`|Varies|
|`segment/moveSkipped/count`|Number of segments that were chosen for balancing but could not be moved. This can occur when segments are already optimally placed.|`dataSource`, `tier`, `description`|Varies|
|`segment/dropSkipped/count`|Number of segments that could not be dropped from any server.|`dataSource`, `tier`, `description`|Varies|
|`segment/loadQueue/size`|Size in bytes of segments to load.|`server`|Varies|
|`segment/loadQueue/failed`|Number of segments that failed to load.|`server`|0|
|`segment/loadQueue/count`|Number of segments to load.|`server`|Varies|
|`segment/dropQueue/count`|Number of segments to drop.|`server`|Varies|
|`segment/loadQueue/assigned`|Number of segments assigned for load or drop to the load queue of a server.|`dataSource`, `server`|Varies|
|`segment/loadQueue/success`|Number of segment assignments that completed successfully.|`dataSource`, `server`|Varies|
|`segment/loadQueue/failed`|Number of segment assignments that failed to complete.|`dataSource`, `server`|0|
|`segment/loadQueue/cancelled`|Number of segment assignments that were canceled before completion.|`dataSource`, `server`|Varies|
|`segment/size`|Total size of used segments in a data source. Emitted only for data sources to which at least one used segment belongs.|`dataSource`|Varies|
|`segment/count`|Number of used segments belonging to a data source. Emitted only for data sources to which at least one used segment belongs.|`dataSource`|< max|
|`segment/overShadowed/count`|Number of segments marked as unused due to being overshadowed.| |Varies|
|`segment/unavailable/count`|Number of segments (not including replicas) left to load until segments that should be loaded in the cluster are available for queries.|`dataSource`|0|
|`segment/underReplicated/count`|Number of segments (including replicas) left to load until segments that should be loaded in the cluster are available for queries.|`tier`, `dataSource`|0|
|`segment/unavailable/count`|Number of unique segments left to load until all used segments are available for queries.|`dataSource`|0|
|`segment/underReplicated/count`|Number of segments, including replicas, left to load until all used segments are available for queries.|`tier`, `dataSource`|0|
|`tier/historical/count`|Number of available historical nodes in each tier.|`tier`|Varies|
|`tier/replication/factor`|Configured maximum replication factor in each tier.|`tier`|Varies|
|`tier/required/capacity`|Total capacity in bytes required in each tier.|`tier`|Varies|
|`tier/total/capacity`|Total capacity in bytes available in each tier.|`tier`|Varies|
|`compact/task/count`|Number of tasks issued in the auto compaction run.| |Varies|
|`compactTask/maxSlot/count`|Max number of task slots that can be used for auto compaction tasks in the auto compaction run.| |Varies|
|`compactTask/maxSlot/count`|Maximum number of task slots available for auto compaction tasks in the auto compaction run.| |Varies|
|`compactTask/availableSlot/count`|Number of available task slots that can be used for auto compaction tasks in the auto compaction run. This is the max number of task slots minus any currently running compaction tasks.| |Varies|
|`segment/waitCompact/bytes`|Total bytes of this datasource waiting to be compacted by the auto compaction (only consider intervals/segments that are eligible for auto compaction).|`dataSource`|Varies|
|`segment/waitCompact/count`|Total number of segments of this datasource waiting to be compacted by the auto compaction (only consider intervals/segments that are eligible for auto compaction).|`dataSource`|Varies|
@ -317,28 +321,28 @@ These metrics are for the Druid Coordinator and are reset each time the Coordina
|`segment/skipCompact/bytes`|Total bytes of this datasource that are skipped (not eligible for auto compaction) by the auto compaction.|`dataSource`|Varies|
|`segment/skipCompact/count`|Total number of segments of this datasource that are skipped (not eligible for auto compaction) by the auto compaction.|`dataSource`|Varies|
|`interval/skipCompact/count`|Total number of intervals of this datasource that are skipped (not eligible for auto compaction) by the auto compaction.|`dataSource`|Varies|
|`coordinator/time`|Approximate Coordinator duty runtime in milliseconds. The duty dimension is the string alias of the Duty that is being run.|`duty`|Varies|
|`coordinator/global/time`|Approximate runtime of a full coordination cycle in milliseconds. The `dutyGroup` dimension indicates what type of coordination this run was. i.e. Historical Management vs Indexing|`dutyGroup`|Varies|
|`metadata/kill/supervisor/count`|Total number of terminated supervisors that were automatically deleted from metadata store per each Coordinator kill supervisor duty run. This metric can help adjust `druid.coordinator.kill.supervisor.durationToRetain` configuration based on whether more or less terminated supervisors need to be deleted per cycle. Note that this metric is only emitted when `druid.coordinator.kill.supervisor.on` is set to true.| |Varies|
|`metadata/kill/audit/count`|Total number of audit logs that were automatically deleted from metadata store per each Coordinator kill audit duty run. This metric can help adjust `druid.coordinator.kill.audit.durationToRetain` configuration based on whether more or less audit logs need to be deleted per cycle. Note that this metric is only emitted when `druid.coordinator.kill.audit.on` is set to true.| |Varies|
|`metadata/kill/compaction/count`|Total number of compaction configurations that were automatically deleted from metadata store per each Coordinator kill compaction configuration duty run. Note that this metric is only emitted when `druid.coordinator.kill.compaction.on` is set to true.| |Varies|
|`metadata/kill/rule/count`|Total number of rules that were automatically deleted from metadata store per each Coordinator kill rule duty run. This metric can help adjust `druid.coordinator.kill.rule.durationToRetain` configuration based on whether more or less rules need to be deleted per cycle. Note that this metric is only emitted when `druid.coordinator.kill.rule.on` is set to true.| |Varies|
|`metadata/kill/datasource/count`|Total number of datasource metadata that were automatically deleted from metadata store per each Coordinator kill datasource duty run (Note: datasource metadata only exists for datasource created from supervisor). This metric can help adjust `druid.coordinator.kill.datasource.durationToRetain` configuration based on whether more or less datasource metadata need to be deleted per cycle. Note that this metric is only emitted when `druid.coordinator.kill.datasource.on` is set to true.| |Varies|
|`init/serverview/time`|Time taken to initialize the coordinator server view.||Depends on the number of segments|
|`segment/serverview/sync/healthy`|Sync status of the Coordinator with a segment-loading server such as a Historical or Peon. Emitted only when [HTTP-based server view](../configuration/index.md#segment-management) is enabled. This metric can be used in conjunction with `segment/serverview/sync/unstableTime` to debug slow startup of the Coordinator.|`server`, `tier`|1 for fully synced servers, 0 otherwise|
|`segment/serverview/sync/unstableTime`|Time in milliseconds for which the Coordinator has been failing to sync with a segment-loading server. Emitted only when [HTTP-based server view](../configuration/index.md#segment-management) is enabled.|`server`, `tier`|Not emitted for synced servers.|
|`coordinator/time`|Approximate Coordinator duty runtime in milliseconds. |`duty`|Varies|
|`coordinator/global/time`|Approximate runtime of a full coordination cycle in milliseconds. The `dutyGroup` dimension indicates what type of coordination this run was. For example: Historical Management or Indexing.|`dutyGroup`|Varies|
|`metadata/kill/supervisor/count`|Total number of terminated supervisors that were automatically deleted from metadata store per each Coordinator kill supervisor duty run. This metric can help adjust `druid.coordinator.kill.supervisor.durationToRetain` configuration based on whether more or less terminated supervisors need to be deleted per cycle. This metric is only emitted when `druid.coordinator.kill.supervisor.on` is set to true.| |Varies|
|`metadata/kill/audit/count`|Total number of audit logs that were automatically deleted from metadata store per each Coordinator kill audit duty run. This metric can help adjust `druid.coordinator.kill.audit.durationToRetain` configuration based on whether more or less audit logs need to be deleted per cycle. This metric is emitted only when `druid.coordinator.kill.audit.on` is set to true.| |Varies|
|`metadata/kill/compaction/count`|Total number of compaction configurations that were automatically deleted from metadata store per each Coordinator kill compaction configuration duty run. This metric is only emitted when `druid.coordinator.kill.compaction.on` is set to true.| |Varies|
|`metadata/kill/rule/count`|Total number of rules that were automatically deleted from metadata store per each Coordinator kill rule duty run. This metric can help adjust `druid.coordinator.kill.rule.durationToRetain` configuration based on whether more or less rules need to be deleted per cycle. This metric is only emitted when `druid.coordinator.kill.rule.on` is set to true.| |Varies|
|`metadata/kill/datasource/count`|Total number of datasource metadata that were automatically deleted from metadata store per each Coordinator kill datasource duty run. Note that datasource metadata only exists for datasource created from supervisor. This metric can help adjust `druid.coordinator.kill.datasource.durationToRetain` configuration based on whether more or less datasource metadata need to be deleted per cycle. This metric is only emitted when `druid.coordinator.kill.datasource.on` is set to true.| |Varies|
|`serverview/init/time`|Time taken to initialize the coordinator server view.||Depends on the number of segments.|
|`serverview/sync/healthy`|Sync status of the Coordinator with a segment-loading server such as a Historical or Peon. Emitted only when [HTTP-based server view](../configuration/index.md#segment-management) is enabled. You can use this metric in conjunction with `serverview/sync/unstableTime` to debug slow startup of the Coordinator.|`server`, `tier`|1 for fully synced servers, 0 otherwise|
|`serverview/sync/unstableTime`|Time in milliseconds for which the Coordinator has been failing to sync with a segment-loading server. Emitted only when [HTTP-based server view](../configuration/index.md#segment-management) is enabled.|`server`, `tier`|Not emitted for synced servers.|
## General Health
### Service Health
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
| `service/heartbeat` | Metric indicating the service is up. `ServiceStatusMonitor` must be enabled. |`leader` on the Overlord and Coordinator.|1|
| `service/heartbeat` | Metric indicating the service is up. `ServiceStatusMonitor` must be enabled. | `leader` on the Overlord and Coordinator.<br />`workerVersion`, `category`, `status` on the Middle Manager.<br />`taskId`, `groupId`, `taskType`, `dataSource` on the Peon |1|
### Historical
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`segment/max`|Maximum byte limit available for segments.| |Varies.|
|`segment/used`|Bytes used for served segments.|`dataSource`, `tier`, `priority`|< max|
@ -350,9 +354,10 @@ These metrics are for the Druid Coordinator and are reset each time the Coordina
### JVM
These metrics are only available if the `JVMMonitor` module is included.
These metrics are only available if the `JvmMonitor` module is included in `druid.monitoring.monitors`.
For more information, see [Enabling Metrics](../configuration/index.md#enabling-metrics).
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`jvm/pool/committed`|Committed pool|`poolKind`, `poolName`|Close to max pool|
|`jvm/pool/init`|Initial pool|`poolKind`, `poolName`|Varies|
@ -368,20 +373,29 @@ These metrics are only available if the `JVMMonitor` module is included.
|`jvm/gc/count`|Garbage collection count|`gcName` (cms/g1/parallel/etc.), `gcGen` (old/young)|Varies|
|`jvm/gc/cpu`|Count of CPU time in Nanoseconds spent on garbage collection. Note: `jvm/gc/cpu` represents the total time over multiple GC cycles; divide by `jvm/gc/count` to get the mean GC time per cycle.|`gcName`, `gcGen`|Sum of `jvm/gc/cpu` should be within 10-30% of sum of `jvm/cpu/total`, depending on the GC algorithm used (reported by [`JvmCpuMonitor`](../configuration/index.md#enabling-metrics)). |
### ZooKeeper
These metrics are available only when `druid.zk.service.enabled = true`.
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`zk/connected`|Indicator of connection status. `1` for connected, `0` for disconnected. Emitted once per monitor period.|None|1|
|`zk/reconnect/time`|Amount of time, in milliseconds, that a server was disconnected from ZooKeeper before reconnecting. Emitted on reconnection. Not emitted if connection to ZooKeeper is permanently lost, because in this case, there is no reconnection.|None|Not present|
### EventReceiverFirehose
The following metric is only available if the `EventReceiverFirehoseMonitor` module is included.
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`ingest/events/buffered`|Number of events queued in the `EventReceiverFirehose` buffer.|`serviceName`, `dataSource`, `taskId`, `taskType`, `bufferCapacity`|Equal to current number of events in the buffer queue.|
|`ingest/events/buffered`|Number of events queued in the `EventReceiverFirehose` buffer.|`serviceName`, `dataSource`, `taskId`, `taskType`, `bufferCapacity`|Equal to the current number of events in the buffer queue.|
|`ingest/bytes/received`|Number of bytes received by the `EventReceiverFirehose`.|`serviceName`, `dataSource`, `taskId`, `taskType`|Varies|
## Sys
These metrics are only available if the `SysMonitor` module is included.
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`sys/swap/free`|Free swap||Varies|
|`sys/swap/max`|Max swap||Varies|
@ -404,12 +418,12 @@ These metrics are only available if the `SysMonitor` module is included.
These metrics are available on operating systems with the cgroup kernel feature. All the values are derived by reading from `/sys/fs/cgroup`.
|Metric|Description|Dimensions|Normal Value|
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`cgroup/cpu/shares`|Relative value of CPU time available to this process. Read from `cpu.shares`.||Varies|
|`cgroup/cpu/cores_quota`|Number of cores available to this process. Derived from `cpu.cfs_quota_us`/`cpu.cfs_period_us`.||Varies. A value of -1 indicates there is no explicit quota set.|
|`cgroup/memory/*`|Memory stats for this process (e.g. `cache`, `total_swap`, etc.). Each stat produces a separate metric. Read from `memory.stat`.||Varies|
|`cgroup/memory_numa/*/pages`|Memory stats, per NUMA node, for this process (e.g. `total`, `unevictable`, etc.). Each stat produces a separate metric. Read from `memory.num_stat`.|`numaZone`|Varies|
|`cgroup/memory/*`|Memory stats for this process, such as `cache` and `total_swap`. Each stat produces a separate metric. Read from `memory.stat`.||Varies|
|`cgroup/memory_numa/*/pages`|Memory stats, per NUMA node, for this process, such as `total` and `unevictable`. Each stat produces a separate metric. Read from `memory.num_stat`.|`numaZone`|Varies|
|`cgroup/cpuset/cpu_count`|Total number of CPUs available to the process. Derived from `cpuset.cpus`.||Varies|
|`cgroup/cpuset/effective_cpu_count`|Total number of active CPUs available to the process. Derived from `cpuset.effective_cpus`.||Varies|
|`cgroup/cpuset/mems_count`|Total number of memory nodes available to the process. Derived from `cpuset.mems`.||Varies|

View File

@ -107,7 +107,7 @@ In the web console you can use the up and down arrows on the right side of the i
## Load rules
Load rules define how Druid assigns segments to [historical process tiers](./mixed-workloads.md#historical-tiering), and how many replicas of a segment exist in each tier.
Load rules define how Druid assigns segments to [Historical process tiers](./mixed-workloads.md#historical-tiering), and how many replicas of a segment exist in each tier.
If you have a single tier, Druid automatically names the tier `_default`. If you define an additional tier, you must define a load rule to specify which segments to load on that tier. Until you define a load rule, your new tier remains empty.
@ -120,6 +120,8 @@ All load rules can have these properties:
Specific types of load rules discussed below may have other properties too.
Load rules are also how you take advantage of the resource savings that [querying data from deep storage](../querying/query-from-deep-storage.md) provides. One way to configure data so that certain segments are not loaded onto Historical tiers but remain available to query from deep storage is to set `tieredReplicants` to an empty array and `useDefaultTierForNull` to `false` for those segments, either by interval or by period.
### Forever load rule
The forever load rule assigns all datasource segments to specified tiers. It is the default rule Druid applies to datasources. Forever load rules have type `loadForever`.
@ -167,7 +169,7 @@ Set the following properties:
- the segment interval starts any time after the rule interval starts.
You can use this property to load segments with future start and end dates, where "future" is relative to the time when the Coordinator evaluates data against the rule. Defaults to `true`.
- `tieredReplicants`: a map of tier names to the number of segment replicas for that tier.
- `tieredReplicants`: a map of tier names to the number of segment replicas for that tier.
- `useDefaultTierForNull`: This parameter determines the default value of `tieredReplicants` and only has an effect if the field is not present. The default value of `useDefaultTierForNull` is true.
### Interval load rule
@ -190,7 +192,7 @@ Interval load rules have type `loadByInterval`. The following example places one
Set the following properties:
- `interval`: the load interval specified as an [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) range encoded as a string.
- `tieredReplicants`: a map of tier names to the number of segment replicas for that tier.
- `tieredReplicants`: a map of tier names to the number of segment replicas for that tier.
- `useDefaultTierForNull`: This parameter determines the default value of `tieredReplicants` and only has an effect if the field is not present. The default value of `useDefaultTierForNull` is true.
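
For instance, here is a sketch of an interval load rule that keeps two replicas on a hypothetical `hot` tier and one replica on a hypothetical `cold` tier (the tier names and interval are illustrative):

```json
{
  "type": "loadByInterval",
  "interval": "2012-01-01/2013-01-01",
  "tieredReplicants": {
    "hot": 2,
    "cold": 1
  }
}
```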
## Drop rules
@ -256,7 +258,7 @@ Set the following property:
### Interval drop rule
You can use a drop interval rule to prevent Druid from loading a specified range of data onto any tier. The range is typically your oldest data. The dropped data resides in cold storage, but is not queryable. If you need to query the data, update or remove the interval drop rule so that Druid reloads the data.
You can use a drop interval rule to prevent Druid from loading a specified range of data onto any tier. The range is typically your oldest data. The dropped data resides in deep storage and can still be [queried from deep storage](../querying/query-from-deep-storage.md).
Interval drop rules have type `dropByInterval` and the following JSON structure:

View File

@ -52,14 +52,14 @@ The **Home** view displays the following cards:
* __Status__. Click this card for information on the Druid version and any extensions loaded on the cluster.
* [Datasources](#datasources)
* [Segments](#segments)
* [Supervisors](#supervisors-and-tasks)
* [Tasks](#supervisors-and-tasks)
* [Supervisors](#supervisors)
* [Tasks](#tasks)
* [Services](#services)
* [Lookups](#lookups)
You can access the [data loader](#data-loader) and [lookups view](#lookups) from the top-level navigation of the **Home** view.
![home-view](../assets/web-console-01-home-view.png "home view")
![Web console home view](../assets/web-console-01-home-view.png "home view")
## Query
@ -107,7 +107,7 @@ After queries finish, you can access them by clicking on the query time indicato
You can use the data loader to build an ingestion spec with a step-by-step wizard.
![data-loader-1](../assets/web-console-02-data-loader-1.png)
![Data loader tiles](../assets/web-console-02-data-loader-1.png)
After selecting the location of your data, follow the series of steps displaying incremental previews of the data as it is ingested.
After filling in the required details on every step you can navigate to the next step by clicking **Next**.
@ -115,7 +115,7 @@ You can also freely navigate between the steps from the top navigation.
Navigating with the top navigation leaves the underlying spec unmodified while clicking **Next** attempts to fill in the subsequent steps with appropriate defaults.
![data-loader-2](../assets/web-console-03-data-loader-2.png)
![Data loader ingestion](../assets/web-console-03-data-loader-2.png)
## Datasources
@ -127,11 +127,11 @@ To display a timeline of segments, toggle the option for **Show segment timeline
Like any view that is powered by a Druid SQL query, you can click **View SQL query for table** from the ellipsis menu to run the underlying SQL query directly.
![datasources](../assets/web-console-04-datasources.png)
![Datasources](../assets/web-console-04-datasources.png)
You can view and edit retention rules to determine the general availability of a datasource.
![retention](../assets/web-console-05-retention.png)
![Retention](../assets/web-console-05-retention.png)
## Segments
@ -139,38 +139,42 @@ The **Segments** view shows all the [segments](../design/segments.md) in the clu
Each segment has a detail view that provides more information.
The Segment ID is also conveniently broken down into Datasource, Start, End, Version, and Partition columns for ease of filtering and sorting.
![segments](../assets/web-console-06-segments.png)
![Segments](../assets/web-console-06-segments.png)
## Supervisors and tasks
## Supervisors
From this view, you can check the status of existing supervisors as well as suspend, resume, and reset them.
The supervisor oversees the state of the indexing tasks to coordinate handoffs, manage failures, and ensure that the scalability and replication requirements are maintained.
The supervisor oversees the state of the indexing tasks to coordinate handoffs, manage failures, and ensure that the scalability and replication requirements are maintained. Submit a supervisor spec manually by clicking the ellipsis icon and selecting **Submit JSON supervisor**.
![Supervisors](../assets/web-console-07-supervisors.png)
Click the magnifying glass icon for any supervisor to see detailed reports of its progress.
![Supervisors status](../assets/web-console-08-supervisor-status.png)
## Tasks
The tasks table allows you to see the currently running and recently completed tasks.
To navigate your tasks more easily, you can group them by their **Type**, **Datasource**, or **Status**.
Submit a task manually by clicking the ellipsis icon and selecting **Submit JSON task**.
![supervisors](../assets/web-console-07-supervisors.png)
![Tasks](../assets/web-console-0.7-tasks.png)
Click on the magnifying glass for any supervisor to see detailed reports of its progress.
Click the magnifying glass icon for any task to see more detail about it.
![supervisor-status](../assets/web-console-08-supervisor-status.png)
Click on the magnifying glass for any task to see more detail about it.
![tasks-status](../assets/web-console-09-task-status.png)
![Tasks status](../assets/web-console-09-task-status.png)
## Services
The **Services** view lets you see the current status of the nodes making up your cluster.
You can group the nodes by type or by tier to get meaningful summary statistics.
You can group the nodes by **Type** or by **Tier** to get meaningful summary statistics.
![servers](../assets/web-console-10-servers.png)
![Services](../assets/web-console-10-servers.png)
## Lookups
Access the **Lookups** view from the **Lookups** card in the home view or by clicking on the gear icon in the upper right corner.
Access the **Lookups** view from the **Lookups** card in the home view or by clicking the ellipsis icon in the top-level navigation.
Here you can create and edit query time [lookups](../querying/lookups.md).
![lookups](../assets/web-console-13-lookups.png)
![Lookups](../assets/web-console-13-lookups.png)

View File

@ -29,7 +29,7 @@ sidebar_label: "DatasourceMetadata"
Data Source Metadata queries return metadata information for a dataSource. These queries return information about:
* The timestamp of latest ingested event for the dataSource. This is the ingested event without any consideration of rollup.
* The timestamp of the latest ingested event for the dataSource. This is the ingested event without any consideration of rollup.
The grammar for these queries is:

View File

@ -79,7 +79,7 @@ In native queries, lookups can be queried with [dimension specs or extraction fu
Query Execution
---------------
When executing an aggregation query involving lookup functions (like the SQL [`LOOKUP` function](sql-scalar.md#string-functions),
When executing an aggregation query involving lookup functions, like the SQL [`LOOKUP` function](sql-scalar.md#string-functions),
Druid can decide to apply them while scanning and aggregating rows, or to apply them after aggregation is complete. It
is more efficient to apply lookups after aggregation is complete, so Druid will do this if it can. Druid decides this
by checking if the lookup is marked as "injective" or not. In general, you should set this property for any lookup that

View File

@ -75,7 +75,7 @@ stored on this tier.
## Supporting high query concurrency
Druid uses a [segment](../design/segments.md) as its fundamental unit of computation. Processes scan segments in parallel and a given process can scan `druid.processing.numThreads` concurrently. You can add more cores to a cluster to process more data in parallel and increase performance. Size your Druid segments such that any computation over any given segment should complete in at most 500ms. Use the the [`query/segment/time`](../operations/metrics.md#historical) metric to monitor computation times.
Druid uses a [segment](../design/segments.md) as its fundamental unit of computation. Processes scan segments in parallel and a given process can scan `druid.processing.numThreads` concurrently. You can add more cores to a cluster to process more data in parallel and increase performance. Size your Druid segments such that any computation over any given segment should complete in at most 500ms. Use the [`query/segment/time`](../operations/metrics.md#historical) metric to monitor computation times.
Druid internally stores requests to scan segments in a priority queue. If a given query requires scanning
more segments than the total number of available processors in a cluster, and many similarly expensive queries are concurrently

View File

@ -0,0 +1,195 @@
---
id: query-deep-storage
title: "Query from deep storage"
---
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
> Query from deep storage is an [experimental feature](../development/experimental.md).
Druid can query segments that are stored only in deep storage. Running a query from deep storage is slower than running queries against segments loaded on Historical processes, but it's a great fit for data that you access infrequently or for which the low-latency results that typical Druid queries provide are not necessary. Queries from deep storage can increase the surface area of data available to query without requiring you to scale your Historical processes to accommodate more segments.
## Keep segments in deep storage only
Any data you ingest into Druid is already stored in deep storage, so you don't need to perform any additional configuration from that perspective. However, to take advantage of the cost savings that querying from deep storage provides, make sure not all your segments get loaded onto Historical processes.
To do this, configure [load rules](../operations/rule-configuration.md#load-rules) to manage which segments exist only in deep storage and which are loaded onto Historical processes.
The easiest way to do this is to explicitly configure the segments that don't get loaded onto Historical processes. Set `tieredReplicants` to an empty array and `useDefaultTierForNull` to `false`. For example, if you configure the following rule for a datasource:
```json
[
{
"interval": "2016-06-27T00:00:00.000Z/2016-06-27T02:59:00.000Z",
"tieredReplicants": {},
"useDefaultTierForNull": false,
"type": "loadByInterval"
}
]
```
Any segment that falls within the specified interval exists only in deep storage. Segments outside this interval use the default cluster load rules or any other load rules you configure.
To configure the load rules through the Druid console, go to **Datasources > ... in the Actions column > Edit retention rules**. Then, paste the provided JSON into the JSON tab:
![](../assets/tutorial-query-deepstorage-retention-rule.png)
You can verify that a segment is not loaded on any Historical tiers by querying the Druid metadata table:
```sql
SELECT "segment_id", "replication_factor" FROM sys."segments" WHERE "replication_factor" = 0 AND "datasource" = YOUR_DATASOURCE
```
Segments with a `replication_factor` of `0` are not assigned to any Historical tiers. Queries against these segments are run directly against the segment in deep storage.
You can also confirm this through the Druid console. On the **Segments** page, see the **Replication factor** column.
Keep the following in mind when working with load rules to control what exists only in deep storage:
- At least one of the segments in a datasource must be loaded onto a Historical process so that Druid can plan the query. The segment on the Historical process can be any segment from the datasource. It does not need to be a specific segment. One way to verify that a datasource has at least one segment on a Historical process is to check whether the datasource is visible in the Druid console.
- The actual number of replicas may differ from the replication factor temporarily as Druid processes your load rules.
## Run a query from deep storage
### Submit a query
You can query data from deep storage by submitting a query to the API using `POST /sql/statements` or the Druid console. Druid uses the multi-stage query (MSQ) task engine to perform the query.
To run a query from deep storage, send your query to the Router using the POST method:
```
POST https://ROUTER:8888/druid/v2/sql/statements
```
Submitting a query from deep storage uses the same syntax as any other Druid SQL query where the query is contained in the "query" field in the JSON object within the request payload. For example:
```json
{"query" : "SELECT COUNT(*) FROM data_source WHERE foo = 'bar'"}
```
Generally, the request body fields are the same between the `sql` and `sql/statements` endpoints.
There are additional context parameters for `sql/statements` specifically:
- `executionMode` (required) determines how query results are fetched. Set this to `ASYNC`.
- `selectDestination` (optional): set to `durableStorage` to instruct Druid to write the results of SELECT queries to durable storage. Note that this requires you to have [durable storage for MSQ enabled](../operations/durable-storage.md).
The following sample query includes the two additional context parameters that querying from deep storage supports:
```
curl --location 'http://localhost:8888/druid/v2/sql/statements' \
--header 'Content-Type: application/json' \
--data '{
"query":"SELECT * FROM \"YOUR_DATASOURCE\" where \"__time\" >TIMESTAMP'\''2017-09-01'\'' and \"__time\" <= TIMESTAMP'\''2017-09-02'\''",
"context":{
"executionMode":"ASYNC",
"selectDestination": "durableStorage"
}
}'
```
The response for submitting a query includes the query ID along with basic information, such as when you submitted the query and the schema of the results:
```json
{
"queryId": "query-ALPHANUMBERIC-STRING",
"state": "ACCEPTED",
"createdAt": CREATION_TIMESTAMP,
"schema": [
{
"name": COLUMN_NAME,
"type": COLUMN_TYPE,
"nativeType": COLUMN_TYPE
},
...
],
"durationMs": DURATION_IN_MS,
}
```
### Get query status
You can check the status of a query with the following API call:
```
GET https://ROUTER:8888/druid/v2/sql/statements/QUERYID
```
The call returns the status of the query, such as `ACCEPTED` or `RUNNING`. Before you attempt to get results, make sure the state is `SUCCESS`.
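For example, assuming the quickstart Router at `localhost:8888` and that `jq` is installed, you might check the state like this (a sketch; replace `QUERYID` with your query ID):
```
# Prints the current state, for example ACCEPTED, RUNNING, or SUCCESS
curl -s 'http://localhost:8888/druid/v2/sql/statements/QUERYID' | jq -r '.state'
```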
When you check the status on a successful query, it includes useful information about your query results including a sample record and information about how the results are organized by `pages`. The information for each page includes the following:
- `numRows`: the number of rows in that page of results
- `sizeInBytes`: the size of the page
- `id`: the indexed page number that you can use to reference a specific page when you get query results
You can use `page` as a parameter to refine the results you retrieve.
The following snippet shows the structure of the `result` object:
```json
{
...
"result": {
"numTotalRows": INTEGER,
"totalSizeInBytes": INTEGER,
"dataSource": "__query_select",
"sampleRecords": [
[
RECORD_1,
RECORD_2,
...
]
],
"pages": [
{
"numRows": INTEGER,
"sizeInBytes": INTEGER,
"id": INTEGER_PAGE_NUMBER
}
...
]
}
}
```
### Get query results
Only the user who submitted a query can retrieve the results for the query.
Use the following endpoint to retrieve results:
```
GET https://ROUTER:8888/druid/v2/sql/statements/QUERYID/results?page=PAGENUMBER&size=RESULT_SIZE&timeout=TIMEOUT_MS
```
Results are returned in JSON format.
You can use the optional `page`, `size`, and `timeout` parameters to refine your results. You can retrieve the `page` information for your results by fetching the status of the completed query.
When you try to get results for a query from deep storage, you may receive an error that states the query is still running. Wait until the query completes before you try again.
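For example, assuming the quickstart Router at `localhost:8888`, fetching the first page of results might look like this (a sketch; replace `QUERYID` with your query ID):
```
curl --location 'http://localhost:8888/druid/v2/sql/statements/QUERYID/results?page=0'
```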
## Further reading
* [Query from deep storage tutorial](../tutorials/tutorial-query-deep-storage.md)
* [Query from deep storage API reference](../api-reference/sql-api.md#query-from-deep-storage)

View File

@ -57,7 +57,7 @@ are designed to be lightweight and complete very quickly. This means that for mo
more complex visualizations, multiple Druid queries may be required.
Even though queries are typically made to Brokers or Routers, they can also be accepted by
[Historical](../design/historical.md) processes and by [Peons (task JVMs)](../design/peons.md)) that are running
[Historical](../design/historical.md) processes and by [Peons (task JVMs)](../design/peons.md) that are running
stream ingestion tasks. This may be valuable if you want to query results for specific segments that are served by
specific processes.

View File

@ -159,7 +159,7 @@ If any part of a dimension value contains the value specified in this search que
### `fragment`
If any part of a dimension value contains all of the values specified in this search query spec, regardless of case by default, a "match" occurs. The grammar is:
If any part of a dimension value contains all the values specified in this search query spec, regardless of case by default, a "match" occurs. The grammar is:
```json
{

View File

@ -62,7 +62,8 @@ There are several main parts to a segment metadata query:
|merge|Merge all individual segment metadata results into a single result|no|
|context|See [Context](../querying/query-context.md)|no|
|analysisTypes|A list of Strings specifying what column properties (e.g. cardinality, size) should be calculated and returned in the result. Defaults to ["cardinality", "interval", "minmax"], but can be overridden with using the [segment metadata query config](../configuration/index.md#segmentmetadata-query-config). See section [analysisTypes](#analysistypes) for more details.|no|
|lenientAggregatorMerge|If true, and if the "aggregators" analysisType is enabled, aggregators will be merged leniently. See below for details.|no|
|aggregatorMergeStrategy| The strategy Druid uses to merge aggregators across segments. If true and if the `aggregators` analysis type is enabled, `aggregatorMergeStrategy` defaults to `strict`. Possible values include `strict`, `lenient`, `earliest`, and `latest`. See [`aggregatorMergeStrategy`](#aggregatormergestrategy) for details.|no|
|lenientAggregatorMerge|Deprecated. Use `aggregatorMergeStrategy` property instead. If true, and if the `aggregators` analysis type is enabled, Druid merges aggregators leniently.|no|
The format of the result is:
@ -185,7 +186,7 @@ Currently, there is no API for retrieving this information.
* `aggregators` in the result will contain the list of aggregators usable for querying metric columns. This may be
null if the aggregators are unknown or unmergeable (if merging is enabled).
* Merging can be strict or lenient. See *lenientAggregatorMerge* below for details.
* Merging can be `strict`, `lenient`, `earliest`, or `latest`. See [`aggregatorMergeStrategy`](#aggregatormergestrategy) for details.
* The form of the result is a map of column name to aggregator.
@ -194,15 +195,22 @@ null if the aggregators are unknown or unmergeable (if merging is enabled).
* `rollup` in the result is true/false/null.
* When merging is enabled, if some are rollup, others are not, result is null.
## lenientAggregatorMerge
### aggregatorMergeStrategy
Conflicts between aggregator metadata across segments can occur if some segments have unknown aggregators, or if
two segments use incompatible aggregators for the same column (e.g. longSum changed to doubleSum).
two segments use incompatible aggregators for the same column, such as `longSum` changed to `doubleSum`.
Druid supports the following aggregator merge strategies:
Aggregators can be merged strictly (the default) or leniently. With strict merging, if there are any segments
with unknown aggregators, or any conflicts of any kind, the merged aggregators list will be `null`. With lenient
merging, segments with unknown aggregators will be ignored, and conflicts between aggregators will only null out
the aggregator for that particular column.
- `strict`: If there are any segments with unknown aggregators or any conflicts of any kind, the merged aggregators
list is `null`.
- `lenient`: Druid ignores segments with unknown aggregators. Conflicts between aggregators set the aggregator for
that particular column to null.
- `earliest`: In the event of conflicts between segments, Druid selects the aggregator from the earliest segment
for that particular column.
- `latest`: In the event of conflicts between segments, Druid selects the aggregator from the most recent segment
for that particular column.
In particular, with lenient merging, it is possible for an individual column's aggregator to be `null`. This will not
occur with strict merging.
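For illustration, a segment metadata query that sets this property might look like the following sketch; the datasource name and interval are placeholders:
```json
{
  "queryType": "segmentMetadata",
  "dataSource": "sample_datasource",
  "intervals": ["2013-01-01/2014-01-01"],
  "analysisTypes": ["aggregators"],
  "aggregatorMergeStrategy": "lenient"
}
```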
### lenientAggregatorMerge (deprecated)
Deprecated. Use [`aggregatorMergeStrategy`](#aggregatormergestrategy) instead.

View File

@ -133,7 +133,7 @@ The [basic cluster tuning guide](../operations/basic-cluster-tuning.md) has info
We recommend running your favorite Linux distribution. You will also need
* [Java 8 or 11](../operations/java.md).
* [Java 8u92+, 11, or 17](../operations/java.md)
* Python 2 or Python 3
> If needed, you can specify where to find Java using the environment variables

View File

@ -40,7 +40,7 @@ You can follow these steps on a relatively modest machine, such as a workstation
The software requirements for the installation machine are:
* Linux, Mac OS X, or other Unix-like OS. (Windows is not supported)
* [Java 8u92+ or Java 11](../operations/java.md)
* [Java 8u92+, 11, or 17](../operations/java.md)
* Python 3 (preferred) or Python 2
* Perl 5

View File

@ -33,12 +33,15 @@ You can run the following combination of applications:
* [Jupyter only](#start-only-the-jupyter-container)
* [Jupyter and Druid](#start-jupyter-and-druid)
* [Jupyter, Druid, and Kafka](#start-jupyter-druid-and-kafka)
* [Kafka and Jupyter](#start-kafka-and-jupyter)
## Prerequisites
Jupyter in Docker requires that you have **Docker** and **Docker Compose**.
We recommend installing these through [Docker Desktop](https://docs.docker.com/desktop/).
For ARM-based devices, see [Tutorial setup for ARM-based devices](#tutorial-setup-for-arm-based-devices).
## Launch the Docker containers
You run Docker Compose to launch Jupyter and optionally Druid or Kafka.
@ -53,7 +56,7 @@ access the files in `druid/examples/quickstart/jupyter-notebooks/docker-jupyter`
### Start only the Jupyter container
If you already have Druid running locally, you can run only the Jupyter container to complete the tutorials.
If you already have Druid running locally or on another machine, you can run the Docker containers for Jupyter only.
In the same directory as `docker-compose.yaml`, start the application:
```bash
@ -63,6 +66,11 @@ docker compose --profile jupyter up -d
The Docker Compose file assigns `8889` for the Jupyter port.
You can override the port number by setting the `JUPYTER_PORT` environment variable before starting the Docker application.
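For example, assuming you are in the same directory as `docker-compose.yaml`, you could start Jupyter on port 8890 like this (a sketch):
```bash
# Override the default port 8889 for this run only
JUPYTER_PORT=8890 docker compose --profile jupyter up -d
```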
If Druid is running locally on the same machine as Jupyter, open the tutorial and set the `host` variable to `host.docker.internal` before starting. For example:
```python
host = "host.docker.internal"
```
### Start Jupyter and Druid
Running Druid in Docker requires the `environment` file as well as an environment variable named `DRUID_VERSION`,
@ -85,6 +93,26 @@ In the same directory as `docker-compose.yaml` and `environment`, start the appl
DRUID_VERSION={{DRUIDVERSION}} docker compose --profile all-services up -d
```
### Start Kafka and Jupyter
If you already have Druid running externally, such as an existing cluster or a dedicated infrastructure for Druid, you can run the Docker containers for Kafka and Jupyter only.
In the same directory as `docker-compose.yaml` and `environment`, start the application:
```bash
DRUID_VERSION={{DRUIDVERSION}} docker compose --profile kafka-jupyter up -d
```
If you have an external Druid instance running on a different machine than the one hosting the Docker Compose environment, change the `host` variable in the notebook tutorial to the hostname or address of the machine where Druid is running.
If Druid is running locally on the same machine as Jupyter, open the tutorial and set the `host` variable to `host.docker.internal` before starting. For example:
```python
host = "host.docker.internal"
```
To enable Druid to ingest data from Kafka within the Docker Compose environment, update the `bootstrap.servers` property in the Kafka ingestion spec to `localhost:9094` before ingesting. For reference, see [more on consumer properties](../development/extensions-core/kafka-supervisor-reference.md#more-on-consumerproperties).
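For example, the relevant portion of a Kafka supervisor spec might look like the following sketch; the topic name is a placeholder and only the `ioConfig` fragment is shown:
```json
"ioConfig": {
  "type": "kafka",
  "topic": "your_topic",
  "consumerProperties": {
    "bootstrap.servers": "localhost:9094"
  }
}
```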
### Update image from Docker Hub
If you already have a local cache of the Jupyter image, you can update the image before running the application using the following command:
@ -193,9 +221,30 @@ as well as the [Python client for Druid](tutorial-jupyter-index.md#python-api-fo
You should now be able to access and complete the tutorials.
## Tutorial setup for ARM-based devices
For ARM-based devices, follow this setup to start Druid externally, while keeping Kafka and Jupyter within the Docker Compose environment:
1. Start Druid using the `start-druid` script. You can follow [Quickstart (local)](./index.md) instructions. The tutorials
assume that you are using the quickstart, so no authentication or authorization is expected unless explicitly mentioned.
2. Start either Jupyter only or Jupyter and Kafka using the following commands in the same directory as `docker-compose.yaml` and `environment`:
```bash
# Start only Jupyter
docker compose --profile jupyter up -d
# Start Kafka and Jupyter
DRUID_VERSION={{DRUIDVERSION}} docker compose --profile kafka-jupyter up -d
```
3. If Druid is running locally on the same machine as Jupyter, open the tutorial and set the `host` variable to `host.docker.internal` before starting. For example:
```python
host = "host.docker.internal"
```
4. If you use Kafka to handle the data stream that will be ingested into Druid, and Druid is running locally on the same machine, update the consumer property `bootstrap.servers` to `localhost:9094`.
## Learn more
See the following topics for more information:
* [Jupyter Notebook tutorials](tutorial-jupyter-index.md) for the available Jupyter Notebook-based tutorials for Druid
* [Tutorial: Run with Docker](docker.md) for running Druid from a Docker container
* [Tutorial: Run with Docker](docker.md) for running Druid from a Docker container

View File

@ -76,8 +76,7 @@ In this section, you download sample data to the tutorial's directory and send t
2. Download the sample data to your new directory and extract it:
```bash
cd sample-data
curl -O https://static.imply.io/example-data/kttm-nested-v2/kttm-nested-v2-2019-08-25.json.gz
(cd sample-data && curl -O https://static.imply.io/example-data/kttm-nested-v2/kttm-nested-v2-2019-08-25.json.gz)
```
3. In your Kafka root directory, run the following commands to post sample events to the `kttm` Kafka topic:

View File

@ -0,0 +1,293 @@
---
id: tutorial-query-deep-storage
title: "Tutorial: Query from deep storage"
sidebar_label: "Query from deep storage"
---
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
> Query from deep storage is an [experimental feature](../development/experimental.md).
Query from deep storage lets you query segments that are stored only in deep storage, which costs less than loading everything onto Historical processes. The tradeoff is that queries from deep storage may take longer to complete.
This tutorial walks you through loading example data, configuring load rules so that not all the segments get loaded onto Historical processes, and querying data from deep storage.
To run the queries in this tutorial, replace `ROUTER:PORT` with the location of the Router process and its port number. For example, use `localhost:8888` for the quickstart deployment.
For more general information, see [Query from deep storage](../querying/query-from-deep-storage.md).
## Load example data
Use the **Load data** wizard or the following SQL query to ingest the `wikipedia` sample datasource bundled with Druid. If you use the wizard, make sure you change the partitioning to be by hour.
Partitioning by hour provides more segment granularity, so you can selectively load segments onto Historicals or keep them in deep storage.
<details><summary>Show the query</summary>
```sql
REPLACE INTO "wikipedia" OVERWRITE ALL
WITH "ext" AS (SELECT *
FROM TABLE(
EXTERN(
'{"type":"http","uris":["https://druid.apache.org/data/wikipedia.json.gz"]}',
'{"type":"json"}'
)
) EXTEND ("isRobot" VARCHAR, "channel" VARCHAR, "timestamp" VARCHAR, "flags" VARCHAR, "isUnpatrolled" VARCHAR, "page" VARCHAR, "diffUrl" VARCHAR, "added" BIGINT, "comment" VARCHAR, "commentLength" BIGINT, "isNew" VARCHAR, "isMinor" VARCHAR, "delta" BIGINT, "isAnonymous" VARCHAR, "user" VARCHAR, "deltaBucket" BIGINT, "deleted" BIGINT, "namespace" VARCHAR, "cityName" VARCHAR, "countryName" VARCHAR, "regionIsoCode" VARCHAR, "metroCode" BIGINT, "countryIsoCode" VARCHAR, "regionName" VARCHAR))
SELECT
TIME_PARSE("timestamp") AS "__time",
"isRobot",
"channel",
"flags",
"isUnpatrolled",
"page",
"diffUrl",
"added",
"comment",
"commentLength",
"isNew",
"isMinor",
"delta",
"isAnonymous",
"user",
"deltaBucket",
"deleted",
"namespace",
"cityName",
"countryName",
"regionIsoCode",
"metroCode",
"countryIsoCode",
"regionName"
FROM "ext"
PARTITIONED BY HOUR
```
</details>
## Configure a load rule
The load rule configures Druid to keep any segments that fall within the following interval only in deep storage:
```
2016-06-27T00:00:00.000Z/2016-06-27T02:59:00.000Z
```
The JSON form of the rule is as follows:
```json
[
{
"interval": "2016-06-27T00:00:00.000Z/2016-06-27T02:59:00.000Z",
"tieredReplicants": {},
"useDefaultTierForNull": false,
"type": "loadByInterval"
}
]
```
The rest of the segments use the default load rules for the cluster. For the quickstart, that means all the other segments get loaded onto Historical processes.
You can configure the load rules through the API or the Druid console. To configure the load rules through the Druid console, go to **Datasources > ... in the Actions column > Edit retention rules**. Then, paste the provided JSON into the JSON tab:
![](../assets/tutorial-query-deepstorage-retention-rule.png)
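If you prefer to configure the rule through the API instead of the console, a sketch of the call might look like this, assuming the quickstart Coordinator at `localhost:8081` and the rule JSON saved to a file named `rules.json`:
```
# Set the retention rules for the wikipedia datasource
curl -X POST 'http://localhost:8081/druid/coordinator/v1/rules/wikipedia' \
  --header 'Content-Type: application/json' \
  --data @rules.json
```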
### Verify the replication factor
Segments that are only available from deep storage have a `replication_factor` of 0 in the Druid system table. You can verify that your load rule worked as intended using the following query:
```sql
SELECT "segment_id", "replication_factor", "num_replicas" FROM sys."segments" WHERE datasource = 'wikipedia'
```
You can also verify it through the Druid console by checking the **Replication factor** column in the **Segments** view.
Note that the number of replicas and replication factor may differ temporarily as Druid processes your retention rules.
## Query from deep storage
Now that there are segments that are only available from deep storage, run the following query:
```sql
SELECT page FROM wikipedia WHERE __time < TIMESTAMP'2016-06-27 00:10:00' LIMIT 10
```
With the context parameter:
```json
"executionMode": "ASYNC"
```
For example, run the following curl command:
```
curl --location 'http://localhost:8888/druid/v2/sql/statements' \
--header 'Content-Type: application/json' \
--data '{
"query":"SELECT page FROM wikipedia WHERE __time < TIMESTAMP'\''2016-06-27 00:10:00'\'' LIMIT 10",
"context":{
"executionMode":"ASYNC"
}
}'
```
This query looks for records with timestamps that precede `00:10:00`. Based on the load rule you configured earlier, this data is only available from deep storage.
When you submit the query from deep storage through the API, you get the following response:
<details><summary>Show the response</summary>
```json
{
"queryId": "query-6888b6f6-e597-456c-9004-222b05b97051",
"state": "ACCEPTED",
"createdAt": "2023-07-28T21:59:02.334Z",
"schema": [
{
"name": "page",
"type": "VARCHAR",
"nativeType": "STRING"
}
],
"durationMs": -1
}
```
Make sure you note the `queryId`. You'll need it to interact with the query.
</details>
Compare this to submitting the same query to Druid SQL's regular endpoint, `POST /sql`:
```
curl --location 'http://localhost:8888/druid/v2/sql/' \
--header 'Content-Type: application/json' \
--data '{
"query":"SELECT page FROM wikipedia WHERE __time < TIMESTAMP'\''2016-06-27 00:10:00'\'' LIMIT 10",
"context":{
"executionMode":"ASYNC"
}
}'
```
The response you get back is empty because there are no records on the Historical processes that match the query.
## Get query status
Replace `:queryId` with the ID for your query and run the following curl command to get your query status:
```
curl --location --request GET 'http://localhost:8888/druid/v2/sql/statements/:queryId' \
--header 'Content-Type: application/json'
```
### Response for a running query
The response for a running query is the same as the response from when you submitted the query except the `state` is `RUNNING` instead of `ACCEPTED`.
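For example, a status response for a query that is still in progress might look like the following sketch (fields other than `state` match the submission response):
```json
{
  "queryId": "query-6888b6f6-e597-456c-9004-222b05b97051",
  "state": "RUNNING",
  ...
}
```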
### Response for a completed query
A successful query also returns a `pages` object that includes the page numbers (`id`), rows per page (`numRows`), and the size of the page (`sizeInBytes`). You can pass the page number as a parameter when you get results to refine the results you get.
Note that `sampleRecords` has been truncated for brevity.
<details><summary>Show the response</summary>
```json
{
"queryId": "query-6888b6f6-e597-456c-9004-222b05b97051",
"state": "SUCCESS",
"createdAt": "2023-07-28T21:59:02.334Z",
"schema": [
{
"name": "page",
"type": "VARCHAR",
"nativeType": "STRING"
}
],
"durationMs": 87351,
"result": {
"numTotalRows": 152,
"totalSizeInBytes": 9036,
"dataSource": "__query_select",
"sampleRecords": [
[
"Salo Toraut"
],
[
"利用者:ワーナー成増/放送ウーマン賞"
],
[
"Bailando 2015"
],
...
...
...
],
"pages": [
{
"id": 0,
"numRows": 152,
"sizeInBytes": 9036
}
]
}
}
```
</details>
## Get query results
Replace `:queryId` with the ID for your query and run the following curl command to get your query results:
```
curl --location 'http://ROUTER:PORT/druid/v2/sql/statements/:queryId/results'
```
Note that the response has been truncated for brevity.
<details><summary>Show the response</summary>
```json
[
{
"page": "Salo Toraut"
},
{
"page": "利用者:ワーナー成増/放送ウーマン賞"
},
{
"page": "Bailando 2015"
},
...
...
...
]
```
</details>
## Further reading
* [Query from deep storage](../querying/query-from-deep-storage.md)
* [Query from deep storage API reference](../api-reference/sql-api.md#query-from-deep-storage)

View File

@ -17,6 +17,7 @@
## Initialization script for druid nodes
## Runs druid nodes as a daemon
## Environment Variables used by this script -
## DRUID_BIN_DIR - directory having druid bin files, default=bin
## DRUID_LIB_DIR - directory having druid jar files, default=lib
## DRUID_CONF_DIR - directory having druid config files, default=conf/druid
## DRUID_LOG_DIR - directory used to store druid logs, default=log
@ -36,11 +37,11 @@ shift
command=$1
LIB_DIR="${DRUID_LIB_DIR:=lib}"
BIN_DIR="${DRUID_BIN_DIR:=$DRUID_LIB_DIR/../bin}"
CONF_DIR="${DRUID_CONF_DIR:=conf/druid}"
PID_DIR="${DRUID_PID_DIR:=var/druid/pids}"
WHEREAMI="$(dirname "$0")"
WHEREAMI="$(cd "$WHEREAMI" && pwd)"
JAVA_BIN_DIR="$(. /"$WHEREAMI"/java-util && get_java_bin_dir)"
# Remove possible ending slash
LOG_DIR="${DRUID_LOG_DIR:=${WHEREAMI}/log}"
@ -64,13 +65,7 @@ case $command in
if [ ! -d "$PID_DIR" ]; then mkdir -p $PID_DIR; fi
if [ ! -d "$LOG_DIR" ]; then mkdir -p $LOG_DIR; fi
if [ -z "$JAVA_BIN_DIR" ]; then
echo "Could not find java - please run $WHEREAMI/verify-java to confirm it is installed."
exit 1
fi
JAVA="$JAVA_BIN_DIR/java"
nohup $JAVA -Ddruid.node.type=$nodeType "-Ddruid.log.path=$LOG_DIR" `cat $CONF_DIR/$nodeType/jvm.config | xargs` -cp $CONF_DIR/_common:$CONF_DIR/$nodeType:$LIB_DIR/*:$HADOOP_CONF_DIR org.apache.druid.cli.Main server $nodeType >> /dev/null 2>&1 &
nohup "$BIN_DIR/run-java" -Ddruid.node.type=$nodeType "-Ddruid.log.path=$LOG_DIR" `cat $CONF_DIR/$nodeType/jvm.config | xargs` -cp $CONF_DIR/_common:$CONF_DIR/$nodeType:$LIB_DIR/*:$HADOOP_CONF_DIR org.apache.druid.cli.Main server $nodeType >> /dev/null 2>&1 &
nodeType_PID=$!
echo $nodeType_PID > $pid
echo "Started $nodeType node, pid: $nodeType_PID"

View File

@ -26,31 +26,24 @@ fi
JAVA_MAJOR="$("$JAVA_BIN" -version 2>&1 | sed -n -E 's/.* version "([^."]*).*/\1/p')"
if [ "$JAVA_MAJOR" != "" ] && [ "$JAVA_MAJOR" -ge "17" ]
if [ "$JAVA_MAJOR" != "" ] && [ "$JAVA_MAJOR" -ge "11" ]
then
# Must disable strong encapsulation for certain packages on Java 17.
# The last one is required for metric emit, see https://github.com/apache/druid/issues/12312
# Disable strong encapsulation for certain packages on Java 11+.
# When updating this list, update all four:
# 1) ForkingTaskRunner#STRONG_ENCAPSULATION_PROPERTIES
# 2) docs/operations/java.md, "Strong encapsulation" section
# 3) pom.xml, jdk.strong.encapsulation.argLine
# 4) examples/bin/run-java script (here)
exec "$JAVA_BIN" \
--add-exports=java.base/jdk.internal.misc=ALL-UNNAMED \
--add-exports=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/java.nio=ALL-UNNAMED \
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
--add-opens=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/java.io=ALL-UNNAMED \
--add-opens=java.base/java.lang=ALL-UNNAMED \
--add-opens=jdk.management/com.sun.management.internal=ALL-UNNAMED \
"$@"
elif [ "$JAVA_MAJOR" != "" ] && [ "$JAVA_MAJOR" -ge "11" ]
then
# The first 4 parameters below are required to use datasketches-memory as a library.
# And the last one is required for metric emit, see https://github.com/apache/druid/issues/12312
exec "$JAVA_BIN" \
--add-exports=java.base/jdk.internal.misc=ALL-UNNAMED \
--add-exports=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/java.nio=ALL-UNNAMED \
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
--add-opens=jdk.management/com.sun.management.internal=ALL-UNNAMED \
"$@"
else
exec "$JAVA_BIN" "$@"
fi

View File

@ -28,14 +28,14 @@ sub fail_check {
: "No Java runtime was detected on your system.";
print STDERR <<"EOT";
Druid requires Java 8 or 11. $current_version_text
Druid requires Java 8, 11, or 17. $current_version_text
If you believe this check is in error, or you want to proceed with a potentially
unsupported Java runtime, you can skip this check using an environment variable:
export DRUID_SKIP_JAVA_CHECK=1
Otherwise, install Java 8 or 11 in one of the following locations.
Otherwise, install Java 8, 11, or 17 in one of the following locations.
* DRUID_JAVA_HOME
* JAVA_HOME
@ -68,6 +68,6 @@ if ($?) {
}
# If we know it won't work, die. Otherwise hope for the best.
if ($java_version =~ /version \"((\d+)\.(\d+).*?)\"/ && !($2 == 1 && $3 == 8) && $2 != 11 ) {
if ($java_version =~ /version \"((\d+)\.(\d+).*?)\"/ && !($2 == 1 && $3 == 8) && $2 != 11 && $2 != 17 ) {
fail_check($1);
}

View File

@ -119,7 +119,7 @@ druid.selectors.coordinator.serviceName=druid/coordinator
# Monitoring
#
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor"]
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor", "org.apache.druid.server.metrics.ServiceStatusMonitor"]
druid.emitter=noop
druid.emitter.logging.logLevel=info

View File

@ -25,7 +25,6 @@ druid.plaintextPort=8091
# https://druid.apache.org/docs/latest/operations/basic-cluster-tuning.html#middlemanager
# Task launch parameters
druid.indexer.runner.javaCommand=bin/run-java
druid.worker.baseTaskDirs=[\"var/druid/task\"]
# Processing threads and buffers on Peons

View File

@ -119,7 +119,7 @@ druid.selectors.coordinator.serviceName=druid/coordinator
# Monitoring
#
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor"]
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor", "org.apache.druid.server.metrics.ServiceStatusMonitor"]
druid.emitter=noop
druid.emitter.logging.logLevel=info

View File

@ -25,7 +25,6 @@ druid.worker.capacity=4
druid.worker.baseTaskDirs=[\"var/druid/task\"]
# Task launch parameters
druid.indexer.runner.javaCommand=bin/run-java
druid.indexer.runner.javaOptsArray=["-server","-Xms1g","-Xmx1g","-XX:MaxDirectMemorySize=1g","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
# HTTP server threads

View File

@ -119,7 +119,7 @@ druid.selectors.coordinator.serviceName=druid/coordinator
# Monitoring
#
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor"]
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor", "org.apache.druid.server.metrics.ServiceStatusMonitor"]
druid.emitter=noop
druid.emitter.logging.logLevel=info

View File

@ -25,7 +25,6 @@ druid.worker.capacity=8
druid.worker.baseTaskDirs=[\"var/druid/task\"]
# Task launch parameters
druid.indexer.runner.javaCommand=bin/run-java
druid.indexer.runner.javaOptsArray=["-server","-Xms1g","-Xmx1g","-XX:MaxDirectMemorySize=1g","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
# HTTP server threads

View File

@ -119,7 +119,7 @@ druid.selectors.coordinator.serviceName=druid/coordinator
# Monitoring
#
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor"]
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor", "org.apache.druid.server.metrics.ServiceStatusMonitor"]
druid.emitter=noop
druid.emitter.logging.logLevel=info

View File

@ -25,7 +25,6 @@ druid.worker.capacity=4
druid.worker.baseTaskDirs=[\"var/druid/task\"]
# Task launch parameters
druid.indexer.runner.javaCommand=bin/run-java
druid.indexer.runner.javaOptsArray=["-server","-Xms1g","-Xmx1g","-XX:MaxDirectMemorySize=1g","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
# HTTP server threads

View File

@ -119,7 +119,7 @@ druid.selectors.coordinator.serviceName=druid/coordinator
# Monitoring
#
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor"]
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor", "org.apache.druid.server.metrics.ServiceStatusMonitor"]
druid.emitter=noop
druid.emitter.logging.logLevel=info

View File

@ -25,7 +25,6 @@ druid.worker.capacity=2
druid.worker.baseTaskDirs=[\"var/druid/task\"]
# Task launch parameters
druid.indexer.runner.javaCommand=bin/run-java
druid.indexer.runner.javaOptsArray=["-server","-Xms1g","-Xmx1g","-XX:MaxDirectMemorySize=1g","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
# HTTP server threads

View File

@ -119,7 +119,7 @@ druid.selectors.coordinator.serviceName=druid/coordinator
# Monitoring
#
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor"]
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor", "org.apache.druid.server.metrics.ServiceStatusMonitor"]
druid.emitter=noop
druid.emitter.logging.logLevel=info

View File

@ -25,7 +25,6 @@ druid.worker.capacity=2
druid.worker.baseTaskDirs=[\"var/druid/task\"]
# Task launch parameters
druid.indexer.runner.javaCommand=bin/run-java
druid.indexer.runner.javaOptsArray=["-server","-Xms256m","-Xmx256m","-XX:MaxDirectMemorySize=300m","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
# HTTP server threads

View File

@ -119,7 +119,7 @@ druid.selectors.coordinator.serviceName=druid/coordinator
# Monitoring
#
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor"]
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor", "org.apache.druid.server.metrics.ServiceStatusMonitor"]
druid.emitter=noop
druid.emitter.logging.logLevel=info

View File

@ -25,7 +25,6 @@ druid.worker.capacity=3
druid.worker.baseTaskDirs=[\"var/druid/task\"]
# Task launch parameters
druid.indexer.runner.javaCommand=bin/run-java
druid.indexer.runner.javaOptsArray=["-server","-Xms1g","-Xmx1g","-XX:MaxDirectMemorySize=1g","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
# HTTP server threads

View File

@ -119,7 +119,7 @@ druid.selectors.coordinator.serviceName=druid/coordinator
# Monitoring
#
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor"]
druid.monitoring.monitors=["org.apache.druid.java.util.metrics.JvmMonitor", "org.apache.druid.server.metrics.ServiceStatusMonitor"]
druid.emitter=noop
druid.emitter.logging.logLevel=info

View File

@ -25,7 +25,6 @@ druid.worker.capacity=16
druid.worker.baseTaskDirs=[\"var/druid/task\"]
# Task launch parameters
druid.indexer.runner.javaCommand=bin/run-java
druid.indexer.runner.javaOptsArray=["-server","-Xms1g","-Xmx1g","-XX:MaxDirectMemorySize=1g","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
# HTTP server threads

View File

@ -58,18 +58,20 @@ services:
# To learn about configuring Kafka for access across networks see
# https://www.confluent.io/blog/kafka-client-cannot-connect-to-broker-on-aws-on-docker-etc/
- "9092:9092"
- '9094:9094'
depends_on:
- zookeeper
environment:
- KAFKA_BROKER_ID=1
- KAFKA_CFG_LISTENERS=PLAINTEXT://:9092
- KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
- KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093,EXTERNAL://:9094
- KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092,EXTERNAL://localhost:9094
- KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,EXTERNAL:PLAINTEXT,PLAINTEXT:PLAINTEXT
- KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
- ALLOW_PLAINTEXT_LISTENER=yes
- KAFKA_ENABLE_KRAFT=false
coordinator:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: coordinator
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -86,7 +88,7 @@ services:
- environment
broker:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: broker
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -103,7 +105,7 @@ services:
- environment
historical:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: historical
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -121,7 +123,7 @@ services:
- environment
middlemanager:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: middlemanager
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -140,7 +142,7 @@ services:
- environment
router:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: router
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -167,6 +169,8 @@ services:
JUPYTER_TOKEN: "docker"
DOCKER_STACKS_JUPYTER_CMD: "lab"
NOTEBOOK_ARGS: "--NotebookApp.token=''"
DRUID_HOST: "${DRUID_HOST:-router}"
KAFKA_HOST: "${KAFKA_HOST:-kafka}"
ports:
- "${JUPYTER_PORT:-8889}:8888"
volumes:

View File

@ -58,18 +58,20 @@ services:
# To learn about configuring Kafka for access across networks see
# https://www.confluent.io/blog/kafka-client-cannot-connect-to-broker-on-aws-on-docker-etc/
- "9092:9092"
- '9094:9094'
depends_on:
- zookeeper
environment:
- KAFKA_BROKER_ID=1
- KAFKA_CFG_LISTENERS=PLAINTEXT://:9092
- KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
- KAFKA_CFG_LISTENERS=PLAINTEXT://:9092,CONTROLLER://:9093,EXTERNAL://:9094
- KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092,EXTERNAL://localhost:9094
- KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CONTROLLER:PLAINTEXT,EXTERNAL:PLAINTEXT,PLAINTEXT:PLAINTEXT
- KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
- ALLOW_PLAINTEXT_LISTENER=yes
- KAFKA_ENABLE_KRAFT=false
coordinator:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: coordinator
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -86,7 +88,7 @@ services:
- environment
broker:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: broker
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -103,7 +105,7 @@ services:
- environment
historical:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: historical
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -121,7 +123,7 @@ services:
- environment
middlemanager:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: middlemanager
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -140,7 +142,7 @@ services:
- environment
router:
image: apache/druid:${DRUID_VERSION}
image: apache/druid:${DRUID_VERSION:-26.0.0}
container_name: router
profiles: ["druid-jupyter", "all-services"]
volumes:
@ -165,6 +167,8 @@ services:
JUPYTER_TOKEN: "docker"
DOCKER_STACKS_JUPYTER_CMD: "lab"
NOTEBOOK_ARGS: "--NotebookApp.token=''"
DRUID_HOST: "${DRUID_HOST:-router}"
KAFKA_HOST: "${KAFKA_HOST:-kafka}"
ports:
- "${JUPYTER_PORT:-8889}:8888"
volumes:

View File

@ -0,0 +1,129 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0cb3b009-ebde-4d56-9d59-a028d66d8309",
"metadata": {},
"source": [
"# Title\n",
"<!--\n",
" ~ Licensed to the Apache Software Foundation (ASF) under one\n",
" ~ or more contributor license agreements. See the NOTICE file\n",
" ~ distributed with this work for additional information\n",
" ~ regarding copyright ownership. The ASF licenses this file\n",
" ~ to you under the Apache License, Version 2.0 (the\n",
" ~ \"License\"); you may not use this file except in compliance\n",
" ~ with the License. You may obtain a copy of the License at\n",
" ~\n",
" ~ http://www.apache.org/licenses/LICENSE-2.0\n",
" ~\n",
" ~ Unless required by applicable law or agreed to in writing,\n",
" ~ software distributed under the License is distributed on an\n",
" ~ \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n",
" ~ KIND, either express or implied. See the License for the\n",
" ~ specific language governing permissions and limitations\n",
" ~ under the License.\n",
" -->\n",
"Introduction to Notebook\n",
"Lorem Ipsum"
]
},
{
"cell_type": "markdown",
"id": "bbdbf6ad-ca7b-40f5-8ca3-1070f4a3ee42",
"metadata": {},
"source": [
"## Prerequisites\n",
"\n",
"This tutorial works with Druid XX.0.0 or later.\n",
"\n",
"Launch this tutorial and all prerequisites using the `all-services` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Docker for Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-docker.html).\n"
]
},
{
"cell_type": "markdown",
"id": "7ee6aef8-a11d-48d5-bcdc-e6231ba594b7",
"metadata": {},
"source": [
"<details><summary> \n",
"<b>Run without Docker Compose</b> \n",
"</summary>\n",
"\n",
"In order to run this notebook you will need:\n",
"\n",
"<b>Required Services</b>\n",
"* <!-- include list of components needed for notebook, i.e. kafka, druid instance, etc. -->\n",
"\n",
"<b>Python packages</b>\n",
"* druidapi, a [Python client for Apache Druid](https://github.com/apache/druid/blob/master/examples/quickstart/jupyter-notebooks/druidapi/README.md)\n",
"* <!-- include any python package dependencies -->\n",
"</details>"
]
},
{
"cell_type": "markdown",
"id": "5007a243-b81a-4601-8f57-5b14940abbff",
"metadata": {},
"source": [
"### Initialization"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1ec783b-df3f-4168-9be2-cdc6ad3e33c2",
"metadata": {},
"outputs": [],
"source": [
"import druidapi\n",
"import os\n",
"\n",
"if 'DRUID_HOST' not in os.environ.keys():\n",
" druid_host=f\"http://localhost:8888\"\n",
"else:\n",
" druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
" \n",
"print(f\"Opening a connection to {druid_host}.\")\n",
"druid = druidapi.jupyter_client(druid_host)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c075de81-04c9-4b23-8253-20a15d46252e",
"metadata": {},
"outputs": [],
"source": [
"# INCLUDE THIS CELL IF YOUR NOTEBOOK USES KAFKA \n",
"# Use kafka_host variable when connecting to kafka \n",
"import os\n",
"\n",
"if 'KAFKA_HOST' not in os.environ.keys():\n",
" kafka_host=f\"http://localhost:9092\"\n",
"else:\n",
" kafka_host=f\"{os.environ['KAFKA_HOST']}:9092\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,416 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "e4a4ffd8-8aa5-4b6e-b60a-f4ef14049c46",
"metadata": {},
"source": [
"## Druid 26.0 release notebook"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "3a008975-3100-417b-8ddc-623857d5ad6a",
"metadata": {
"tags": []
},
"source": [
"<!--\n",
" ~ Licensed to the Apache Software Foundation (ASF) under one\n",
" ~ or more contributor license agreements. See the NOTICE file\n",
" ~ distributed with this work for additional information\n",
" ~ regarding copyright ownership. The ASF licenses this file\n",
" ~ to you under the Apache License, Version 2.0 (the\n",
" ~ \"License\"); you may not use this file except in compliance\n",
" ~ with the License. You may obtain a copy of the License at\n",
" ~\n",
" ~ http://www.apache.org/licenses/LICENSE-2.0\n",
" ~\n",
" ~ Unless required by applicable law or agreed to in writing,\n",
" ~ software distributed under the License is distributed on an\n",
" ~ \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n",
" ~ KIND, either express or implied. See the License for the\n",
" ~ specific language governing permissions and limitations\n",
" ~ under the License.\n",
" -->\n",
" \n",
"This notebook highlights some of the new features released in Druid 26.0.\n",
"\n",
"Before you begin, ensure you have the following:\n",
"* The `pandas` Python package\n",
"* The `requests` Python package\n",
"* A running Druid instance.\n",
"* Jupyter Lab or Jupyter Notebook running on a non-default port. By default, Druid and Jupyter both try to use port 8888, so start Jupyter on a different port. For more information on using Jupyter notebooks with Druid, see [Jupyter Notebook tutorials](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-index.html).\n",
"\n",
"## Features\n",
"* [Schema auto-discovery](#Schema-auto-discovery)\n",
"* [Shuffle join](#Shuffle-join)\n",
"* [UNNEST and arrays](#UNNEST-and-arrays)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "f02a76ed-8600-4afa-a37e-c3519005e2ab",
"metadata": {},
"source": [
"## Verify Druid version"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18cc6a82-0167-423c-b14d-01c36ac2733d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import requests\n",
"\n",
"druid_host = \"http://localhost:8888\"\n",
"session = requests.Session()\n",
"endpoint = druid_host + '/status'\n",
"response = session.get(endpoint)\n",
"json = response.json()\n",
"print(\"Running on Druid version: \"+ json[\"version\"])"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "c39b6caf-e08a-41c0-9021-12ee270023c1",
"metadata": {
"tags": []
},
"source": [
"## Schema auto-discovery\n",
"\n",
"### What would happen in the past if we just load this data?\n",
"\n",
"Previously, Druid already supports [string-based schema auto-discovery](https://druid.apache.org/docs/latest/ingestion/schema-design.html#string-based-schema-discovery), but it has some limitations. Specifically, all the newly discovered columns will be stored as string types. This means aggregation queries on numerical columns can be slow (since they need to be parsed as numbers first), and some fields such as multi-value dimensions with null values can misbehave.\n",
"\n",
"With the introduction of [type-aware schema auto-discovery](https://druid.apache.org/docs/latest/ingestion/schema-design.html#type-aware-schema-discovery), Druid now properly infers data types. Set this in an ingestion job by including `\"useSchemaDiscovery\": True` in the `dimensionsSpec` object. In the example below, you perform a batch ingestion job and instruct Druid to automatically infer the input data types as long, float, string, etc. Run the following cell, then go to the [web console](http://localhost:8888/unified-console.html#ingestion) to check the progress of your ingestion task."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee16e5bc-7e7a-4da5-9816-99d161100522",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"from IPython.display import JSON\n",
"ingestion_spec = {\n",
" \"type\": \"index_parallel\",\n",
" \"spec\": {\n",
" \"ioConfig\": {\n",
" \"type\": \"index_parallel\",\n",
" \"inputSource\": {\n",
" \"type\": \"http\",\n",
" \"uris\": [\"https://druid.apache.org/data/wikipedia.json.gz\"],\n",
" \"filter\": \"*\"\n",
" },\n",
" \"inputFormat\": {\n",
" \"type\": \"json\"\n",
" }\n",
" },\n",
" \"tuningConfig\": {\n",
" \"type\": \"index_parallel\",\n",
" \"partitionsSpec\": {\n",
" \"type\": \"dynamic\"\n",
" },\n",
" \"indexSpec\": {\n",
" \"stringDictionaryEncoding\": {\n",
" \"type\": \"frontCoded\",\n",
" \"bucketSize\": 16\n",
" }\n",
" }\n",
" },\n",
" \"dataSchema\": {\n",
" \"dataSource\": \"wikipedia\",\n",
" \"timestampSpec\": {\n",
" \"missingValue\": \"2010-01-01T00:00:00Z\"\n",
" },\n",
" \"dimensionsSpec\": {\n",
" \"dimensions\": [],\n",
" \"dimensionExclusions\": [],\n",
" \"spatialDimensions\": [],\n",
" \"useSchemaDiscovery\": True\n",
" },\n",
" \"granularitySpec\": {\n",
" \"queryGranularity\": \"none\",\n",
" \"rollup\": False\n",
" }\n",
" }\n",
" }\n",
"}\n",
"\n",
"JSON(ingestion_spec,expanded=True)\n",
"\n",
"endpoint = druid_host + '/druid/indexer/v1/task/'\n",
"response = session.post(endpoint,json = ingestion_spec)\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2617af1b",
"metadata": {},
"source": [
"Note that because we've set `\"useSchemaDiscovery\": True` in the ingestion spec, even though we didn't specify any data types for the columns, they are correctly inferred. The following cell queries the information schema metadata table and displays the data types of the columns in the `wikipedia` table you just ingested."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d3bc513-8215-4299-9bf4-135ec65cae98",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"endpoint = druid_host + '/druid/v2/sql'\n",
"sql = '''\n",
"SELECT *\n",
"FROM \"INFORMATION_SCHEMA\".\"COLUMNS\"\n",
"WHERE \"TABLE_NAME\" = 'wikipedia'\n",
"'''\n",
"sql_request = {'query': sql}\n",
"json_data = session.post(endpoint, json=sql_request).json()\n",
"result_df = pd.json_normalize(json_data)\n",
"result_df.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "483c67d7",
"metadata": {},
"source": [
"As you can see, in the `DATA_TYPE` column, different data types are correctly detected. With string-based schema auto-discovery, Druid would have stored the data as `string` types."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "08a3b808-e138-47c7-b7f1-e3a6c9f3bad3",
"metadata": {},
"source": [
"## Shuffle join\n",
"\n",
"### Make it really easy to denormalize data as part of ingestion\n",
"Before the support of shuffle join, you'll need to use another tool to prepare the data then ingest into Druid. With shuffle join support, you can do the same transformation with one query.\n",
"For example, in the query below, the user does a self-join on the wikipedia dataset. You can easily do the same query with a typical star-schema dataset. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0dc81a51-0160-4cd6-bd97-6abf60a6e7d6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"query = '''\n",
"REPLACE INTO \"wikipedia\" OVERWRITE ALL\n",
"WITH \"wikipedia_main\" AS (SELECT *\n",
"FROM TABLE(\n",
" EXTERN(\n",
" '{\"type\":\"http\",\"uris\":[\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n",
" '{\"type\":\"json\"}'\n",
" )\n",
") EXTEND (\"channel\" VARCHAR, \"timestamp\" VARCHAR,\"user\" VARCHAR))\n",
",\n",
"\"wikipedia_dim\" AS (SELECT *\n",
"FROM TABLE(\n",
" EXTERN(\n",
" '{\"type\":\"http\",\"uris\":[\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n",
" '{\"type\":\"json\"}'\n",
" )\n",
") EXTEND (\"timestamp\" VARCHAR,\"user\" VARCHAR,\"comment\" VARCHAR, \"commentLength\" BIGINT, \"cityName\" VARCHAR, \"countryName\" VARCHAR))\n",
"\n",
"\n",
"SELECT\n",
" TIME_PARSE(\"wikipedia_main\".\"timestamp\") AS \"__time\",\n",
" \"wikipedia_main\".*,\n",
" \"wikipedia_dim\".*\n",
"FROM \"wikipedia_main\"\n",
"LEFT JOIN \"wikipedia_dim\" \n",
"ON \n",
"\"wikipedia_main\".\"user\" = \"wikipedia_dim\".\"user\"\n",
"AND \n",
"\"wikipedia_main\".\"timestamp\" = \"wikipedia_dim\".\"timestamp\"\n",
"\n",
"PARTITIONED BY MONTH\n",
"'''"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "e10df053-2729-4e2c-ac4a-3c8d0c070dc0",
"metadata": {},
"source": [
"### Let's watch the ingestion task run...\n",
"Submit the preceding query and monitor the ingestion job by running the following cells. This may take a while. You can check the status of the ingestion task in the [web console](http://localhost:8888/unified-console.html#ingestion)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d302e43-9f14-4d19-b286-7a3cbc448470",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# This block submits the ingestion query\n",
"sql_request={'query': query}\n",
"endpoint = druid_host + '/druid/v2/sql/task'\n",
"response = session.post(endpoint, json=sql_request)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eadf05f7-bc0a-4a29-981d-d8bc5fd72314",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# This block monitors the ingestion query (Takes about 25-35 seconds)\n",
"ingestion_taskId = response.json()['taskId']\n",
"endpoint = druid_host + f\"/druid/indexer/v1/task/{ingestion_taskId}/status\"\n",
"import time\n",
"\n",
"json = session.get(endpoint).json()\n",
"ingestion_status = json['status']['status']\n",
" \n",
"print(\"The ingestion is running...\")\n",
"\n",
"while ingestion_status == \"RUNNING\":\n",
" time.sleep(1)\n",
" json = session.get(endpoint).json()\n",
" ingestion_status = json['status']['status']\n",
" print('.', end='')\n",
"\n",
"if ingestion_status == \"SUCCESS\": \n",
" print(\"\\nThe ingestion is complete\")\n",
"else:\n",
" print(\"\\nThe ingestion task failed:\", json)\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "10417469-b2f7-4a56-bd4f-fddc0277c3c9",
"metadata": {},
"source": [
"### Note I didn't use any other tools, this is all done within Druid. No need for using Spark/Presto for data prep"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "7b134ef2-e3ef-4345-94c8-64cf36f6adfe",
"metadata": {
"tags": []
},
"source": [
"## UNNEST and arrays\n",
"\n",
"UNNEST is useful to deal with Array data and allows you to \"explode\" an array into individual rows.\n",
"\n",
"In this example, we are looking at an array of tags, which includes `almond`, `blue_berry` and `muffin`. We can use UNNEST to explode the array into individual rows, and then perform a GROUP BY on the tags."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "434602dd-d62b-476f-b18f-4a3fa23ff70e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"endpoint = druid_host + '/druid/v2/sql'\n",
"sql = '''\n",
"SELECT 'post_id_123' AS \"POST_ID\", ARRAY['almond','blue_berry','muffin'] as \"Tags\"\n",
"'''\n",
"sql_request = {'query': sql}\n",
"json_data = session.post(endpoint, json=sql_request).json()\n",
"result_df = pd.json_normalize(json_data)\n",
"result_df.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "c5d8e9a1-194a-4fc8-9759-863672271565",
"metadata": {},
"source": [
"For more examples and details on UNNEST, see [Unnest arrays within a column](https://druid.apache.org/docs/latest/tutorials/tutorial-unnest-arrays.html)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b7d80ad-e7a0-4e4b-a926-177112dc9c93",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"endpoint = druid_host + '/druid/v2/sql'\n",
"sql = '''SELECT 'post_id_123' as \"POST_ID\", * FROM UNNEST(ARRAY['almond','blue_berry','muffin']) \n",
"'''\n",
"sql_request = {'query': sql, 'context':{'enableUnnest': 'true'}}\n",
"json_data = session.post(endpoint, json=sql_request).json()\n",
"JSON(json_data)\n",
"result_df = pd.json_normalize(json_data)\n",
"result_df.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9fdf81d2",
"metadata": {},
"source": [
"Well, you've made it this far, try out some of the new features and let us know what you think!"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,27 @@
# Jupyter Notebook tutorials for Druid
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->
This directory contains notebook-based release notes that contain examples on how to use new features.
Notebooks in this directory are meant to be run against quickstart clusters, but you can adapt them to run against live production clusters.
For information on prerequisites and getting started with the Jupyter-based tutorials,
see [Jupyter Notebook tutorials](../../../docs/tutorials/tutorial-jupyter-index.md).

View File

@ -28,7 +28,7 @@
<parent>
<groupId>org.apache.druid</groupId>
<artifactId>druid</artifactId>
<version>27.0.0-SNAPSHOT</version>
<version>28.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@ -24,7 +24,7 @@
<parent>
<groupId>org.apache.druid</groupId>
<artifactId>druid</artifactId>
<version>27.0.0-SNAPSHOT</version>
<version>28.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

Some files were not shown because too many files have changed in this diff.